9 Commits

Author SHA1 Message Date
gagb
0c25a086e7 Merge branch 'main' into gagb/add-github-issue-conversion 2024-12-14 18:34:18 -08:00
gagb
8a30fca732 Add support for GH prs as well 2024-12-13 14:57:39 -08:00
gagb
0b6554738c Move github handling from convert to convert_url 2024-12-13 14:16:56 -08:00
gagb
f1274dca87 Run pre-commit 2024-12-13 13:58:24 -08:00
gagb
778fca3f70 Fix code scanning alert no. 1: Incomplete URL substring sanitization
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2024-12-13 13:57:03 -08:00
gagb
7979eecfef Shift to DocumentConverter class 2024-12-13 13:52:37 -08:00
gagb
8f16f32d53 Add tests 2024-12-12 23:10:23 +00:00
gagb
28af7ad341 Run pre-commit 2024-12-12 22:39:03 +00:00
gagb
9d047103d5 Add method to convert GitHub issue to markdown
Add support for converting GitHub issues to markdown.

* Add `convert_github_issue` method in `src/markitdown/_markitdown.py` to handle GitHub issue conversion.
* Use `PyGithub` to fetch issue details using the provided token.
* Convert the issue details to markdown format and return as `DocumentConverterResult`.
* Add optional GitHub issue support with `IS_GITHUB_ISSUE_CAPABLE` flag.
2024-12-12 13:41:31 -08:00
28 changed files with 281 additions and 1347 deletions

View File

@@ -1,32 +0,0 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
{
"name": "Existing Dockerfile",
"build": {
// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..",
// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerfile": "../Dockerfile",
"args": {
"INSTALL_GIT": "true"
}
},
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
"features": {
"ghcr.io/devcontainers-extra/features/hatch:2": {}
},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],
// Uncomment the next line to run commands after the container is created.
// "postCreateCommand": "cat /etc/os-release",
// Configure tool-specific properties.
// "customizations": {},
// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
"remoteUser": "root"
}

View File

@@ -1 +0,0 @@
*

1
.gitattributes vendored
View File

@@ -1 +0,0 @@
tests/test_files/** linguist-vendored

View File

@@ -1,6 +0,0 @@
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"

View File

@@ -5,9 +5,9 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v5
uses: actions/setup-python@v2
with:
python-version: "3.x"

View File

@@ -5,8 +5,8 @@ jobs:
tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: |
3.10
@@ -14,7 +14,7 @@ jobs:
3.12
- name: Set up pip cache
if: runner.os == 'Linux'
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}

4
.gitignore vendored
View File

@@ -1,5 +1,3 @@
.vscode
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -162,5 +160,3 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
src/.DS_Store
.DS_Store

View File

@@ -1,23 +0,0 @@
FROM python:3.13-slim-bullseye
USER root
ARG INSTALL_GIT=false
RUN if [ "$INSTALL_GIT" = "true" ]; then \
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
fi
# Runtime dependency
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*
RUN pip install markitdown
# Default USERID and GROUPID
ARG USERID=10000
ARG GROUPID=10000
USER $USERID:$GROUPID
ENTRYPOINT [ "markitdown" ]

126
README.md
View File

@@ -1,93 +1,28 @@
# MarkItDown
[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
[![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.)
It presently supports:
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc.).
It supports:
- PDF
- PowerPoint
- Word
- Excel
- Images (EXIF metadata and OCR)
- Audio (EXIF metadata and speech transcription)
- HTML
- Text-based formats (CSV, JSON, XML)
- ZIP files (iterates over contents)
- PDF (.pdf)
- PowerPoint (.pptx)
- Word (.docx)
- Excel (.xlsx)
- Images (EXIF metadata, and OCR)
- Audio (EXIF metadata, and speech transcription)
- HTML (special handling of Wikipedia, etc.)
- Various other text-based formats (csv, json, xml, etc.)
To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`
## Usage
### Command-Line
```bash
markitdown path-to-file.pdf > document.md
```
Or use `-o` to specify the output file:
```bash
markitdown path-to-file.pdf -o document.md
```
To use Document Intelligence conversion:
```bash
markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
```
You can also pipe content:
```bash
cat path-to-file.pdf | markitdown
```
More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0)
### Python API
Basic usage in Python:
The API is simple:
```python
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert("test.xlsx")
markitdown = MarkItDown()
result = markitdown.convert("test.xlsx")
print(result.text_content)
```
Document Intelligence conversion in Python:
```python
from markitdown import MarkItDown
md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
result = md.convert("test.pdf")
print(result.text_content)
```
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
```python
from markitdown import MarkItDown
from openai import OpenAI
client = OpenAI()
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = md.convert("example.jpg")
print(result.text_content)
```
### Docker
```sh
docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
```
## Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
@@ -102,41 +37,10 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
### How to Contribute
You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are, of course, just suggestions, and you are welcome to contribute in any way you like.
<div align="center">
| | All | Especially Needs Help from Community |
|-----------------------|------------------------------------------|------------------------------------------------------------------------------------------|
| **Issues** | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
| **PRs** | [All PRs](https://github.com/microsoft/markitdown/pulls) | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22) |
</div>
### Running Tests and Checks
- Install `hatch` in your environment and run tests:
```sh
pip install hatch # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
hatch shell
hatch test
```
(Alternative) Use the Devcontainer which has all the dependencies installed:
```sh
# Reopen the project in Devcontainer and run:
hatch test
```
- Run pre-commit checks before submitting a PR: `pre-commit run --all-files`
## Trademarks
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
trademarks or logos is subject to and must follow
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
trademarks or logos is subject to and must follow
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
Any use of third-party trademarks or logos is subject to those third parties' policies.

View File

@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
[project]
name = "markitdown"
dynamic = ["version"]
description = 'Utility tool for converting various files to Markdown'
description = ''
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
@@ -32,18 +32,13 @@ dependencies = [
"python-pptx",
"pandas",
"openpyxl",
"xlrd",
"pdfminer.six",
"puremagic",
"pydub",
"olefile",
"youtube-transcript-api",
"SpeechRecognition",
"pathvalidate",
"charset-normalizer",
"openai",
"azure-ai-documentintelligence",
"azure-identity"
"pygithub"
]
[project.urls]
@@ -82,6 +77,3 @@ exclude_lines = [
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]
[tool.hatch.build.targets.sdist]
only-include = ["src/markitdown"]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.0.1"
__version__ = "0.0.1a1"

View File

@@ -1,110 +1,41 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
import argparse
import sys
import shutil
from textwrap import dedent
from .__about__ import __version__
from ._markitdown import MarkItDown, DocumentConverterResult
from ._markitdown import MarkItDown
def main():
parser = argparse.ArgumentParser(
description="Convert various file formats to markdown.",
prog="markitdown",
formatter_class=argparse.RawDescriptionHelpFormatter,
usage=dedent(
"""
SYNTAX:
markitdown <OPTIONAL: FILENAME>
If FILENAME is empty, markitdown reads from stdin.
EXAMPLE:
markitdown example.pdf
OR
cat example.pdf | markitdown
OR
markitdown < example.pdf
OR to save to a file use
markitdown example.pdf -o example.md
OR
markitdown example.pdf > example.md
"""
).strip(),
)
parser.add_argument(
"-v",
"--version",
action="version",
version=f"%(prog)s {__version__}",
help="show the version number and exit",
)
parser.add_argument(
"-o",
"--output",
help="Output file name. If not provided, output is written to stdout.",
)
parser.add_argument(
"-d",
"--use-docintel",
action="store_true",
help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
)
parser.add_argument(
"-e",
"--endpoint",
type=str,
help="Document Intelligence Endpoint. Required if using Document Intelligence.",
)
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
which_exiftool = shutil.which("exiftool")
if args.use_docintel:
if args.endpoint is None:
raise ValueError(
"Document Intelligence Endpoint is required when using Document Intelligence."
)
elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(
exiftool_path=which_exiftool, docintel_endpoint=args.endpoint
)
else:
markitdown = MarkItDown(exiftool_path=which_exiftool)
if args.filename is None:
if len(sys.argv) == 1:
markitdown = MarkItDown()
result = markitdown.convert_stream(sys.stdin.buffer)
else:
result = markitdown.convert(args.filename)
_handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult):
"""Handle output to stdout or file"""
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(result.text_content)
else:
print(result.text_content)
elif len(sys.argv) == 2:
markitdown = MarkItDown()
result = markitdown.convert(sys.argv[1])
print(result.text_content)
else:
sys.stderr.write(
"""
SYNTAX:
markitdown <OPTIONAL: FILENAME>
If FILENAME is empty, markitdown reads from stdin.
EXAMPLE:
markitdown example.pdf
OR
cat example.pdf | markitdown
OR
markitdown < example.pdf
""".strip()
+ "\n"
)
if __name__ == "__main__":

File diff suppressed because it is too large Load Diff

0
tests/test_files/test.docx Normal file → Executable file
View File

0
tests/test_files/test.jpg Normal file → Executable file
View File

Before

Width:  |  Height:  |  Size: 463 KiB

After

Width:  |  Height:  |  Size: 463 KiB

View File

@@ -1,10 +0,0 @@
{
"key1": "string_value",
"key2": 1234,
"key3": [
"list_value1",
"list_value2"
],
"5b64c88c-b3c3-4510-bcb8-da0b200602d8": "uuid_key",
"uuid_value": "9700dc99-6685-40b4-9a3a-5e406dcb37f3"
}

BIN
tests/test_files/test.pptx Normal file → Executable file

Binary file not shown.

Binary file not shown.

0
tests/test_files/test.xlsx Normal file → Executable file
View File

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 145 KiB

View File

@@ -1,4 +0,0 @@
<EFBFBD><EFBFBD><EFBFBD>O,<EFBFBD>N<EFBFBD><EFBFBD>,<EFBFBD>Z<EFBFBD><EFBFBD>
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Y,30,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
<EFBFBD>O<EFBFBD>؉p<EFBFBD>q,25,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>~,35,<EFBFBD><EFBFBD><EFBFBD>É<EFBFBD>
1 名前 年齢 住所
2 佐藤太郎 30 東京
3 三木英子 25 大阪
4 髙橋淳 35 名古屋

View File

@@ -1,89 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0f61db80",
"metadata": {},
"source": [
"# Test Notebook"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3f2a5bbd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"markitdown\n"
]
}
],
"source": [
"print('markitdown')"
]
},
{
"cell_type": "markdown",
"id": "9b9c0468",
"metadata": {},
"source": [
"## Code Cell Below"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "37d8088a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"42\n"
]
}
],
"source": [
"# comment in code\n",
"print(42)"
]
},
{
"cell_type": "markdown",
"id": "2e3177bd",
"metadata": {},
"source": [
"End\n",
"\n",
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
},
"title": "Test Notebook Title"
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

View File

@@ -6,23 +6,11 @@ import shutil
import pytest
import requests
from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown
skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False
) # Don't run these tests in CI
# Don't run the llm tests without a key and the client library
skip_llm = False if os.environ.get("OPENAI_API_KEY") else True
try:
import openai
except ModuleNotFoundError:
skip_llm = True
# Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@@ -54,12 +42,6 @@ XLSX_TEST_STRINGS = [
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
]
XLS_TEST_STRINGS = [
"## 09060124-b5e7-4717-9d07-3c046eb",
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
]
DOCX_TEST_STRINGS = [
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
@@ -69,34 +51,12 @@ DOCX_TEST_STRINGS = [
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
]
MSG_TEST_STRINGS = [
"# Email Message",
"**From:** test.sender@example.com",
"**To:** test.recipient@example.com",
"**Subject:** Test Email Message",
"## Content",
"This is the body of the test email message",
]
DOCX_COMMENT_TEST_STRINGS = [
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"This is a test comment. 12df-321a",
"Yet another comment in the doc. 55yiyi-asd09",
]
PPTX_TEST_STRINGS = [
"2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
"44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
"1b92870d-e3b5-4e65-8153-919f4ff45592",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
]
BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
@@ -105,13 +65,6 @@ BLOG_TEST_STRINGS = [
"an example where high cost can easily prevent a generic complex",
]
RSS_TEST_STRINGS = [
"The Official Microsoft Blog",
"In the case of AI, it is absolutely true that the industry is moving incredibly fast",
]
WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TEST_STRINGS = [
"Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
@@ -134,32 +87,9 @@ SERP_TEST_EXCLUDES = [
"data:image/svg+xml,%3Csvg%20width%3D",
]
CSV_CP932_TEST_STRINGS = [
"名前,年齢,住所",
"佐藤太郎,30,東京",
"三木英子,25,大阪",
"髙橋淳,35,名古屋",
]
LLM_TEST_STRINGS = [
"5bda1dd6",
]
JSON_TEST_STRINGS = [
"5b64c88c-b3c3-4510-bcb8-da0b200602d8",
"9700dc99-6685-40b4-9a3a-5e406dcb37f3",
]
# --- Helper Functions ---
def validate_strings(result, expected_strings, exclude_strings=None):
"""Validate presence or absence of specific strings."""
text_content = result.text_content.replace("\\", "")
for string in expected_strings:
assert string in text_content
if exclude_strings:
for string in exclude_strings:
assert string not in text_content
GITHUB_ISSUE_URL = "https://github.com/microsoft/autogen/issues/1421"
GITHUB_PR_URL = "https://github.com/microsoft/autogen/pull/194"
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
@pytest.mark.skipif(
@@ -194,175 +124,93 @@ def test_markitdown_local() -> None:
# Test XLSX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
validate_strings(result, XLSX_TEST_STRINGS)
# Test XLS processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls"))
for test_string in XLS_TEST_STRINGS:
for test_string in XLSX_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test DOCX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
validate_strings(result, DOCX_TEST_STRINGS)
# Test DOCX processing, with comments
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
style_map="comment-reference => ",
)
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
# Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
result = markitdown_with_style_map.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
)
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
for test_string in DOCX_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test PPTX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
validate_strings(result, PPTX_TEST_STRINGS)
for test_string in PPTX_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test HTML processing
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
)
validate_strings(result, BLOG_TEST_STRINGS)
# Test ZIP file processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
validate_strings(result, XLSX_TEST_STRINGS)
for test_string in BLOG_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test Wikipedia processing
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
)
text_content = result.text_content.replace("\\", "")
validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
for test_string in WIKIPEDIA_TEST_EXCLUDES:
assert test_string not in text_content
for test_string in WIKIPEDIA_TEST_STRINGS:
assert test_string in text_content
# Test Bing processing
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
)
text_content = result.text_content.replace("\\", "")
validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
# Test RSS processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
text_content = result.text_content.replace("\\", "")
for test_string in RSS_TEST_STRINGS:
for test_string in SERP_TEST_EXCLUDES:
assert test_string not in text_content
for test_string in SERP_TEST_STRINGS:
assert test_string in text_content
## Test non-UTF-8 encoding
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
validate_strings(result, CSV_CP932_TEST_STRINGS)
# Test MSG (Outlook email) processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
validate_strings(result, MSG_TEST_STRINGS)
# Test JSON processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
validate_strings(result, JSON_TEST_STRINGS)
# Test input with leading blank characters
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
result = markitdown.convert_stream(io.BytesIO(input_data))
assert "# Test" in result.text_content
@pytest.mark.skipif(
skip_exiftool,
reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()
# Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool)
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
assert target in result.text_content
# Test setting the exiftool path through an environment variable
os.environ["EXIFTOOL_PATH"] = which_exiftool
markitdown = MarkItDown()
# Test JPG metadata processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
assert target in result.text_content
def test_markitdown_deprecation() -> None:
try:
with catch_warnings(record=True) as w:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client)
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_client == test_client
finally:
resetwarnings()
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown(mlm_model="gpt-4o")
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_model == "gpt-4o"
finally:
resetwarnings()
try:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
assert False
except ValueError:
pass
try:
markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
assert False
except ValueError:
pass
@pytest.mark.skipif(
skip_llm,
reason="do not run llm tests without a key",
not GITHUB_TOKEN,
reason="GitHub token not provided",
)
def test_markitdown_llm() -> None:
client = openai.OpenAI()
markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")
def test_markitdown_github_issue() -> None:
markitdown = MarkItDown()
result = markitdown.convert(GITHUB_ISSUE_URL, github_token=GITHUB_TOKEN)
print(result.text_content)
assert "User-Defined Functions" in result.text_content
assert "closed" in result.text_content
assert "Comments:" in result.text_content
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
for test_string in LLM_TEST_STRINGS:
assert test_string in result.text_content
# This is not super precise. It would also accept "red square", "blue circle",
# "the square is not blue", etc. But it's sufficient for this test.
for test_string in ["red", "circle", "blue", "square"]:
assert test_string in result.text_content.lower()
@pytest.mark.skipif(
not GITHUB_TOKEN,
reason="GitHub token not provided",
)
def test_markitdown_github_pr() -> None:
markitdown = MarkItDown()
result = markitdown.convert(GITHUB_PR_URL, github_token=GITHUB_TOKEN)
print(result.text_content)
assert "faq" in result.text_content
if __name__ == "__main__":
"""Runs this file's tests from the command line."""
# test_markitdown_remote()
# test_markitdown_local()
test_markitdown_remote()
test_markitdown_local()
test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm()
test_markitdown_github_issue()
test_markitdown_github_pr()