Compare commits
8 Commits
zip_format
...
v0.0.X
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e58bc486ee | ||
|
|
81ef601c09 | ||
|
|
518b12c1fb | ||
|
|
8eaf5a1da9 | ||
|
|
38c924793c | ||
|
|
b9526d5e47 | ||
|
|
519fe172aa | ||
|
|
abe9752438 |
36
README.md
36
README.md
@@ -87,42 +87,6 @@ print(result.text_content)
|
|||||||
docker build -t markitdown:latest .
|
docker build -t markitdown:latest .
|
||||||
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
|
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
|
||||||
```
|
```
|
||||||
<details>
|
|
||||||
|
|
||||||
<summary>Batch Processing Multiple Files</summary>
|
|
||||||
|
|
||||||
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
|
|
||||||
|
|
||||||
|
|
||||||
```python convert.py
|
|
||||||
from markitdown import MarkItDown
|
|
||||||
from openai import OpenAI
|
|
||||||
import os
|
|
||||||
client = OpenAI(api_key="your-api-key-here")
|
|
||||||
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
|
|
||||||
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
|
|
||||||
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
|
|
||||||
for file in files_to_convert:
|
|
||||||
print(f"\nConverting {file}...")
|
|
||||||
try:
|
|
||||||
md_file = os.path.splitext(file)[0] + '.md'
|
|
||||||
result = md.convert(file)
|
|
||||||
with open(md_file, 'w') as f:
|
|
||||||
f.write(result.text_content)
|
|
||||||
|
|
||||||
print(f"Successfully converted {file} to {md_file}")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error converting {file}: {str(e)}")
|
|
||||||
|
|
||||||
print("\nAll conversions completed!")
|
|
||||||
```
|
|
||||||
2. Place the script in the same directory as your files
|
|
||||||
3. Install the required packages, such as `openai`
|
|
||||||
4. Run the script: `python convert.py`
|
|
||||||
|
|
||||||
Note that the original files remain unchanged; a new Markdown file with the same base name is created for each one.
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
__version__ = "0.0.1a3"
|
__version__ = "0.0.2"
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from xml.dom import minidom
|
|||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||||
from warnings import warn, resetwarnings, catch_warnings
|
from warnings import warn, filterwarnings
|
||||||
|
|
||||||
import mammoth
|
import mammoth
|
||||||
import markdownify
|
import markdownify
|
||||||
@@ -51,21 +51,14 @@ mimetypes.add_type("text/csv", ".csv")
|
|||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
|
||||||
|
filterwarnings("ignore", message=r".*Couldn\'t find ffmpeg or avconv.*", module="pydub")
|
||||||
try:
|
try:
|
||||||
# Using warnings' catch_warnings to catch
|
import pydub
|
||||||
# pydub's warning of ffmpeg or avconv missing
|
|
||||||
with catch_warnings(record=True) as w:
|
|
||||||
import pydub
|
|
||||||
|
|
||||||
if w:
|
|
||||||
raise ModuleNotFoundError
|
|
||||||
import speech_recognition as sr
|
import speech_recognition as sr
|
||||||
|
|
||||||
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
|
||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
pass
|
pass
|
||||||
finally:
|
|
||||||
resetwarnings()
|
|
||||||
|
|
||||||
# Optional YouTube transcription support
|
# Optional YouTube transcription support
|
||||||
try:
|
try:
|
||||||
@@ -91,7 +84,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||||||
# Explicitly cast options to the expected type if necessary
|
# Explicitly cast options to the expected type if necessary
|
||||||
super().__init__(**options)
|
super().__init__(**options)
|
||||||
|
|
||||||
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
|
def convert_hn(
|
||||||
|
self,
|
||||||
|
n: int,
|
||||||
|
el: Any,
|
||||||
|
text: str,
|
||||||
|
convert_as_inline: Optional[bool] = False,
|
||||||
|
**kwargs,
|
||||||
|
) -> str:
|
||||||
"""Same as usual, but be sure to start with a new line"""
|
"""Same as usual, but be sure to start with a new line"""
|
||||||
if not convert_as_inline:
|
if not convert_as_inline:
|
||||||
if not re.search(r"^\n", text):
|
if not re.search(r"^\n", text):
|
||||||
@@ -99,7 +99,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||||||
|
|
||||||
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
|
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
|
||||||
|
|
||||||
def convert_a(self, el: Any, text: str, convert_as_inline: bool):
|
def convert_a(
|
||||||
|
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
|
||||||
|
):
|
||||||
"""Same as usual converter, but removes Javascript links and escapes URIs."""
|
"""Same as usual converter, but removes Javascript links and escapes URIs."""
|
||||||
prefix, suffix, text = markdownify.chomp(text) # type: ignore
|
prefix, suffix, text = markdownify.chomp(text) # type: ignore
|
||||||
if not text:
|
if not text:
|
||||||
@@ -135,7 +137,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||||||
else text
|
else text
|
||||||
)
|
)
|
||||||
|
|
||||||
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
|
def convert_img(
|
||||||
|
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
|
||||||
|
) -> str:
|
||||||
"""Same as usual converter, but removes data URIs"""
|
"""Same as usual converter, but removes data URIs"""
|
||||||
|
|
||||||
alt = el.attrs.get("alt", None) or ""
|
alt = el.attrs.get("alt", None) or ""
|
||||||
@@ -963,18 +967,6 @@ class MediaConverter(DocumentConverter):
|
|||||||
|
|
||||||
def _get_metadata(self, local_path, exiftool_path=None):
|
def _get_metadata(self, local_path, exiftool_path=None):
|
||||||
if not exiftool_path:
|
if not exiftool_path:
|
||||||
which_exiftool = shutil.which("exiftool")
|
|
||||||
if which_exiftool:
|
|
||||||
warn(
|
|
||||||
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
|
|
||||||
|
|
||||||
md = MarkItDown(exiftool_path="{which_exiftool}")
|
|
||||||
|
|
||||||
This warning will be removed in future releases.
|
|
||||||
""",
|
|
||||||
DeprecationWarning,
|
|
||||||
)
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
@@ -1077,6 +1069,14 @@ class Mp3Converter(WavConverter):
|
|||||||
handle, temp_path = tempfile.mkstemp(suffix=".wav")
|
handle, temp_path = tempfile.mkstemp(suffix=".wav")
|
||||||
os.close(handle)
|
os.close(handle)
|
||||||
try:
|
try:
|
||||||
|
# Check if pydub defaulted to ffmpeg
|
||||||
|
if pydub.AudioSegment.converter == "ffmpeg" and not shutil.which(
|
||||||
|
"ffmpeg"
|
||||||
|
):
|
||||||
|
warn(
|
||||||
|
"pydub: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work",
|
||||||
|
RuntimeWarning,
|
||||||
|
)
|
||||||
sound = pydub.AudioSegment.from_mp3(local_path)
|
sound = pydub.AudioSegment.from_mp3(local_path)
|
||||||
sound.export(temp_path, format="wav")
|
sound.export(temp_path, format="wav")
|
||||||
|
|
||||||
@@ -1487,34 +1487,26 @@ class MarkItDown:
|
|||||||
if exiftool_path is None:
|
if exiftool_path is None:
|
||||||
exiftool_path = os.environ.get("EXIFTOOL_PATH")
|
exiftool_path = os.environ.get("EXIFTOOL_PATH")
|
||||||
|
|
||||||
# Handle deprecation notices
|
# Still none? Check well-known paths
|
||||||
#############################
|
if exiftool_path is None:
|
||||||
if mlm_client is not None:
|
candidate = shutil.which("exiftool")
|
||||||
if llm_client is None:
|
if candidate:
|
||||||
warn(
|
candidate = os.path.abspath(candidate)
|
||||||
"'mlm_client' is deprecated, and was renamed 'llm_client'.",
|
if any(
|
||||||
DeprecationWarning,
|
d == os.path.dirname(candidate)
|
||||||
)
|
for d in [
|
||||||
llm_client = mlm_client
|
"/usr/bin",
|
||||||
mlm_client = None
|
"/usr/local/bin",
|
||||||
else:
|
"/opt",
|
||||||
raise ValueError(
|
"/opt/bin",
|
||||||
"'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
|
"/opt/local/bin",
|
||||||
)
|
"/opt/homebrew/bin",
|
||||||
|
"C:\\Windows\\System32",
|
||||||
if mlm_model is not None:
|
"C:\\Program Files",
|
||||||
if llm_model is None:
|
"C:\\Program Files (x86)",
|
||||||
warn(
|
]
|
||||||
"'mlm_model' is deprecated, and was renamed 'llm_model'.",
|
):
|
||||||
DeprecationWarning,
|
exiftool_path = candidate
|
||||||
)
|
|
||||||
llm_model = mlm_model
|
|
||||||
mlm_model = None
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
|
|
||||||
)
|
|
||||||
#############################
|
|
||||||
|
|
||||||
self._llm_client = llm_client
|
self._llm_client = llm_client
|
||||||
self._llm_model = llm_model
|
self._llm_model = llm_model
|
||||||
@@ -1752,6 +1744,8 @@ class MarkItDown:
|
|||||||
ext = ext.strip()
|
ext = ext.strip()
|
||||||
if ext == "":
|
if ext == "":
|
||||||
return
|
return
|
||||||
|
if ext in extensions:
|
||||||
|
return
|
||||||
# if ext not in extensions:
|
# if ext not in extensions:
|
||||||
extensions.append(ext)
|
extensions.append(ext)
|
||||||
|
|
||||||
|
|||||||
@@ -6,8 +6,6 @@ import shutil
|
|||||||
import pytest
|
import pytest
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from warnings import catch_warnings, resetwarnings
|
|
||||||
|
|
||||||
from markitdown import MarkItDown
|
from markitdown import MarkItDown
|
||||||
|
|
||||||
skip_remote = (
|
skip_remote = (
|
||||||
@@ -277,18 +275,6 @@ def test_markitdown_local() -> None:
|
|||||||
reason="do not run if exiftool is not installed",
|
reason="do not run if exiftool is not installed",
|
||||||
)
|
)
|
||||||
def test_markitdown_exiftool() -> None:
|
def test_markitdown_exiftool() -> None:
|
||||||
# Test the automatic discovery of exiftool throws a warning
|
|
||||||
# and is disabled
|
|
||||||
try:
|
|
||||||
with catch_warnings(record=True) as w:
|
|
||||||
markitdown = MarkItDown()
|
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
|
||||||
assert len(w) == 1
|
|
||||||
assert w[0].category is DeprecationWarning
|
|
||||||
assert result.text_content.strip() == ""
|
|
||||||
finally:
|
|
||||||
resetwarnings()
|
|
||||||
|
|
||||||
# Test explicitly setting the location of exiftool
|
# Test explicitly setting the location of exiftool
|
||||||
which_exiftool = shutil.which("exiftool")
|
which_exiftool = shutil.which("exiftool")
|
||||||
markitdown = MarkItDown(exiftool_path=which_exiftool)
|
markitdown = MarkItDown(exiftool_path=which_exiftool)
|
||||||
@@ -306,40 +292,6 @@ def test_markitdown_exiftool() -> None:
|
|||||||
assert target in result.text_content
|
assert target in result.text_content
|
||||||
|
|
||||||
|
|
||||||
def test_markitdown_deprecation() -> None:
|
|
||||||
try:
|
|
||||||
with catch_warnings(record=True) as w:
|
|
||||||
test_client = object()
|
|
||||||
markitdown = MarkItDown(mlm_client=test_client)
|
|
||||||
assert len(w) == 1
|
|
||||||
assert w[0].category is DeprecationWarning
|
|
||||||
assert markitdown._llm_client == test_client
|
|
||||||
finally:
|
|
||||||
resetwarnings()
|
|
||||||
|
|
||||||
try:
|
|
||||||
with catch_warnings(record=True) as w:
|
|
||||||
markitdown = MarkItDown(mlm_model="gpt-4o")
|
|
||||||
assert len(w) == 1
|
|
||||||
assert w[0].category is DeprecationWarning
|
|
||||||
assert markitdown._llm_model == "gpt-4o"
|
|
||||||
finally:
|
|
||||||
resetwarnings()
|
|
||||||
|
|
||||||
try:
|
|
||||||
test_client = object()
|
|
||||||
markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
|
|
||||||
assert False
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
try:
|
|
||||||
markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
|
|
||||||
assert False
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_llm,
|
skip_llm,
|
||||||
reason="do not run llm tests without a key",
|
reason="do not run llm tests without a key",
|
||||||
@@ -364,5 +316,4 @@ if __name__ == "__main__":
|
|||||||
# test_markitdown_remote()
|
# test_markitdown_remote()
|
||||||
# test_markitdown_local()
|
# test_markitdown_local()
|
||||||
test_markitdown_exiftool()
|
test_markitdown_exiftool()
|
||||||
# test_markitdown_deprecation()
|
|
||||||
# test_markitdown_llm()
|
# test_markitdown_llm()
|
||||||
|
|||||||
Reference in New Issue
Block a user