8 Commits
main ... v0.0.2

Author SHA1 Message Date
Adam Fourney
e58bc486ee Added missing comma. 2025-03-07 16:18:47 -08:00
afourney
81ef601c09 Removed deprecation and other warnings. (#1105) 2025-03-07 16:17:03 -08:00
afourney
518b12c1fb Addresses #1068 (#1101) 2025-03-07 15:46:30 -08:00
Adam Fourney
8eaf5a1da9 Clean up README.md 2025-03-05 21:35:08 -08:00
afourney
38c924793c Bump version (#1095) 2025-03-05 21:30:56 -08:00
afourney
b9526d5e47 Bump version. (#1075) 2025-02-28 07:30:46 -08:00
Hieu Lam
519fe172aa Unable to convert HTML to Markdown (#1072)
* feat: issue where inherited function from `markdownify.MarkdownConverter` doesn't have `current_tags` leading to error using `kwargs`, also set default value for `convert_as_inline`
2025-02-28 00:57:41 -08:00
Adam Fourney
abe9752438 Bumped version 2025-02-10 16:01:17 -08:00
4 changed files with 48 additions and 139 deletions

View File

@@ -87,42 +87,6 @@ print(result.text_content)
docker build -t markitdown:latest . docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
``` ```
<details>
<summary>Batch Processing Multiple Files</summary>
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
```python convert.py
from markitdown import MarkItDown
from openai import OpenAI
import os
client = OpenAI(api_key="your-api-key-here")
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
for file in files_to_convert:
print(f"\nConverting {file}...")
try:
md_file = os.path.splitext(file)[0] + '.md'
result = md.convert(file)
with open(md_file, 'w') as f:
f.write(result.text_content)
print(f"Successfully converted {file} to {md_file}")
except Exception as e:
print(f"Error converting {file}: {str(e)}")
print("\nAll conversions completed!")
```
2. Place the script in the same directory as your files
3. Install the required packages, such as openai
4. Run the script: `python convert.py`
Note that original files will remain unchanged and new markdown files are created with the same base name.
</details>
## Contributing ## Contributing

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.0.1a3" __version__ = "0.0.2"

View File

@@ -17,7 +17,7 @@ from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from pathlib import Path from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings from warnings import warn, filterwarnings
import mammoth import mammoth
import markdownify import markdownify
@@ -51,21 +51,14 @@ mimetypes.add_type("text/csv", ".csv")
# Optional Transcription support # Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False IS_AUDIO_TRANSCRIPTION_CAPABLE = False
filterwarnings("ignore", message=r".*Couldn\'t find ffmpeg or avconv.*", module="pydub")
try: try:
# Using warnings' catch_warnings to catch import pydub
# pydub's warning of ffmpeg or avconv missing
with catch_warnings(record=True) as w:
import pydub
if w:
raise ModuleNotFoundError
import speech_recognition as sr import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError: except ModuleNotFoundError:
pass pass
finally:
resetwarnings()
# Optional YouTube transcription support # Optional YouTube transcription support
try: try:
@@ -91,7 +84,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
# Explicitly cast options to the expected type if necessary # Explicitly cast options to the expected type if necessary
super().__init__(**options) super().__init__(**options)
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str: def convert_hn(
self,
n: int,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
"""Same as usual, but be sure to start with a new line""" """Same as usual, but be sure to start with a new line"""
if not convert_as_inline: if not convert_as_inline:
if not re.search(r"^\n", text): if not re.search(r"^\n", text):
@@ -99,7 +99,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
def convert_a(self, el: Any, text: str, convert_as_inline: bool): def convert_a(
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
):
"""Same as usual converter, but removes Javascript links and escapes URIs.""" """Same as usual converter, but removes Javascript links and escapes URIs."""
prefix, suffix, text = markdownify.chomp(text) # type: ignore prefix, suffix, text = markdownify.chomp(text) # type: ignore
if not text: if not text:
@@ -135,7 +137,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
else text else text
) )
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str: def convert_img(
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
) -> str:
"""Same as usual converter, but removes data URIs""" """Same as usual converter, but removes data URIs"""
alt = el.attrs.get("alt", None) or "" alt = el.attrs.get("alt", None) or ""
@@ -963,18 +967,6 @@ class MediaConverter(DocumentConverter):
def _get_metadata(self, local_path, exiftool_path=None): def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path: if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)
return None return None
else: else:
try: try:
@@ -1077,6 +1069,14 @@ class Mp3Converter(WavConverter):
handle, temp_path = tempfile.mkstemp(suffix=".wav") handle, temp_path = tempfile.mkstemp(suffix=".wav")
os.close(handle) os.close(handle)
try: try:
# Check if pydub defaulted to ffmpeg
if pydub.AudioSegment.converter == "ffmpeg" and not shutil.which(
"ffmpeg"
):
warn(
"pydub: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work",
RuntimeWarning,
)
sound = pydub.AudioSegment.from_mp3(local_path) sound = pydub.AudioSegment.from_mp3(local_path)
sound.export(temp_path, format="wav") sound.export(temp_path, format="wav")
@@ -1487,34 +1487,26 @@ class MarkItDown:
if exiftool_path is None: if exiftool_path is None:
exiftool_path = os.environ.get("EXIFTOOL_PATH") exiftool_path = os.environ.get("EXIFTOOL_PATH")
# Handle deprecation notices # Still none? Check well-known paths
############################# if exiftool_path is None:
if mlm_client is not None: candidate = shutil.which("exiftool")
if llm_client is None: if candidate:
warn( candidate = os.path.abspath(candidate)
"'mlm_client' is deprecated, and was renamed 'llm_client'.", if any(
DeprecationWarning, d == os.path.dirname(candidate)
) for d in [
llm_client = mlm_client "/usr/bin",
mlm_client = None "/usr/local/bin",
else: "/opt",
raise ValueError( "/opt/bin",
"'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead." "/opt/local/bin",
) "/opt/homebrew/bin",
"C:\\Windows\\System32",
if mlm_model is not None: "C:\\Program Files",
if llm_model is None: "C:\\Program Files (x86)",
warn( ]
"'mlm_model' is deprecated, and was renamed 'llm_model'.", ):
DeprecationWarning, exiftool_path = candidate
)
llm_model = mlm_model
mlm_model = None
else:
raise ValueError(
"'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
)
#############################
self._llm_client = llm_client self._llm_client = llm_client
self._llm_model = llm_model self._llm_model = llm_model
@@ -1752,6 +1744,8 @@ class MarkItDown:
ext = ext.strip() ext = ext.strip()
if ext == "": if ext == "":
return return
if ext in extensions:
return
# if ext not in extensions: # if ext not in extensions:
extensions.append(ext) extensions.append(ext)

View File

@@ -6,8 +6,6 @@ import shutil
import pytest import pytest
import requests import requests
from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown from markitdown import MarkItDown
skip_remote = ( skip_remote = (
@@ -277,18 +275,6 @@ def test_markitdown_local() -> None:
reason="do not run if exiftool is not installed", reason="do not run if exiftool is not installed",
) )
def test_markitdown_exiftool() -> None: def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()
# Test explicitly setting the location of exiftool # Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool") which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool) markitdown = MarkItDown(exiftool_path=which_exiftool)
@@ -306,40 +292,6 @@ def test_markitdown_exiftool() -> None:
assert target in result.text_content assert target in result.text_content
def test_markitdown_deprecation() -> None:
try:
with catch_warnings(record=True) as w:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client)
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_client == test_client
finally:
resetwarnings()
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown(mlm_model="gpt-4o")
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_model == "gpt-4o"
finally:
resetwarnings()
try:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
assert False
except ValueError:
pass
try:
markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
assert False
except ValueError:
pass
@pytest.mark.skipif( @pytest.mark.skipif(
skip_llm, skip_llm,
reason="do not run llm tests without a key", reason="do not run llm tests without a key",
@@ -364,5 +316,4 @@ if __name__ == "__main__":
# test_markitdown_remote() # test_markitdown_remote()
# test_markitdown_local() # test_markitdown_local()
test_markitdown_exiftool() test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm() # test_markitdown_llm()