Added missing comma.

Removed deprecation and other warnings. (#1105 )
Addresses #1068 (#1101 )
2025-03-07 16:18:47 -08:00 · 2025-03-07 16:17:03 -08:00 · 2025-03-07 15:46:30 -08:00 · 2025-03-05 21:35:08 -08:00 · 2025-03-05 21:30:56 -08:00 · 2025-02-28 07:30:46 -08:00
4 changed files with 48 additions and 139 deletions
--- a/README.md
+++ b/README.md
@@ -87,42 +87,6 @@ print(result.text_content)
 docker build -t markitdown:latest .
 docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
 ```
 <details>
 <summary>Batch Processing Multiple Files</summary>
 This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
 ```python convert.py
 # Batch-convert every supported file in the current directory to Markdown.
 # Each output file keeps the input's base name with a ".md" extension.
 from markitdown import MarkItDown
 from openai import OpenAI
 import os
 # Placeholder key: substitute a real API key before running.
 client = OpenAI(api_key="your-api-key-here")
 md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
 # Only files whose (lower-cased) names end in one of these are converted.
 supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
 files_to_convert = [name for name in os.listdir('.') if name.lower().endswith(supported_extensions)]
 for file in files_to_convert:
    print(f"\nConverting {file}...")
    try:
        md_file = os.path.splitext(file)[0] + '.md'
        result = md.convert(file)
        # Write UTF-8 explicitly: the platform default encoding (e.g. cp1252
        # on Windows) can fail on non-ASCII characters in the converted text.
        with open(md_file, 'w', encoding='utf-8') as f:
            f.write(result.text_content)
        print(f"Successfully converted {file} to {md_file}")
    except Exception as e:
        # Best-effort batch: report the failure and move on to the next file.
        print(f"Error converting {file}: {e}")
 print("\nAll conversions completed!")
 ```
 2. Place the script in the same directory as your files
 3. Install the required packages (for example, `openai`)
 4. Run the script: `python convert.py`
 Note that original files will remain unchanged and new markdown files are created with the same base name.
 </details>
 ## Contributing
--- a/src/markitdown/about.py
+++ b/src/markitdown/about.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.1a3"
+__version__ = "0.0.2"
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -17,7 +17,7 @@ from xml.dom import minidom
 from typing import Any, Dict, List, Optional, Union
 from pathlib import Path
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
-from warnings import warn, resetwarnings, catch_warnings
+from warnings import warn, filterwarnings
 import mammoth
 import markdownify
@@ -51,21 +51,14 @@ mimetypes.add_type("text/csv", ".csv")
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 filterwarnings("ignore", message=r".*Couldn\'t find ffmpeg or avconv.*", module="pydub")
 try:
-    # Using warnings' catch_warnings to catch
+    import pydub
    # pydub's warning of ffmpeg or avconv missing
    with catch_warnings(record=True) as w:
        import pydub
        if w:
            raise ModuleNotFoundError
    import speech_recognition as sr
    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
 except ModuleNotFoundError:
    pass
 finally:
    resetwarnings()
 # Optional YouTube transcription support
 try:
@@ -91,7 +84,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)
-    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
+    def convert_hn(
        self,
        n: int,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            if not re.search(r"^\n", text):
@@ -99,7 +99,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
-    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
+    def convert_a(
        self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
    ):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
@@ -135,7 +137,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
            else text
        )
-    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
+    def convert_img(
        self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
    ) -> str:
        """Same as usual converter, but removes data URIs"""
        alt = el.attrs.get("alt", None) or ""
@@ -963,18 +967,6 @@ class MediaConverter(DocumentConverter):
    def _get_metadata(self, local_path, exiftool_path=None):
        if not exiftool_path:
            which_exiftool = shutil.which("exiftool")
            if which_exiftool:
                warn(
                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., 
    md = MarkItDown(exiftool_path="{which_exiftool}")
 This warning will be removed in future releases.
 """,
                    DeprecationWarning,
                )
            return None
        else:
            try:
@@ -1077,6 +1069,14 @@ class Mp3Converter(WavConverter):
            handle, temp_path = tempfile.mkstemp(suffix=".wav")
            os.close(handle)
            try:
                # Check if pydub defaulted to ffmpeg
                if pydub.AudioSegment.converter == "ffmpeg" and not shutil.which(
                    "ffmpeg"
                ):
                    warn(
                        "pydub: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work",
                        RuntimeWarning,
                    )
                sound = pydub.AudioSegment.from_mp3(local_path)
                sound.export(temp_path, format="wav")
@@ -1487,34 +1487,26 @@ class MarkItDown:
        if exiftool_path is None:
            exiftool_path = os.environ.get("EXIFTOOL_PATH")
-        # Handle deprecation notices
+        # Still none? Check well-known paths
-        #############################
+        if exiftool_path is None:
-        if mlm_client is not None:
+            candidate = shutil.which("exiftool")
-            if llm_client is None:
+            if candidate:
-                warn(
+                candidate = os.path.abspath(candidate)
-                    "'mlm_client' is deprecated, and was renamed 'llm_client'.",
+                if any(
-                    DeprecationWarning,
+                    d == os.path.dirname(candidate)
-                )
+                    for d in [
-                llm_client = mlm_client
+                        "/usr/bin",
-                mlm_client = None
+                        "/usr/local/bin",
-            else:
+                        "/opt",
-                raise ValueError(
+                        "/opt/bin",
-                    "'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
+                        "/opt/local/bin",
-                )
+                        "/opt/homebrew/bin",
-
+                        "C:\\Windows\\System32",
-        if mlm_model is not None:
+                        "C:\\Program Files",
-            if llm_model is None:
+                        "C:\\Program Files (x86)",
-                warn(
+                    ]
-                    "'mlm_model' is deprecated, and was renamed 'llm_model'.",
+                ):
-                    DeprecationWarning,
+                    exiftool_path = candidate
                )
                llm_model = mlm_model
                mlm_model = None
            else:
                raise ValueError(
                    "'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
                )
        #############################
        self._llm_client = llm_client
        self._llm_model = llm_model
@@ -1752,6 +1744,8 @@ class MarkItDown:
        ext = ext.strip()
        if ext == "":
            return
        if ext in extensions:
            return
        # if ext not in extensions:
        extensions.append(ext)
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -6,8 +6,6 @@ import shutil
 import pytest
 import requests
 from warnings import catch_warnings, resetwarnings
 from markitdown import MarkItDown
 skip_remote = (
@@ -277,18 +275,6 @@ def test_markitdown_local() -> None:
    reason="do not run if exiftool is not installed",
 )
 def test_markitdown_exiftool() -> None:
    # Test the automatic discovery of exiftool throws a warning
    # and is disabled
    try:
        with catch_warnings(record=True) as w:
            markitdown = MarkItDown()
            result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert result.text_content.strip() == ""
    finally:
        resetwarnings()
    # Test explicitly setting the location of exiftool
    which_exiftool = shutil.which("exiftool")
    markitdown = MarkItDown(exiftool_path=which_exiftool)
@@ -306,40 +292,6 @@ def test_markitdown_exiftool() -> None:
        assert target in result.text_content
 def test_markitdown_deprecation() -> None:
    # Exercises the legacy 'mlm_client' / 'mlm_model' keyword arguments of
    # MarkItDown: each alone must warn once, each combined with its 'llm_*'
    # counterpart must raise ValueError.

    # Case 1: only 'mlm_client' is given. Warnings raised inside the block are
    # recorded in `w`; exactly one DeprecationWarning is expected and the
    # client must be stored on _llm_client.
    try:
        with catch_warnings(record=True) as w:
            test_client = object()
            markitdown = MarkItDown(mlm_client=test_client)
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert markitdown._llm_client == test_client
    finally:
        resetwarnings()
    # Case 2: only 'mlm_model' is given. Same expectations, with the value
    # stored on _llm_model.
    try:
        with catch_warnings(record=True) as w:
            markitdown = MarkItDown(mlm_model="gpt-4o")
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert markitdown._llm_model == "gpt-4o"
    finally:
        resetwarnings()
    # Case 3: 'mlm_client' together with 'llm_client' must raise ValueError.
    # Reaching `assert False` means no exception was raised; the resulting
    # AssertionError is not caught by `except ValueError`, so the test fails.
    try:
        test_client = object()
        markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
        assert False
    except ValueError:
        pass
    # Case 4: 'mlm_model' together with 'llm_model' must raise ValueError
    # (same pattern as case 3).
    try:
        markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
        assert False
    except ValueError:
        pass
@pytest.mark.skipif(
    skip_llm,
    reason="do not run llm tests without a key",
@@ -364,5 +316,4 @@ if __name__ == "__main__":
    # test_markitdown_remote()
    # test_markitdown_local()
    test_markitdown_exiftool()
    # test_markitdown_deprecation()
    # test_markitdown_llm()
Author	SHA1	Message	Date
Adam Fourney	e58bc486ee	Added missing comma.	2025-03-07 16:18:47 -08:00
afourney	81ef601c09	Removed deprecation and other warnings. (#1105 )	2025-03-07 16:17:03 -08:00
afourney	518b12c1fb	Addresses #1068 (#1101 )	2025-03-07 15:46:30 -08:00
Adam Fourney	8eaf5a1da9	Clean up README.md	2025-03-05 21:35:08 -08:00
afourney	38c924793c	Bump version (#1095 )	2025-03-05 21:30:56 -08:00
afourney	b9526d5e47	Bump version. (#1075 )	2025-02-28 07:30:46 -08:00
Hieu Lam	519fe172aa	Unable to convert HTML to Markdown (#1072 ) * feat: issue where inherited function from `markdownify.MarkdownConverter` doesn't have `current_tags` leading to error using `kwargs`, also set default value for `convert_as_inline`	2025-02-28 00:57:41 -08:00
Adam Fourney	abe9752438	Bumped version	2025-02-10 16:01:17 -08:00