Initial work to port #55 to MarkItDown 0.1.X

Fixed formatting. (#1098 )
Fixed deepcopy failure when passing llm_client (#1089 )
2025-03-06 13:17:58 -08:00 · 2025-03-05 23:30:29 -08:00 · 2025-03-05 23:25:37 -08:00 · 2025-03-05 23:21:10 -08:00 · 2025-03-05 23:10:21 -08:00
6 changed files with 100 additions and 5 deletions
--- a/README.md
+++ b/README.md
@@ -5,8 +5,8 @@
 [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
 > [!IMPORTANT]
-> Breaking changes between 0.0.1 to 0.0.2:
+> Breaking changes between 0.0.1 to 0.1.0:
-> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install markitdown[all]` to have backward-compatible behavior. 
+> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]~=0.1.0a1'` to have backward-compatible behavior. 
 > * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
 MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
@@ -17,6 +17,7 @@ At present, MarkItDown supports:
 - PowerPoint
 - Word
 - Excel
 - OneNote
 - Images (EXIF metadata and OCR)
 - Audio (EXIF metadata and speech transcription)
 - HTML
@@ -36,7 +37,7 @@ are also highly token-efficient.
 ## Installation
-To install MarkItDown, use pip: `pip install markitdown[all]`. Alternatively, you can install it from the source:
+To install MarkItDown, use pip: `pip install 'markitdown[all]~=0.1.0a1'`. Alternatively, you can install it from the source:
 ```bash
 git clone git@github.com:microsoft/markitdown.git
@@ -82,6 +83,7 @@ At the moment, the following optional dependencies are available:
 * `[xls]` Installs dependencies for older Excel files
 * `[pdf]` Installs dependencies for PDF files
 * `[outlook]` Installs dependencies for Outlook messages
 * `[onenote]` Installs dependencies for OneNote .one files
 * `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
 * `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
 * `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -45,7 +45,8 @@ all = [
  "SpeechRecognition",
  "youtube-transcript-api",
  "azure-ai-documentintelligence",
-  "azure-identity"
+  "azure-identity",
  "one-extract",
 ]
 pptx = ["python-pptx"]
 docx = ["mammoth"]
@@ -53,6 +54,7 @@ xlsx = ["pandas", "openpyxl"]
 xls = ["pandas", "xlrd"]
 pdf = ["pdfminer.six"]
 outlook = ["olefile"]
 onenote = ["one-extract"]
 audio-transcription = ["pydub", "SpeechRecognition"]
 youtube-transcription = ["youtube-transcript-api"]
 az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -30,6 +30,7 @@ from .converters import (
    BingSerpConverter,
    PdfConverter,
    DocxConverter,
    OneNoteConverter,
    XlsxConverter,
    XlsConverter,
    PptxConverter,
@@ -158,6 +159,7 @@ class MarkItDown:
            self.register_converter(YouTubeConverter())
            self.register_converter(BingSerpConverter())
            self.register_converter(DocxConverter())
            self.register_converter(OneNoteConverter())
            self.register_converter(XlsxConverter())
            self.register_converter(XlsConverter())
            self.register_converter(PptxConverter())
@@ -455,7 +457,7 @@ class MarkItDown:
                    cur_pos == file_stream.tell()
                ), f"File stream position should NOT change between guess iterations"
-                _kwargs = copy.deepcopy(kwargs)
+                _kwargs = {k: v for k, v in kwargs.items()}
                # Copy any additional global options
                if "llm_client" not in _kwargs and self._llm_client is not None:
--- a/packages/markitdown/src/markitdown/converters/init.py
+++ b/packages/markitdown/src/markitdown/converters/init.py
@@ -11,6 +11,7 @@ from ._ipynb_converter import IpynbConverter
 from ._bing_serp_converter import BingSerpConverter
 from ._pdf_converter import PdfConverter
 from ._docx_converter import DocxConverter
 from ._onenote_converter import OneNoteConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._image_converter import ImageConverter
@@ -29,6 +30,7 @@ __all__ = [
    "BingSerpConverter",
    "PdfConverter",
    "DocxConverter",
    "OneNoteConverter",
    "XlsxConverter",
    "XlsConverter",
    "PptxConverter",
--- a/packages/markitdown/src/markitdown/converters/_onenote_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_onenote_converter.py
@@ -0,0 +1,87 @@
 import sys
 from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Try loading optional (but in this case, required) dependencies.
 # An ImportError is not raised at import time; it is recorded here and
 # reported later, when OneNoteConverter.convert() is actually called.
 _dependency_exc_info = None
 try:
    import one_extract
 except ImportError:
    # Preserve the error and stack trace (the sys.exc_info() tuple) for later
    _dependency_exc_info = sys.exc_info()
 # MIME type prefixes checked by OneNoteConverter.accepts(). Currently empty,
 # so streams are matched by file extension only.
 ACCEPTED_MIME_TYPE_PREFIXES = []
 # File extensions checked by OneNoteConverter.accepts(); compared against the
 # lower-cased stream extension.
 ACCEPTED_FILE_EXTENSIONS = [".one"]
 class OneNoteConverter(DocumentConverter):
    """
    Converts OneNote files to Markdown.

    Every notebook section is emitted as a level-1 heading and every page as
    a level-2 heading, followed by the page content run through HtmlConverter.
    """

    def __init__(self):
        super().__init__()
        # Page content is converted with HtmlConverter.convert_string().
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream looks like a OneNote file.

        The decision is based only on stream_info: the lower-cased extension
        must be in ACCEPTED_FILE_EXTENSIONS, or the lower-cased MIME type must
        start with one of ACCEPTED_MIME_TYPE_PREFIXES. file_stream is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple of candidates and returns False for
        # an empty tuple, which matches the behavior of looping over prefixes.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a OneNote stream to Markdown.

        Returns:
            A DocumentConverterResult with no title and the stripped Markdown
            text of all sections and pages.

        Raises:
            MissingDependencyException: if the optional `one_extract` package
                could not be imported; chained from the original ImportError.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".one",
                    feature="onenote",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # Perform the conversion.
        # NOTE(review): assumes one_extract exposes Notebook(file_stream) with
        # .sections / .pages and that page.content is HTML -- confirm against
        # the library before release.
        # Fragments are collected and joined once instead of growing a string
        # with += inside nested loops.
        fragments = []
        notebook = one_extract.Notebook(file_stream)
        for section in notebook.sections:
            fragments.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                fragments.append(f"\n\n## {page.name}\n")
                page_markdown = self._html_converter.convert_string(
                    page.content
                ).markdown
                fragments.append(page_markdown.strip() + "\n\n")

        # NOTE(review): the HtmlConverter result above is read via `.markdown`
        # while this result is built with `text_content=`; confirm that
        # DocumentConverterResult accepts this keyword.
        return DocumentConverterResult(
            title=None,
            text_content="".join(fragments).strip(),
        )
--- a/packages/markitdown/tests/test_files/test.one
+++ b/packages/markitdown/tests/test_files/test.one
Author	SHA1	Message	Date
Adam Fourney	da73d64bfa	Initial work to port #55 to MarkItDown 0.1.X	2025-03-06 13:17:58 -08:00
afourney	82d84e3edd	Fixed formatting. (#1098 )	2025-03-05 23:30:29 -08:00
scalabreseGD	36c4bc9ec3	Fixed deepcopy failure when passing llm_client (#1089 ) Co-authored-by: afourney <adamfo@microsoft.com>	2025-03-05 23:25:37 -08:00
Andrea Pietrobon	80baa5db18	fix(README): correct pip install command formatting (#1090 ) Added missing quotes around `markitdown[all]` in the installation command to ensure proper package resolution by pip.	2025-03-05 23:21:10 -08:00
Adam Fourney	00a65e8f8b	Fixed version in README.	2025-03-05 23:10:21 -08:00