diff --git a/README.md b/README.md index 0aa788c..517283f 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ At present, MarkItDown supports: - PowerPoint - Word - Excel +- OneNote - Images (EXIF metadata and OCR) - Audio (EXIF metadata and speech transcription) - HTML @@ -82,6 +83,7 @@ At the moment, the following optional dependencies are available: * `[xls]` Installs dependencies for older Excel files * `[pdf]` Installs dependencies for PDF files * `[outlook]` Installs dependencies for Outlook messages +* `[onenote]` Installs dependencies for OneNote .one files * `[az-doc-intel]` Installs dependencies for Azure Document Intelligence * `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files * `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index d0f515e..071cd24 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -45,7 +45,8 @@ all = [ "SpeechRecognition", "youtube-transcript-api", "azure-ai-documentintelligence", - "azure-identity" + "azure-identity", + "one-extract", ] pptx = ["python-pptx"] docx = ["mammoth"] @@ -53,6 +54,7 @@ xlsx = ["pandas", "openpyxl"] xls = ["pandas", "xlrd"] pdf = ["pdfminer.six"] outlook = ["olefile"] +onenote = ["one-extract"] audio-transcription = ["pydub", "SpeechRecognition"] youtube-transcription = ["youtube-transcript-api"] az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"] diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f32b236..d6cdbb5 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -30,6 +30,7 @@ from .converters import ( BingSerpConverter, PdfConverter, DocxConverter, + OneNoteConverter, XlsxConverter, XlsConverter, PptxConverter, @@ -158,6 +159,7 @@ class 
"""Converter that renders OneNote ``.one`` files as Markdown."""

import sys

from typing import BinaryIO, Any

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
    import one_extract
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()


# No MIME type prefixes are listed, so acceptance currently depends on the
# file extension alone.
ACCEPTED_MIME_TYPE_PREFIXES = []

ACCEPTED_FILE_EXTENSIONS = [".one"]


class OneNoteConverter(DocumentConverter):
    """
    Converts OneNote files to Markdown.

    Each section becomes a level-1 heading and each page a level-2 heading.
    Page content is passed through ``HtmlConverter.convert_string`` to turn
    it into Markdown.
    """

    def __init__(self):
        super().__init__()
        # Reused for every page; page content is handed to it as a string.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream looks like a OneNote file.

        The decision is based on ``stream_info`` only: a case-insensitive
        match of the extension against ACCEPTED_FILE_EXTENSIONS, or of the
        MIME type against ACCEPTED_MIME_TYPE_PREFIXES. ``file_stream`` is not
        read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a OneNote file into a single Markdown document.

        Raises:
            MissingDependencyException: if ``one_extract`` could not be
                imported; the original ImportError is chained as the cause.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".one",
                    feature="onenote",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # Perform the conversion.
        # NOTE(review): assumes one_extract.Notebook accepts a binary stream
        # and exposes `sections` -> `pages` with `name` / `content`
        # attributes -- confirm against the installed package.
        notebook = one_extract.Notebook(file_stream)

        # Collect the fragments and join once instead of growing a string
        # with += inside the nested loops.
        parts = []
        for section in notebook.sections:
            parts.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                parts.append(f"\n\n## {page.name}\n")
                page_md = self._html_converter.convert_string(page.content).markdown
                parts.append(page_md.strip() + "\n\n")

        # The HTML converter's result is read through `.markdown` above, so
        # the result built here uses the same `markdown` field rather than
        # the `text_content` keyword.
        return DocumentConverterResult(
            markdown="".join(parts).strip(),
            title=None,
        )