Initial work to port #55 to MarkItDown 0.1.X

This commit is contained in:
Adam Fourney
2025-03-06 13:17:58 -08:00
parent 82d84e3edd
commit da73d64bfa
6 changed files with 96 additions and 1 deletions

View File

@@ -17,6 +17,7 @@ At present, MarkItDown supports:
- PowerPoint
- Word
- Excel
- OneNote
- Images (EXIF metadata and OCR)
- Audio (EXIF metadata and speech transcription)
- HTML
@@ -82,6 +83,7 @@ At the moment, the following optional dependencies are available:
* `[xls]` Installs dependencies for older Excel files
* `[pdf]` Installs dependencies for PDF files
* `[outlook]` Installs dependencies for Outlook messages
* `[onenote]` Installs dependencies for OneNote .one files
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription

View File

@@ -45,7 +45,8 @@ all = [
"SpeechRecognition",
"youtube-transcript-api",
"azure-ai-documentintelligence",
"azure-identity"
"azure-identity",
"one-extract",
]
pptx = ["python-pptx"]
docx = ["mammoth"]
@@ -53,6 +54,7 @@ xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
outlook = ["olefile"]
onenote = ["one-extract"]
audio-transcription = ["pydub", "SpeechRecognition"]
youtube-transcription = ["youtube-transcript-api"]
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]

View File

@@ -30,6 +30,7 @@ from .converters import (
BingSerpConverter,
PdfConverter,
DocxConverter,
OneNoteConverter,
XlsxConverter,
XlsConverter,
PptxConverter,
@@ -158,6 +159,7 @@ class MarkItDown:
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(OneNoteConverter())
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())

View File

@@ -11,6 +11,7 @@ from ._ipynb_converter import IpynbConverter
from ._bing_serp_converter import BingSerpConverter
from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter
from ._onenote_converter import OneNoteConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter
@@ -29,6 +30,7 @@ __all__ = [
"BingSerpConverter",
"PdfConverter",
"DocxConverter",
"OneNoteConverter",
"XlsxConverter",
"XlsConverter",
"PptxConverter",

View File

@@ -0,0 +1,87 @@
import sys
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import one_extract
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
ACCEPTED_MIME_TYPE_PREFIXES = []
ACCEPTED_FILE_EXTENSIONS = [".one"]
class OneNoteConverter(DocumentConverter):
    """
    Converts OneNote files to Markdown.

    Every notebook section is emitted as a level-1 heading and every page in
    it as a level-2 heading, followed by the page content converted through
    HtmlConverter.
    """

    def __init__(self):
        super().__init__()
        # A single HTML converter instance is reused for every page
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream looks like a OneNote file.

        The decision is based only on `stream_info`: the (lower-cased)
        extension must be in ACCEPTED_FILE_EXTENSIONS, or the (lower-cased)
        mimetype must start with one of ACCEPTED_MIME_TYPE_PREFIXES.
        `file_stream` is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a OneNote notebook read from `file_stream` into Markdown.

        Raises:
            MissingDependencyException: if `one_extract` could not be imported
                when this module was loaded. The original ImportError is
                attached as the cause.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".one",
                    feature="onenote",
                )
            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # Perform the conversion. Fragments are collected in a list and joined
        # once, instead of repeatedly growing a string.
        fragments = []
        notebook = one_extract.Notebook(file_stream)
        for section in notebook.sections:
            fragments.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                fragments.append(f"\n\n## {page.name}\n")
                # page.content is handed to the HTML converter (presumably it
                # is an HTML string -- confirm against one_extract)
                page_md = self._html_converter.convert_string(page.content).markdown
                fragments.append(page_md.strip() + "\n\n")

        # The result object is built from `markdown` (the same attribute read
        # from the HTML converter's result above); `text_content` is not a
        # constructor argument in the stream-based converter API.
        return DocumentConverterResult(
            markdown="".join(fragments).strip(),
            title=None,
        )

BIN
packages/markitdown/tests/test_files/test.one vendored Executable file

Binary file not shown.