Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
da73d64bfa |
@@ -17,6 +17,7 @@ At present, MarkItDown supports:
|
||||
- PowerPoint
|
||||
- Word
|
||||
- Excel
|
||||
- OneNote
|
||||
- Images (EXIF metadata and OCR)
|
||||
- Audio (EXIF metadata and speech transcription)
|
||||
- HTML
|
||||
@@ -82,6 +83,7 @@ At the moment, the following optional dependencies are available:
|
||||
* `[xls]` Installs dependencies for older Excel files
|
||||
* `[pdf]` Installs dependencies for PDF files
|
||||
* `[outlook]` Installs dependencies for Outlook messages
|
||||
* `[onenote]` Installs dependencies for OneNote .one files
|
||||
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
|
||||
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
|
||||
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
|
||||
|
||||
@@ -45,7 +45,8 @@ all = [
|
||||
"SpeechRecognition",
|
||||
"youtube-transcript-api",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-identity"
|
||||
"azure-identity",
|
||||
"one-extract",
|
||||
]
|
||||
pptx = ["python-pptx"]
|
||||
docx = ["mammoth"]
|
||||
@@ -53,6 +54,7 @@ xlsx = ["pandas", "openpyxl"]
|
||||
xls = ["pandas", "xlrd"]
|
||||
pdf = ["pdfminer.six"]
|
||||
outlook = ["olefile"]
|
||||
onenote = ["one-extract"]
|
||||
audio-transcription = ["pydub", "SpeechRecognition"]
|
||||
youtube-transcription = ["youtube-transcript-api"]
|
||||
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
|
||||
|
||||
@@ -30,6 +30,7 @@ from .converters import (
|
||||
BingSerpConverter,
|
||||
PdfConverter,
|
||||
DocxConverter,
|
||||
OneNoteConverter,
|
||||
XlsxConverter,
|
||||
XlsConverter,
|
||||
PptxConverter,
|
||||
@@ -158,6 +159,7 @@ class MarkItDown:
|
||||
self.register_converter(YouTubeConverter())
|
||||
self.register_converter(BingSerpConverter())
|
||||
self.register_converter(DocxConverter())
|
||||
self.register_converter(OneNoteConverter())
|
||||
self.register_converter(XlsxConverter())
|
||||
self.register_converter(XlsConverter())
|
||||
self.register_converter(PptxConverter())
|
||||
|
||||
@@ -11,6 +11,7 @@ from ._ipynb_converter import IpynbConverter
|
||||
from ._bing_serp_converter import BingSerpConverter
|
||||
from ._pdf_converter import PdfConverter
|
||||
from ._docx_converter import DocxConverter
|
||||
from ._onenote_converter import OneNoteConverter
|
||||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||
from ._pptx_converter import PptxConverter
|
||||
from ._image_converter import ImageConverter
|
||||
@@ -29,6 +30,7 @@ __all__ = [
|
||||
"BingSerpConverter",
|
||||
"PdfConverter",
|
||||
"DocxConverter",
|
||||
"OneNoteConverter",
|
||||
"XlsxConverter",
|
||||
"XlsConverter",
|
||||
"PptxConverter",
|
||||
|
||||
@@ -0,0 +1,87 @@
|
||||
import sys
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# one_extract is an optional extra, so a failed import must not break loading
# this module. An ImportError is captured here and only surfaced later, when
# convert() checks _dependency_exc_info.
try:
    import one_extract
except ImportError:
    # Keep the (type, value, traceback) triple so it can be chained later
    _dependency_exc_info = sys.exc_info()
else:
    _dependency_exc_info = None


# No MIME type prefixes are registered here
ACCEPTED_MIME_TYPE_PREFIXES = []

# File extensions (lowercase, with leading dot) this converter accepts
ACCEPTED_FILE_EXTENSIONS = [".one"]
|
||||
|
||||
|
||||
class OneNoteConverter(DocumentConverter):
    """
    Converts OneNote files to Markdown.

    Every notebook section is emitted as a level-1 heading and every page in
    it as a level-2 heading, followed by the page content run through
    HtmlConverter.
    """

    def __init__(self):
        super().__init__()
        # Page content is handed to this converter to produce Markdown
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Decide whether this converter should handle the stream.

        Returns True when the lowercased extension is listed in
        ACCEPTED_FILE_EXTENSIONS, or when the lowercased MIME type starts
        with one of ACCEPTED_MIME_TYPE_PREFIXES. A missing extension or
        MIME type is treated as an empty string.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a OneNote stream into a single Markdown document.

        Raises:
            MissingDependencyException: if one_extract could not be imported
                when this module was loaded; the original ImportError is
                attached as the cause.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".one",
                    feature="onenote",
                )
            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # Perform the conversion. Fragments are collected and joined once at
        # the end instead of growing a string with += inside nested loops.
        notebook = one_extract.Notebook(file_stream)
        fragments = []
        for section in notebook.sections:
            fragments.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                fragments.append(f"\n\n## {page.name}\n")
                # presumably page.content is HTML -- verify against one_extract
                page_markdown = self._html_converter.convert_string(
                    page.content
                ).markdown
                fragments.append(page_markdown.strip() + "\n\n")

        # NOTE(review): the HtmlConverter result above is read via `.markdown`,
        # while this result is built with `text_content=` -- confirm that
        # DocumentConverterResult accepts this keyword.
        return DocumentConverterResult(
            title=None,
            text_content="".join(fragments).strip(),
        )
|
||||
BIN
packages/markitdown/tests/test_files/test.one
vendored
Executable file
BIN
packages/markitdown/tests/test_files/test.one
vendored
Executable file
Binary file not shown.
Reference in New Issue
Block a user