Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
da73d64bfa | ||
|
|
82d84e3edd | ||
|
|
36c4bc9ec3 | ||
|
|
80baa5db18 | ||
|
|
00a65e8f8b |
@@ -5,8 +5,8 @@
|
|||||||
[](https://github.com/microsoft/autogen)
|
[](https://github.com/microsoft/autogen)
|
||||||
|
|
||||||
> [!IMPORTANT]
|
> [!IMPORTANT]
|
||||||
> Breaking changes between 0.0.1 and 0.0.2:
|
> Breaking changes between 0.0.1 and 0.1.0:
|
||||||
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install markitdown[all]` to have backward-compatible behavior.
|
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]~=0.1.0a1'` to have backward-compatible behavior.
|
||||||
> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
|
> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
|
||||||
|
|
||||||
MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
|
MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
|
||||||
@@ -17,6 +17,7 @@ At present, MarkItDown supports:
|
|||||||
- PowerPoint
|
- PowerPoint
|
||||||
- Word
|
- Word
|
||||||
- Excel
|
- Excel
|
||||||
|
- OneNote
|
||||||
- Images (EXIF metadata and OCR)
|
- Images (EXIF metadata and OCR)
|
||||||
- Audio (EXIF metadata and speech transcription)
|
- Audio (EXIF metadata and speech transcription)
|
||||||
- HTML
|
- HTML
|
||||||
@@ -36,7 +37,7 @@ are also highly token-efficient.
|
|||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
To install MarkItDown, use pip: `pip install markitdown[all]`. Alternatively, you can install it from the source:
|
To install MarkItDown, use pip: `pip install 'markitdown[all]~=0.1.0a1'`. Alternatively, you can install it from the source:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone git@github.com:microsoft/markitdown.git
|
git clone git@github.com:microsoft/markitdown.git
|
||||||
@@ -82,6 +83,7 @@ At the moment, the following optional dependencies are available:
|
|||||||
* `[xls]` Installs dependencies for older Excel files
|
* `[xls]` Installs dependencies for older Excel files
|
||||||
* `[pdf]` Installs dependencies for PDF files
|
* `[pdf]` Installs dependencies for PDF files
|
||||||
* `[outlook]` Installs dependencies for Outlook messages
|
* `[outlook]` Installs dependencies for Outlook messages
|
||||||
|
* `[onenote]` Installs dependencies for OneNote .one files
|
||||||
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
|
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
|
||||||
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
|
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
|
||||||
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
|
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
|
||||||
|
|||||||
@@ -45,7 +45,8 @@ all = [
|
|||||||
"SpeechRecognition",
|
"SpeechRecognition",
|
||||||
"youtube-transcript-api",
|
"youtube-transcript-api",
|
||||||
"azure-ai-documentintelligence",
|
"azure-ai-documentintelligence",
|
||||||
"azure-identity"
|
"azure-identity",
|
||||||
|
"one-extract",
|
||||||
]
|
]
|
||||||
pptx = ["python-pptx"]
|
pptx = ["python-pptx"]
|
||||||
docx = ["mammoth"]
|
docx = ["mammoth"]
|
||||||
@@ -53,6 +54,7 @@ xlsx = ["pandas", "openpyxl"]
|
|||||||
xls = ["pandas", "xlrd"]
|
xls = ["pandas", "xlrd"]
|
||||||
pdf = ["pdfminer.six"]
|
pdf = ["pdfminer.six"]
|
||||||
outlook = ["olefile"]
|
outlook = ["olefile"]
|
||||||
|
onenote = ["one-extract"]
|
||||||
audio-transcription = ["pydub", "SpeechRecognition"]
|
audio-transcription = ["pydub", "SpeechRecognition"]
|
||||||
youtube-transcription = ["youtube-transcript-api"]
|
youtube-transcription = ["youtube-transcript-api"]
|
||||||
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
|
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ from .converters import (
|
|||||||
BingSerpConverter,
|
BingSerpConverter,
|
||||||
PdfConverter,
|
PdfConverter,
|
||||||
DocxConverter,
|
DocxConverter,
|
||||||
|
OneNoteConverter,
|
||||||
XlsxConverter,
|
XlsxConverter,
|
||||||
XlsConverter,
|
XlsConverter,
|
||||||
PptxConverter,
|
PptxConverter,
|
||||||
@@ -158,6 +159,7 @@ class MarkItDown:
|
|||||||
self.register_converter(YouTubeConverter())
|
self.register_converter(YouTubeConverter())
|
||||||
self.register_converter(BingSerpConverter())
|
self.register_converter(BingSerpConverter())
|
||||||
self.register_converter(DocxConverter())
|
self.register_converter(DocxConverter())
|
||||||
|
self.register_converter(OneNoteConverter())
|
||||||
self.register_converter(XlsxConverter())
|
self.register_converter(XlsxConverter())
|
||||||
self.register_converter(XlsConverter())
|
self.register_converter(XlsConverter())
|
||||||
self.register_converter(PptxConverter())
|
self.register_converter(PptxConverter())
|
||||||
@@ -455,7 +457,7 @@ class MarkItDown:
|
|||||||
cur_pos == file_stream.tell()
|
cur_pos == file_stream.tell()
|
||||||
), f"File stream position should NOT change between guess iterations"
|
), f"File stream position should NOT change between guess iterations"
|
||||||
|
|
||||||
_kwargs = copy.deepcopy(kwargs)
|
_kwargs = {k: v for k, v in kwargs.items()}
|
||||||
|
|
||||||
# Copy any additional global options
|
# Copy any additional global options
|
||||||
if "llm_client" not in _kwargs and self._llm_client is not None:
|
if "llm_client" not in _kwargs and self._llm_client is not None:
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from ._ipynb_converter import IpynbConverter
|
|||||||
from ._bing_serp_converter import BingSerpConverter
|
from ._bing_serp_converter import BingSerpConverter
|
||||||
from ._pdf_converter import PdfConverter
|
from ._pdf_converter import PdfConverter
|
||||||
from ._docx_converter import DocxConverter
|
from ._docx_converter import DocxConverter
|
||||||
|
from ._onenote_converter import OneNoteConverter
|
||||||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||||
from ._pptx_converter import PptxConverter
|
from ._pptx_converter import PptxConverter
|
||||||
from ._image_converter import ImageConverter
|
from ._image_converter import ImageConverter
|
||||||
@@ -29,6 +30,7 @@ __all__ = [
|
|||||||
"BingSerpConverter",
|
"BingSerpConverter",
|
||||||
"PdfConverter",
|
"PdfConverter",
|
||||||
"DocxConverter",
|
"DocxConverter",
|
||||||
|
"OneNoteConverter",
|
||||||
"XlsxConverter",
|
"XlsxConverter",
|
||||||
"XlsConverter",
|
"XlsConverter",
|
||||||
"PptxConverter",
|
"PptxConverter",
|
||||||
|
|||||||
@@ -0,0 +1,87 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
from typing import BinaryIO, Any
|
||||||
|
|
||||||
|
from ._html_converter import HtmlConverter
|
||||||
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
# Save reporting of any exceptions for later
|
||||||
|
_dependency_exc_info = None
|
||||||
|
try:
|
||||||
|
import one_extract
|
||||||
|
except ImportError:
|
||||||
|
# Preserve the error and stack trace for later
|
||||||
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
|
||||||
|
ACCEPTED_MIME_TYPE_PREFIXES = []
|
||||||
|
|
||||||
|
ACCEPTED_FILE_EXTENSIONS = [".one"]
|
||||||
|
|
||||||
|
|
||||||
|
class OneNoteConverter(DocumentConverter):
    """
    Converts OneNote files to Markdown.

    Every notebook section is emitted as a level-1 heading and every page in
    that section as a level-2 heading, followed by the page content after it
    has been passed through ``HtmlConverter.convert_string``.
    """

    def __init__(self):
        super().__init__()
        # Reused for every page in convert().
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream's extension is listed in
        ACCEPTED_FILE_EXTENSIONS or its MIME type starts with one of
        ACCEPTED_MIME_TYPE_PREFIXES. Both values are compared lower-cased;
        a missing extension or MIME type is treated as an empty string.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Read a notebook from ``file_stream`` and render it as Markdown.

        Raises:
            MissingDependencyException: if importing ``one_extract`` failed
                when this module was loaded. The original ImportError is
                attached as the cause.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".one",
                    feature="onenote",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # Perform the conversion.
        # Fragments are collected and joined once, rather than growing a
        # string with "+=" inside the nested loops.
        parts = []
        notebook = one_extract.Notebook(file_stream)
        for section in notebook.sections:
            parts.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                parts.append(f"\n\n## {page.name}\n")
                # presumably page.content is HTML, since it is handed to the
                # HTML converter -- TODO confirm against one_extract
                parts.append(
                    self._html_converter.convert_string(page.content).markdown.strip()
                    + "\n\n"
                )

        # NOTE(review): the result above is read through `.markdown`, while
        # this result is built with `text_content=`; confirm the constructor
        # accepts that keyword.
        return DocumentConverterResult(
            title=None,
            text_content="".join(parts).strip(),
        )
|
||||||
BIN
packages/markitdown/tests/test_files/test.one
vendored
Executable file
BIN
packages/markitdown/tests/test_files/test.one
vendored
Executable file
Binary file not shown.
Reference in New Issue
Block a user