Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
da73d64bfa | ||
|
|
82d84e3edd | ||
|
|
36c4bc9ec3 | ||
|
|
80baa5db18 | ||
|
|
00a65e8f8b |
@@ -5,8 +5,8 @@
|
||||
[](https://github.com/microsoft/autogen)
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Breaking changes between 0.0.1 to 0.0.2:
|
||||
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install markitdown[all]` to have backward-compatible behavior.
|
||||
> Breaking changes between 0.0.1 and 0.1.0:
|
||||
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]~=0.1.0a1'` to have backward-compatible behavior.
|
||||
> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
|
||||
|
||||
MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
|
||||
@@ -17,6 +17,7 @@ At present, MarkItDown supports:
|
||||
- PowerPoint
|
||||
- Word
|
||||
- Excel
|
||||
- OneNote
|
||||
- Images (EXIF metadata and OCR)
|
||||
- Audio (EXIF metadata and speech transcription)
|
||||
- HTML
|
||||
@@ -36,7 +37,7 @@ are also highly token-efficient.
|
||||
|
||||
## Installation
|
||||
|
||||
To install MarkItDown, use pip: `pip install markitdown[all]`. Alternatively, you can install it from the source:
|
||||
To install MarkItDown, use pip: `pip install 'markitdown[all]~=0.1.0a1'`. Alternatively, you can install it from the source:
|
||||
|
||||
```bash
|
||||
git clone git@github.com:microsoft/markitdown.git
|
||||
@@ -82,6 +83,7 @@ At the moment, the following optional dependencies are available:
|
||||
* `[xls]` Installs dependencies for older Excel files
|
||||
* `[pdf]` Installs dependencies for PDF files
|
||||
* `[outlook]` Installs dependencies for Outlook messages
|
||||
* `[onenote]` Installs dependencies for OneNote .one files
|
||||
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
|
||||
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
|
||||
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
|
||||
|
||||
@@ -45,7 +45,8 @@ all = [
|
||||
"SpeechRecognition",
|
||||
"youtube-transcript-api",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-identity"
|
||||
"azure-identity",
|
||||
"one-extract",
|
||||
]
|
||||
pptx = ["python-pptx"]
|
||||
docx = ["mammoth"]
|
||||
@@ -53,6 +54,7 @@ xlsx = ["pandas", "openpyxl"]
|
||||
xls = ["pandas", "xlrd"]
|
||||
pdf = ["pdfminer.six"]
|
||||
outlook = ["olefile"]
|
||||
onenote = ["one-extract"]
|
||||
audio-transcription = ["pydub", "SpeechRecognition"]
|
||||
youtube-transcription = ["youtube-transcript-api"]
|
||||
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
|
||||
|
||||
@@ -30,6 +30,7 @@ from .converters import (
|
||||
BingSerpConverter,
|
||||
PdfConverter,
|
||||
DocxConverter,
|
||||
OneNoteConverter,
|
||||
XlsxConverter,
|
||||
XlsConverter,
|
||||
PptxConverter,
|
||||
@@ -158,6 +159,7 @@ class MarkItDown:
|
||||
self.register_converter(YouTubeConverter())
|
||||
self.register_converter(BingSerpConverter())
|
||||
self.register_converter(DocxConverter())
|
||||
self.register_converter(OneNoteConverter())
|
||||
self.register_converter(XlsxConverter())
|
||||
self.register_converter(XlsConverter())
|
||||
self.register_converter(PptxConverter())
|
||||
@@ -455,7 +457,7 @@ class MarkItDown:
|
||||
cur_pos == file_stream.tell()
|
||||
), f"File stream position should NOT change between guess iterations"
|
||||
|
||||
_kwargs = copy.deepcopy(kwargs)
|
||||
_kwargs = {k: v for k, v in kwargs.items()}
|
||||
|
||||
# Copy any additional global options
|
||||
if "llm_client" not in _kwargs and self._llm_client is not None:
|
||||
|
||||
@@ -11,6 +11,7 @@ from ._ipynb_converter import IpynbConverter
|
||||
from ._bing_serp_converter import BingSerpConverter
|
||||
from ._pdf_converter import PdfConverter
|
||||
from ._docx_converter import DocxConverter
|
||||
from ._onenote_converter import OneNoteConverter
|
||||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||
from ._pptx_converter import PptxConverter
|
||||
from ._image_converter import ImageConverter
|
||||
@@ -29,6 +30,7 @@ __all__ = [
|
||||
"BingSerpConverter",
|
||||
"PdfConverter",
|
||||
"DocxConverter",
|
||||
"OneNoteConverter",
|
||||
"XlsxConverter",
|
||||
"XlsConverter",
|
||||
"PptxConverter",
|
||||
|
||||
@@ -0,0 +1,87 @@
|
||||
import sys
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# `one_extract` is an optional install, but this converter cannot work without
# it. A failed import is not raised here; the exception info is stored so that
# it can be reported later, when a conversion is actually attempted.
try:
    import one_extract
except ImportError:
    # Keep the exception and its stack trace for later reporting.
    _dependency_exc_info = sys.exc_info()
else:
    _dependency_exc_info = None


# MIME-type prefixes this converter accepts (currently none).
ACCEPTED_MIME_TYPE_PREFIXES = []

# File extensions this converter accepts.
ACCEPTED_FILE_EXTENSIONS = [".one"]
|
||||
|
||||
|
||||
class OneNoteConverter(DocumentConverter):
    """
    Converts OneNote files to Markdown.

    Every notebook section is emitted under a level-1 heading and every page
    under a level-2 heading; the page content itself is run through an
    ``HtmlConverter``.
    """

    def __init__(self):
        super().__init__()
        # Reused for every page of every notebook handled by this instance.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Decide from the stream metadata whether this converter applies.

        Returns True when the lower-cased extension is listed in
        ACCEPTED_FILE_EXTENSIONS, or when the lower-cased MIME type starts
        with one of ACCEPTED_MIME_TYPE_PREFIXES. ``file_stream`` is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(
            mimetype.startswith(candidate)
            for candidate in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Render the notebook read from ``file_stream`` as Markdown.

        Raises MissingDependencyException, chained to the original import
        error, when ``one_extract`` could not be imported at module load.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            _, import_error, import_traceback = _dependency_exc_info
            message = MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".one",
                feature="onenote",
            )
            raise MissingDependencyException(
                message
            ) from import_error.with_traceback(  # type: ignore[union-attr]
                import_traceback
            )

        # Perform the conversion
        notebook = one_extract.Notebook(file_stream)
        fragments = []
        for section in notebook.sections:
            fragments.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                fragments.append(f"\n\n## {page.name}\n")
                page_result = self._html_converter.convert_string(page.content)
                fragments.append(page_result.markdown.strip() + "\n\n")

        # NOTE(review): converter results are read through `.markdown` above;
        # confirm that DocumentConverterResult still accepts `text_content=`
        # as a constructor keyword.
        return DocumentConverterResult(
            title=None,
            text_content="".join(fragments).strip(),
        )
|
||||
BIN
packages/markitdown/tests/test_files/test.one
vendored
Executable file
BIN
packages/markitdown/tests/test_files/test.one
vendored
Executable file
Binary file not shown.
Reference in New Issue
Block a user