5 Commits

Author SHA1 Message Date
Adam Fourney
da73d64bfa Initial work to port #55 to MarkItDown 0.1.X 2025-03-06 13:17:58 -08:00
afourney
82d84e3edd Fixed formatting. (#1098) 2025-03-05 23:30:29 -08:00
scalabreseGD
36c4bc9ec3 Fixed deepcopy failure when passing llm_client (#1089)
Co-authored-by: afourney <adamfo@microsoft.com>
2025-03-05 23:25:37 -08:00
Andrea Pietrobon
80baa5db18 fix(README): correct pip install command formatting (#1090)
Added missing quotes around `markitdown[all]` in the installation command  
to ensure proper package resolution by pip.
2025-03-05 23:21:10 -08:00
Adam Fourney
00a65e8f8b Fixed version in README. 2025-03-05 23:10:21 -08:00
6 changed files with 100 additions and 5 deletions

View File

@@ -5,8 +5,8 @@
[![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
> [!IMPORTANT]
> Breaking changes between 0.0.1 to 0.0.2:
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install markitdown[all]` to have backward-compatible behavior.
> Breaking changes from 0.0.1 to 0.1.0:
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]~=0.1.0a1'` to have backward-compatible behavior.
> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
@@ -17,6 +17,7 @@ At present, MarkItDown supports:
- PowerPoint
- Word
- Excel
- OneNote
- Images (EXIF metadata and OCR)
- Audio (EXIF metadata and speech transcription)
- HTML
@@ -36,7 +37,7 @@ are also highly token-efficient.
## Installation
To install MarkItDown, use pip: `pip install markitdown[all]`. Alternatively, you can install it from the source:
To install MarkItDown, use pip: `pip install 'markitdown[all]~=0.1.0a1'`. Alternatively, you can install it from the source:
```bash
git clone git@github.com:microsoft/markitdown.git
@@ -82,6 +83,7 @@ At the moment, the following optional dependencies are available:
* `[xls]` Installs dependencies for older Excel files
* `[pdf]` Installs dependencies for PDF files
* `[outlook]` Installs dependencies for Outlook messages
* `[onenote]` Installs dependencies for OneNote .one files
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription

View File

@@ -45,7 +45,8 @@ all = [
"SpeechRecognition",
"youtube-transcript-api",
"azure-ai-documentintelligence",
"azure-identity"
"azure-identity",
"one-extract",
]
pptx = ["python-pptx"]
docx = ["mammoth"]
@@ -53,6 +54,7 @@ xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
outlook = ["olefile"]
onenote = ["one-extract"]
audio-transcription = ["pydub", "SpeechRecognition"]
youtube-transcription = ["youtube-transcript-api"]
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]

View File

@@ -30,6 +30,7 @@ from .converters import (
BingSerpConverter,
PdfConverter,
DocxConverter,
OneNoteConverter,
XlsxConverter,
XlsConverter,
PptxConverter,
@@ -158,6 +159,7 @@ class MarkItDown:
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(OneNoteConverter())
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
@@ -455,7 +457,7 @@ class MarkItDown:
cur_pos == file_stream.tell()
), f"File stream position should NOT change between guess iterations"
_kwargs = copy.deepcopy(kwargs)
_kwargs = {k: v for k, v in kwargs.items()}
# Copy any additional global options
if "llm_client" not in _kwargs and self._llm_client is not None:

View File

@@ -11,6 +11,7 @@ from ._ipynb_converter import IpynbConverter
from ._bing_serp_converter import BingSerpConverter
from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter
from ._onenote_converter import OneNoteConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter
@@ -29,6 +30,7 @@ __all__ = [
"BingSerpConverter",
"PdfConverter",
"DocxConverter",
"OneNoteConverter",
"XlsxConverter",
"XlsConverter",
"PptxConverter",

View File

@@ -0,0 +1,87 @@
import sys
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies.
# A failed import is not raised here: the exception info is stored so that
# importing this module always succeeds, and convert() reports the missing
# dependency only when a OneNote conversion is actually attempted.
_dependency_exc_info = None
try:
    import one_extract
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()

# MIME type prefixes accepted by the converter. None are registered, so
# acceptance is decided by file extension alone.
ACCEPTED_MIME_TYPE_PREFIXES = []

# File extensions accepted by the converter (compared against the
# lower-cased extension of the incoming stream).
ACCEPTED_FILE_EXTENSIONS = [".one"]
class OneNoteConverter(DocumentConverter):
    """
    Converts OneNote files to Markdown.

    Every notebook section is emitted as a level-1 heading and every page as
    a level-2 heading, followed by the page content after it has been passed
    through the HTML converter.
    """

    def __init__(self):
        super().__init__()
        # Page content is delegated to the HTML converter to obtain Markdown.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream looks like a OneNote file.

        The decision is based on the (case-insensitive) file extension first,
        then on any registered MIME type prefix. The stream itself is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a OneNote stream to Markdown.

        Raises:
            MissingDependencyException: if the optional `one_extract`
                dependency could not be imported; the original ImportError
                (with its traceback) is chained as the cause.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".one",
                    feature="onenote",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # Perform the conversion.
        # Fragments are collected in a list and joined once at the end rather
        # than growing a string with += inside the nested loops.
        parts = []
        notebook = one_extract.Notebook(file_stream)
        for section in notebook.sections:
            parts.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                parts.append(f"\n\n## {page.name}\n")
                page_md = self._html_converter.convert_string(page.content).markdown
                parts.append(page_md.strip() + "\n\n")

        # NOTE(review): converter results are read via `.markdown` above but
        # this result is constructed with `text_content=` -- confirm that the
        # DocumentConverterResult constructor accepts this keyword.
        return DocumentConverterResult(
            title=None,
            text_content="".join(parts).strip(),
        )

BIN
packages/markitdown/tests/test_files/test.one vendored Executable file

Binary file not shown.