4 Commits

Author SHA1 Message Date
Adam Fourney
f17bc21c9d If files use zip packaging, be smarter about inspecting their types. 2025-03-07 23:06:56 -08:00
afourney
99d8e562db Fix exiftool in well-known paths. (#1106) 2025-03-07 21:47:20 -08:00
Sebastian Yaghoubi
515fa854bf feat(docker): improve dockerfile build (#220)
* refactor(docker): remove unnecessary root user

The USER root directive isn't needed directly after FROM

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): use generic nobody nogroup default instead of uid gid

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): build app from source locally instead of installing package

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): use correct files in dockerignore

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* chore(docker): dont install recommended packages with git

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): run apt as non-interactive

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* Update Dockerfile to new package structure, and fix streaming bugs.

---------

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>
Co-authored-by: afourney <adamfo@microsoft.com>
2025-03-07 20:07:40 -08:00
Richard Ye
0229ff6cb7 feat: sort pptx shapes to be parsed in top-to-bottom, left-to-right order (#1104)
* Sort PPTX shapes to be read in top-to-bottom, left-to-right order

Referenced from 39bef65b31/pptx2md/parser.py (L249)

* Update README.md
* Fixed formatting.
* Added missing import
2025-03-07 15:45:14 -08:00
14 changed files with 157 additions and 143 deletions

View File

@@ -1 +1,2 @@
* *
!packages/

View File

@@ -1,22 +1,32 @@
FROM python:3.13-slim-bullseye FROM python:3.13-slim-bullseye
USER root ENV DEBIAN_FRONTEND=noninteractive
ENV EXIFTOOL_PATH=/usr/bin/exiftool
ARG INSTALL_GIT=false ENV FFMPEG_PATH=/usr/bin/ffmpeg
RUN if [ "$INSTALL_GIT" = "true" ]; then \
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
fi
# Runtime dependency # Runtime dependency
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \ ffmpeg \
&& rm -rf /var/lib/apt/lists/* exiftool
RUN pip install markitdown ARG INSTALL_GIT=false
RUN if [ "$INSTALL_GIT" = "true" ]; then \
apt-get install -y --no-install-recommends \
git; \
fi
# Cleanup
RUN rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY . /app
RUN pip --no-cache-dir install \
/app/packages/markitdown[all] \
/app/packages/markitdown-sample-plugin
# Default USERID and GROUPID # Default USERID and GROUPID
ARG USERID=10000 ARG USERID=nobody
ARG GROUPID=10000 ARG GROUPID=nogroup
USER $USERID:$GROUPID USER $USERID:$GROUPID

View File

@@ -14,10 +14,9 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
At present, MarkItDown supports: At present, MarkItDown supports:
- PDF - PDF
- PowerPoint - PowerPoint (reading in top-to-bottom, left-to-right order)
- Word - Word
- Excel - Excel
- OneNote
- Images (EXIF metadata and OCR) - Images (EXIF metadata and OCR)
- Audio (EXIF metadata and speech transcription) - Audio (EXIF metadata and speech transcription)
- HTML - HTML
@@ -83,7 +82,6 @@ At the moment, the following optional dependencies are available:
* `[xls]` Installs dependencies for older Excel files * `[xls]` Installs dependencies for older Excel files
* `[pdf]` Installs dependencies for PDF files * `[pdf]` Installs dependencies for PDF files
* `[outlook]` Installs dependencies for Outlook messages * `[outlook]` Installs dependencies for Outlook messages
* `[onenote]` Installs dependencies for OneNote .one files
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence * `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files * `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription * `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription

View File

@@ -45,8 +45,7 @@ all = [
"SpeechRecognition", "SpeechRecognition",
"youtube-transcript-api", "youtube-transcript-api",
"azure-ai-documentintelligence", "azure-ai-documentintelligence",
"azure-identity", "azure-identity"
"one-extract",
] ]
pptx = ["python-pptx"] pptx = ["python-pptx"]
docx = ["mammoth"] docx = ["mammoth"]
@@ -54,7 +53,6 @@ xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"] xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"] pdf = ["pdfminer.six"]
outlook = ["olefile"] outlook = ["olefile"]
onenote = ["one-extract"]
audio-transcription = ["pydub", "SpeechRecognition"] audio-transcription = ["pydub", "SpeechRecognition"]
youtube-transcription = ["youtube-transcript-api"] youtube-transcription = ["youtube-transcript-api"]
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"] az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.1.0a1" __version__ = "0.1.0a2"

View File

@@ -3,6 +3,7 @@ import mimetypes
import os import os
import re import re
import sys import sys
import shutil
import tempfile import tempfile
import warnings import warnings
import traceback import traceback
@@ -30,7 +31,6 @@ from .converters import (
BingSerpConverter, BingSerpConverter,
PdfConverter, PdfConverter,
DocxConverter, DocxConverter,
OneNoteConverter,
XlsxConverter, XlsxConverter,
XlsConverter, XlsConverter,
PptxConverter, PptxConverter,
@@ -139,9 +139,30 @@ class MarkItDown:
self._llm_model = kwargs.get("llm_model") self._llm_model = kwargs.get("llm_model")
self._exiftool_path = kwargs.get("exiftool_path") self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map") self._style_map = kwargs.get("style_map")
if self._exiftool_path is None: if self._exiftool_path is None:
self._exiftool_path = os.getenv("EXIFTOOL_PATH") self._exiftool_path = os.getenv("EXIFTOOL_PATH")
# Still none? Check well-known paths
if self._exiftool_path is None:
    candidate = shutil.which("exiftool")
    if candidate:
        candidate = os.path.abspath(candidate)
        # An auto-discovered binary is only accepted when it sits directly
        # inside one of these directories; a hit anywhere else on PATH is
        # ignored and self._exiftool_path stays None.
        # NOTE: every entry needs its own trailing comma. Adjacent string
        # literals are silently concatenated, which previously merged the
        # Homebrew and System32 entries into a single path that never matched.
        well_known_dirs = (
            "/usr/bin",
            "/usr/local/bin",
            "/opt",
            "/opt/bin",
            "/opt/local/bin",
            "/opt/homebrew/bin",
            "C:\\Windows\\System32",
            "C:\\Program Files",
            "C:\\Program Files (x86)",
        )
        if os.path.dirname(candidate) in well_known_dirs:
            self._exiftool_path = candidate
# Register converters for successful browsing operations # Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations # Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters # To this end, the most specific converters should appear below the most generic converters
@@ -159,7 +180,6 @@ class MarkItDown:
self.register_converter(YouTubeConverter()) self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter()) self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter()) self.register_converter(DocxConverter())
self.register_converter(OneNoteConverter())
self.register_converter(XlsxConverter()) self.register_converter(XlsxConverter())
self.register_converter(XlsConverter()) self.register_converter(XlsConverter())
self.register_converter(PptxConverter()) self.register_converter(PptxConverter())
@@ -329,6 +349,17 @@ class MarkItDown:
elif base_guess.extension is not None: elif base_guess.extension is not None:
placeholder_filename = "placeholder" + base_guess.extension placeholder_filename = "placeholder" + base_guess.extension
# Check if we have a seekable stream. If not, load the entire stream into memory.
if not stream.seekable():
buffer = io.BytesIO()
while True:
chunk = stream.read(4096)
if not chunk:
break
buffer.write(chunk)
buffer.seek(0)
stream = buffer
# Add guesses based on stream content # Add guesses based on stream content
for guess in _guess_stream_info_from_stream( for guess in _guess_stream_info_from_stream(
file_stream=stream, filename_hint=placeholder_filename file_stream=stream, filename_hint=placeholder_filename

View File

@@ -1,8 +1,9 @@
import puremagic import puremagic
import mimetypes import mimetypes
import zipfile
import os import os
from dataclasses import dataclass, asdict from dataclasses import dataclass, asdict
from typing import Optional, BinaryIO, List, TypeVar, Type from typing import Optional, BinaryIO, List, Union
# Mimetype substitutions table # Mimetype substitutions table
MIMETYPE_SUBSTITUTIONS = { MIMETYPE_SUBSTITUTIONS = {
@@ -74,6 +75,20 @@ def _guess_stream_info_from_stream(
) )
) )
# If it looks like a zip use _guess_stream_info_from_zip rather than puremagic
cur_pos = file_stream.tell()
try:
header = file_stream.read(4)
file_stream.seek(cur_pos)
if header == b"PK\x03\x04":
zip_guess = _guess_stream_info_from_zip(file_stream)
if zip_guess:
guesses.append(zip_guess)
return guesses
finally:
file_stream.seek(cur_pos)
# Fall back to using puremagic
def _puremagic( def _puremagic(
file_stream, filename_hint file_stream, filename_hint
) -> List[puremagic.main.PureMagicWithConfidence]: ) -> List[puremagic.main.PureMagicWithConfidence]:
@@ -120,3 +135,74 @@ def _guess_stream_info_from_stream(
guesses.append(StreamInfo(**kwargs)) guesses.append(StreamInfo(**kwargs))
return guesses return guesses
def _guess_stream_info_from_zip(file_stream: BinaryIO) -> Union[None, StreamInfo]:
    """
    Guess StreamInfo properties (mostly mimetype and extension) from a zip stream.

    The archive's table of contents is checked for marker entries that identify
    common zip-packaged formats (Office Open XML, Visio, XPS, EPUB-style
    containers with a `mimetype` entry, JAR). Anything else is reported as a
    plain zip.

    Args:
    - file_stream: A seekable binary stream containing the zip data. The
      stream position is restored before returning.

    Returns the single best guess, or None if the stream cannot be opened as a
    zip archive.
    """
    # (marker entry, mimetype, extension) for Open Packaging Conventions
    # packages, checked in this order.
    opc_formats = (
        (
            "word/document.xml",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".docx",
        ),
        (
            "xl/workbook.xml",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ".xlsx",
        ),
        (
            "ppt/presentation.xml",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ".pptx",
        ),
        # Zip-packaged Visio drawings are .vsdx; .vsd is the legacy binary format.
        ("visio/document.xml", "application/vnd.ms-visio.drawing", ".vsdx"),
        ("FixedDocSeq.fdseq", "application/vnd.ms-xpsdocument", ".xps"),
    )

    cur_pos = file_stream.tell()
    try:
        with zipfile.ZipFile(file_stream) as z:
            # A set makes the repeated membership checks below O(1).
            table_of_contents = set(z.namelist())

            # Open Packaging Conventions (OPC) package
            if "[Content_Types].xml" in table_of_contents:
                for marker, mimetype, extension in opc_formats:
                    if marker in table_of_contents:
                        return StreamInfo(mimetype=mimetype, extension=extension)

            # EPUB, or similar: the container declares its own type in a
            # top-level "mimetype" entry.
            if "mimetype" in table_of_contents:
                try:
                    _mimetype = z.read("mimetype").decode("ascii").strip()
                except UnicodeDecodeError:
                    # Not a usable declaration; keep looking at other markers.
                    _mimetype = ""
                if _mimetype:
                    _extension = mimetypes.guess_extension(_mimetype)
                    return StreamInfo(mimetype=_mimetype, extension=_extension)

            # JAR
            if "META-INF/MANIFEST.MF" in table_of_contents:
                return StreamInfo(mimetype="application/java-archive", extension=".jar")

            # If we made it this far, we couldn't identify the file
            return StreamInfo(mimetype="application/zip", extension=".zip")

    except zipfile.BadZipFile:
        return None
    finally:
        # Always leave the stream where the caller had it.
        file_stream.seek(cur_pos)

View File

@@ -11,7 +11,6 @@ from ._ipynb_converter import IpynbConverter
from ._bing_serp_converter import BingSerpConverter from ._bing_serp_converter import BingSerpConverter
from ._pdf_converter import PdfConverter from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter from ._docx_converter import DocxConverter
from ._onenote_converter import OneNoteConverter
from ._xlsx_converter import XlsxConverter, XlsConverter from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter from ._image_converter import ImageConverter
@@ -30,7 +29,6 @@ __all__ = [
"BingSerpConverter", "BingSerpConverter",
"PdfConverter", "PdfConverter",
"DocxConverter", "DocxConverter",
"OneNoteConverter",
"XlsxConverter", "XlsxConverter",
"XlsConverter", "XlsConverter",
"PptxConverter", "PptxConverter",

View File

@@ -5,26 +5,16 @@ import sys
import shutil import shutil
import os import os
import warnings import warnings
from typing import BinaryIO, Optional, Any from typing import BinaryIO, Any, Union
def exiftool_metadata( def exiftool_metadata(
file_stream: BinaryIO, *, exiftool_path: Optional[str] = None file_stream: BinaryIO,
*,
exiftool_path: Union[str, None],
) -> Any: # Need a better type for json data ) -> Any: # Need a better type for json data
# Check if we have a valid pointer to exiftool # Nothing to do
if not exiftool_path: if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warnings.warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)
# Nothing to do
return {} return {}
# Run exiftool # Run exiftool

View File

@@ -1,87 +0,0 @@
import sys
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import one_extract
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
ACCEPTED_MIME_TYPE_PREFIXES = []
ACCEPTED_FILE_EXTENSIONS = [".one"]
class OneNoteConverter(DocumentConverter):
    """
    Converts OneNote files to Markdown.

    Each section becomes a level-1 heading and each page a level-2 heading,
    followed by the page content rendered through the HTML converter.
    """

    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the extension or mimetype of `stream_info` is accepted."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        return extension in ACCEPTED_FILE_EXTENSIONS or any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert the notebook in `file_stream` to Markdown.

        Raises MissingDependencyException (chained to the original import
        error) when the optional `one_extract` dependency failed to import.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".one",
                    feature="onenote",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # Perform the conversion, collecting fragments in document order
        fragments = []
        notebook = one_extract.Notebook(file_stream)
        for section in notebook.sections:
            fragments.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                fragments.append(f"\n\n## {page.name}\n")
                page_markdown = self._html_converter.convert_string(
                    page.content
                ).markdown.strip()
                fragments.append(page_markdown + "\n\n")

        md_content = "".join(fragments)
        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

View File

@@ -7,6 +7,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later # Save reporting of any exceptions for later
_dependency_exc_info = None _dependency_exc_info = None
olefile = None
try: try:
import olefile import olefile
except ImportError: except ImportError:
@@ -48,7 +49,7 @@ class OutlookMsgConverter(DocumentConverter):
# Brute force, check if we have an OLE file # Brute force, check if we have an OLE file
cur_pos = file_stream.tell() cur_pos = file_stream.tell()
try: try:
if not olefile.isOleFile(file_stream): if olefile and not olefile.isOleFile(file_stream):
return False return False
finally: finally:
file_stream.seek(cur_pos) file_stream.seek(cur_pos)

View File

@@ -6,6 +6,7 @@ import re
import html import html
from typing import BinaryIO, Any from typing import BinaryIO, Any
from operator import attrgetter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ._llm_caption import llm_caption from ._llm_caption import llm_caption
@@ -160,10 +161,12 @@ class PptxConverter(DocumentConverter):
# Group Shapes # Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
for subshape in shape.shapes: sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
for subshape in sorted_shapes:
get_shape_content(subshape, **kwargs) get_shape_content(subshape, **kwargs)
for shape in slide.shapes: sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left"))
for shape in sorted_shapes:
get_shape_content(shape, **kwargs) get_shape_content(shape, **kwargs)
md_content = md_content.strip() md_content = md_content.strip()

Binary file not shown.

View File

@@ -7,8 +7,6 @@ import openai
import pytest import pytest
import requests import requests
import warnings
from markitdown import ( from markitdown import (
MarkItDown, MarkItDown,
UnsupportedFormatException, UnsupportedFormatException,
@@ -517,19 +515,6 @@ def test_exceptions() -> None:
reason="do not run if exiftool is not installed", reason="do not run if exiftool is not installed",
) )
def test_markitdown_exiftool() -> None: def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
warnings.simplefilter("default")
with warnings.catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
warnings.resetwarnings()
which_exiftool = shutil.which("exiftool") which_exiftool = shutil.which("exiftool")
assert which_exiftool is not None assert which_exiftool is not None