4 Commits

Author SHA1 Message Date
Adam Fourney
f17bc21c9d If files use zip packaging, be smarter about inspecting their types. 2025-03-07 23:06:56 -08:00
afourney
99d8e562db Fix exiftool in well-known paths. (#1106) 2025-03-07 21:47:20 -08:00
Sebastian Yaghoubi
515fa854bf feat(docker): improve dockerfile build (#220)
* refactor(docker): remove unnecessary root user

The USER root directive isn't needed directly after FROM

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): use generic nobody nogroup default instead of uid gid

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): build app from source locally instead of installing package

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): use correct files in dockerignore

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* chore(docker): dont install recommended packages with git

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): run apt as non-interactive

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* Update Dockerfile to new package structure, and fix streaming bugs.

---------

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>
Co-authored-by: afourney <adamfo@microsoft.com>
2025-03-07 20:07:40 -08:00
Richard Ye
0229ff6cb7 feat: sort pptx shapes to be parsed in top-to-bottom, left-to-right order (#1104)
* Sort PPTX shapes to be read in top-to-bottom, left-to-right order

Referenced from 39bef65b31/pptx2md/parser.py (L249)

* Update README.md
* Fixed formatting.
* Added missing import
2025-03-07 15:45:14 -08:00
10 changed files with 156 additions and 47 deletions

View File

@@ -1 +1,2 @@
* *
!packages/

View File

@@ -1,22 +1,32 @@
FROM python:3.13-slim-bullseye FROM python:3.13-slim-bullseye
USER root ENV DEBIAN_FRONTEND=noninteractive
ENV EXIFTOOL_PATH=/usr/bin/exiftool
ARG INSTALL_GIT=false ENV FFMPEG_PATH=/usr/bin/ffmpeg
RUN if [ "$INSTALL_GIT" = "true" ]; then \
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
fi
# Runtime dependency # Runtime dependency
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \ ffmpeg \
&& rm -rf /var/lib/apt/lists/* exiftool
RUN pip install markitdown ARG INSTALL_GIT=false
RUN if [ "$INSTALL_GIT" = "true" ]; then \
apt-get install -y --no-install-recommends \
git; \
fi
# Cleanup
RUN rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY . /app
RUN pip --no-cache-dir install \
/app/packages/markitdown[all] \
/app/packages/markitdown-sample-plugin
# Default USERID and GROUPID # Default USERID and GROUPID
ARG USERID=10000 ARG USERID=nobody
ARG GROUPID=10000 ARG GROUPID=nogroup
USER $USERID:$GROUPID USER $USERID:$GROUPID

View File

@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
At present, MarkItDown supports: At present, MarkItDown supports:
- PDF - PDF
- PowerPoint - PowerPoint (reading in top-to-bottom, left-to-right order)
- Word - Word
- Excel - Excel
- Images (EXIF metadata and OCR) - Images (EXIF metadata and OCR)

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.1.0a1" __version__ = "0.1.0a2"

View File

@@ -3,6 +3,7 @@ import mimetypes
import os import os
import re import re
import sys import sys
import shutil
import tempfile import tempfile
import warnings import warnings
import traceback import traceback
@@ -138,9 +139,30 @@ class MarkItDown:
self._llm_model = kwargs.get("llm_model") self._llm_model = kwargs.get("llm_model")
self._exiftool_path = kwargs.get("exiftool_path") self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map") self._style_map = kwargs.get("style_map")
if self._exiftool_path is None: if self._exiftool_path is None:
self._exiftool_path = os.getenv("EXIFTOOL_PATH") self._exiftool_path = os.getenv("EXIFTOOL_PATH")
# Still none? Check well-known paths
if self._exiftool_path is None:
    candidate = shutil.which("exiftool")
    if candidate:
        candidate = os.path.abspath(candidate)
        candidate_dir = os.path.dirname(candidate)
        # Only accept an exiftool found on PATH when it sits directly inside
        # one of these well-known directories (exact directory match).
        # Every entry needs its own trailing comma: adjacent string literals
        # are silently concatenated into a single (bogus) path otherwise.
        if any(
            d == candidate_dir
            for d in [
                "/usr/bin",
                "/usr/local/bin",
                "/opt",
                "/opt/bin",
                "/opt/local/bin",
                "/opt/homebrew/bin",
                "C:\\Windows\\System32",
                "C:\\Program Files",
                "C:\\Program Files (x86)",
            ]
        ):
            self._exiftool_path = candidate
# Register converters for successful browsing operations # Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations # Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters # To this end, the most specific converters should appear below the most generic converters
@@ -327,6 +349,17 @@ class MarkItDown:
elif base_guess.extension is not None: elif base_guess.extension is not None:
placeholder_filename = "placeholder" + base_guess.extension placeholder_filename = "placeholder" + base_guess.extension
# Check if we have a seekable stream. If not, load the entire stream into memory.
if not stream.seekable():
buffer = io.BytesIO()
while True:
chunk = stream.read(4096)
if not chunk:
break
buffer.write(chunk)
buffer.seek(0)
stream = buffer
# Add guesses based on stream content # Add guesses based on stream content
for guess in _guess_stream_info_from_stream( for guess in _guess_stream_info_from_stream(
file_stream=stream, filename_hint=placeholder_filename file_stream=stream, filename_hint=placeholder_filename

View File

@@ -1,8 +1,9 @@
import puremagic import puremagic
import mimetypes import mimetypes
import zipfile
import os import os
from dataclasses import dataclass, asdict from dataclasses import dataclass, asdict
from typing import Optional, BinaryIO, List, TypeVar, Type from typing import Optional, BinaryIO, List, Union
# Mimetype substitutions table # Mimetype substitutions table
MIMETYPE_SUBSTITUTIONS = { MIMETYPE_SUBSTITUTIONS = {
@@ -74,6 +75,20 @@ def _guess_stream_info_from_stream(
) )
) )
# If it looks like a zip use _guess_stream_info_from_zip rather than puremagic
cur_pos = file_stream.tell()
try:
header = file_stream.read(4)
file_stream.seek(cur_pos)
if header == b"PK\x03\x04":
zip_guess = _guess_stream_info_from_zip(file_stream)
if zip_guess:
guesses.append(zip_guess)
return guesses
finally:
file_stream.seek(cur_pos)
# Fall back to using puremagic
def _puremagic( def _puremagic(
file_stream, filename_hint file_stream, filename_hint
) -> List[puremagic.main.PureMagicWithConfidence]: ) -> List[puremagic.main.PureMagicWithConfidence]:
@@ -120,3 +135,74 @@ def _guess_stream_info_from_stream(
guesses.append(StreamInfo(**kwargs)) guesses.append(StreamInfo(**kwargs))
return guesses return guesses
def _guess_stream_info_from_zip(file_stream: BinaryIO) -> Union[None, StreamInfo]:
    """
    Guess StreamInfo properties (mostly mimetype and extension) from a zip stream.

    Args:
    - file_stream: The stream to guess the StreamInfo from. It must support
      tell() and seek(); its position is restored before returning.

    Returns the single best guess, or None if the stream could not be opened
    as a zip archive (zipfile.BadZipFile).
    """
    cur_pos = file_stream.tell()
    try:
        with zipfile.ZipFile(file_stream) as z:
            # Set for cheap repeated membership checks on the member names
            table_of_contents = set(z.namelist())

            # OpenPackageFormat (OPF) file
            if "[Content_Types].xml" in table_of_contents:
                # Word file
                if "word/document.xml" in table_of_contents:
                    return StreamInfo(
                        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                        extension=".docx",
                    )

                # Excel file
                if "xl/workbook.xml" in table_of_contents:
                    return StreamInfo(
                        mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        extension=".xlsx",
                    )

                # PowerPoint file
                if "ppt/presentation.xml" in table_of_contents:
                    return StreamInfo(
                        mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
                        extension=".pptx",
                    )

                # Visio file (zip-packaged Visio drawings are .vsdx; .vsd is
                # the legacy binary format and is never a zip archive)
                if "visio/document.xml" in table_of_contents:
                    return StreamInfo(
                        mimetype="application/vnd.ms-visio.drawing",
                        extension=".vsdx",
                    )

                # XPS file
                if "FixedDocSeq.fdseq" in table_of_contents:
                    return StreamInfo(
                        mimetype="application/vnd.ms-xpsdocument",
                        extension=".xps",
                    )

            # EPUB, or similar
            if "mimetype" in table_of_contents:
                try:
                    _mimetype = z.read("mimetype").decode("ascii").strip()
                except UnicodeDecodeError:
                    # Not a usable mimetype declaration; keep guessing below
                    _mimetype = ""
                if _mimetype:
                    _extension = mimetypes.guess_extension(_mimetype)
                    return StreamInfo(mimetype=_mimetype, extension=_extension)

            # JAR
            if "META-INF/MANIFEST.MF" in table_of_contents:
                return StreamInfo(mimetype="application/java-archive", extension=".jar")

            # If we made it this far, we couldn't identify the file
            return StreamInfo(mimetype="application/zip", extension=".zip")

    except zipfile.BadZipFile:
        return None
    finally:
        # Always leave the stream where we found it
        file_stream.seek(cur_pos)

View File

@@ -5,26 +5,16 @@ import sys
import shutil import shutil
import os import os
import warnings import warnings
from typing import BinaryIO, Optional, Any from typing import BinaryIO, Any, Union
def exiftool_metadata( def exiftool_metadata(
file_stream: BinaryIO, *, exiftool_path: Optional[str] = None file_stream: BinaryIO,
*,
exiftool_path: Union[str, None],
) -> Any: # Need a better type for json data ) -> Any: # Need a better type for json data
# Check if we have a valid pointer to exiftool # Nothing to do
if not exiftool_path: if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warnings.warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)
# Nothing to do
return {} return {}
# Run exiftool # Run exiftool

View File

@@ -7,6 +7,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later # Save reporting of any exceptions for later
_dependency_exc_info = None _dependency_exc_info = None
olefile = None
try: try:
import olefile import olefile
except ImportError: except ImportError:
@@ -48,7 +49,7 @@ class OutlookMsgConverter(DocumentConverter):
# Brute force, check if we have an OLE file # Brute force, check if we have an OLE file
cur_pos = file_stream.tell() cur_pos = file_stream.tell()
try: try:
if not olefile.isOleFile(file_stream): if olefile and not olefile.isOleFile(file_stream):
return False return False
finally: finally:
file_stream.seek(cur_pos) file_stream.seek(cur_pos)

View File

@@ -6,6 +6,7 @@ import re
import html import html
from typing import BinaryIO, Any from typing import BinaryIO, Any
from operator import attrgetter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ._llm_caption import llm_caption from ._llm_caption import llm_caption
@@ -160,10 +161,12 @@ class PptxConverter(DocumentConverter):
# Group Shapes # Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
for subshape in shape.shapes: sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
for subshape in sorted_shapes:
get_shape_content(subshape, **kwargs) get_shape_content(subshape, **kwargs)
for shape in slide.shapes: sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left"))
for shape in sorted_shapes:
get_shape_content(shape, **kwargs) get_shape_content(shape, **kwargs)
md_content = md_content.strip() md_content = md_content.strip()

View File

@@ -7,8 +7,6 @@ import openai
import pytest import pytest
import requests import requests
import warnings
from markitdown import ( from markitdown import (
MarkItDown, MarkItDown,
UnsupportedFormatException, UnsupportedFormatException,
@@ -517,19 +515,6 @@ def test_exceptions() -> None:
reason="do not run if exiftool is not installed", reason="do not run if exiftool is not installed",
) )
def test_markitdown_exiftool() -> None: def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
warnings.simplefilter("default")
with warnings.catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
warnings.resetwarnings()
which_exiftool = shutil.which("exiftool") which_exiftool = shutil.which("exiftool")
assert which_exiftool is not None assert which_exiftool is not None