8 Commits

Author SHA1 Message Date
Adam Fourney
f17bc21c9d If files use zip packaging, be smarter about inspecting their types. 2025-03-07 23:06:56 -08:00
afourney
99d8e562db Fix exiftool in well-known paths. (#1106) 2025-03-07 21:47:20 -08:00
Sebastian Yaghoubi
515fa854bf feat(docker): improve dockerfile build (#220)
* refactor(docker): remove unnecessary root user

The USER root directive isn't needed directly after FROM

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): use generic nobody nogroup default instead of uid gid

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): build app from source locally instead of installing package

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): use correct files in dockerignore

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* chore(docker): dont install recommended packages with git

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): run apt as non-interactive

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* Update Dockerfile to new package structure, and fix streaming bugs.

---------

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>
Co-authored-by: afourney <adamfo@microsoft.com>
2025-03-07 20:07:40 -08:00
Richard Ye
0229ff6cb7 feat: sort pptx shapes to be parsed in top-to-bottom, left-to-right order (#1104)
* Sort PPTX shapes to be read in top-to-bottom, left-to-right order

Referenced from 39bef65b31/pptx2md/parser.py (L249)

* Update README.md
* Fixed formatting.
* Added missing import
2025-03-07 15:45:14 -08:00
afourney
82d84e3edd Fixed formatting. (#1098) 2025-03-05 23:30:29 -08:00
scalabreseGD
36c4bc9ec3 Fixed deepcopy failure when passing llm_client (#1089)
Co-authored-by: afourney <adamfo@microsoft.com>
2025-03-05 23:25:37 -08:00
Andrea Pietrobon
80baa5db18 fix(README): correct pip install command formatting (#1090)
Added missing quotes around `markitdown[all]` in the installation command  
to ensure proper package resolution by pip.
2025-03-05 23:21:10 -08:00
Adam Fourney
00a65e8f8b Fixed version in README. 2025-03-05 23:10:21 -08:00
10 changed files with 160 additions and 51 deletions

View File

@@ -1 +1,2 @@
*
*
!packages/

View File

@@ -1,22 +1,32 @@
FROM python:3.13-slim-bullseye
USER root
ARG INSTALL_GIT=false
RUN if [ "$INSTALL_GIT" = "true" ]; then \
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
fi
ENV DEBIAN_FRONTEND=noninteractive
ENV EXIFTOOL_PATH=/usr/bin/exiftool
ENV FFMPEG_PATH=/usr/bin/ffmpeg
# Runtime dependency
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*
exiftool
RUN pip install markitdown
ARG INSTALL_GIT=false
RUN if [ "$INSTALL_GIT" = "true" ]; then \
apt-get install -y --no-install-recommends \
git; \
fi
# Cleanup
RUN rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY . /app
RUN pip --no-cache-dir install \
/app/packages/markitdown[all] \
/app/packages/markitdown-sample-plugin
# Default USERID and GROUPID
ARG USERID=10000
ARG GROUPID=10000
ARG USERID=nobody
ARG GROUPID=nogroup
USER $USERID:$GROUPID

View File

@@ -5,8 +5,8 @@
[![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
> [!IMPORTANT]
> Breaking changes between 0.0.1 to 0.0.2:
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install markitdown[all]` to have backward-compatible behavior.
> Breaking changes between 0.0.1 and 0.1.0:
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]~=0.1.0a1'` to have backward-compatible behavior.
> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. In this respect, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
At present, MarkItDown supports:
- PDF
- PowerPoint
- PowerPoint (reading in top-to-bottom, left-to-right order)
- Word
- Excel
- Images (EXIF metadata and OCR)
@@ -36,7 +36,7 @@ are also highly token-efficient.
## Installation
To install MarkItDown, use pip: `pip install markitdown[all]`. Alternatively, you can install it from the source:
To install MarkItDown, use pip: `pip install 'markitdown[all]~=0.1.0a1'`. Alternatively, you can install it from the source:
```bash
git clone git@github.com:microsoft/markitdown.git

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.0a1"
__version__ = "0.1.0a2"

View File

@@ -3,6 +3,7 @@ import mimetypes
import os
import re
import sys
import shutil
import tempfile
import warnings
import traceback
@@ -138,9 +139,30 @@ class MarkItDown:
self._llm_model = kwargs.get("llm_model")
self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map")
# Resolve the exiftool binary. An explicit `exiftool_path` kwarg wins; otherwise
# fall back to the EXIFTOOL_PATH environment variable.
if self._exiftool_path is None:
    self._exiftool_path = os.getenv("EXIFTOOL_PATH")

    # Still none? Check well-known paths.
    # Only accept an `exiftool` found on PATH when it lives directly in one of
    # these directories; a binary found anywhere else on PATH is not used.
    if self._exiftool_path is None:
        candidate = shutil.which("exiftool")
        if candidate:
            candidate = os.path.abspath(candidate)
            well_known_dirs = (
                "/usr/bin",
                "/usr/local/bin",
                "/opt",
                "/opt/bin",
                "/opt/local/bin",
                # NOTE: these two entries were previously fused into a single
                # string by implicit literal concatenation (missing comma), so
                # neither directory could ever match.
                "/opt/homebrew/bin",
                "C:\\Windows\\System32",
                "C:\\Program Files",
                "C:\\Program Files (x86)",
            )
            if os.path.dirname(candidate) in well_known_dirs:
                self._exiftool_path = candidate
# Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters
@@ -327,6 +349,17 @@ class MarkItDown:
elif base_guess.extension is not None:
placeholder_filename = "placeholder" + base_guess.extension
# Check if we have a seekable stream. If not, load the entire stream into memory.
if not stream.seekable():
buffer = io.BytesIO()
while True:
chunk = stream.read(4096)
if not chunk:
break
buffer.write(chunk)
buffer.seek(0)
stream = buffer
# Add guesses based on stream content
for guess in _guess_stream_info_from_stream(
file_stream=stream, filename_hint=placeholder_filename
@@ -455,7 +488,7 @@ class MarkItDown:
cur_pos == file_stream.tell()
), f"File stream position should NOT change between guess iterations"
_kwargs = copy.deepcopy(kwargs)
_kwargs = {k: v for k, v in kwargs.items()}
# Copy any additional global options
if "llm_client" not in _kwargs and self._llm_client is not None:

View File

@@ -1,8 +1,9 @@
import puremagic
import mimetypes
import zipfile
import os
from dataclasses import dataclass, asdict
from typing import Optional, BinaryIO, List, TypeVar, Type
from typing import Optional, BinaryIO, List, Union
# Mimetype substitutions table
MIMETYPE_SUBSTITUTIONS = {
@@ -74,6 +75,20 @@ def _guess_stream_info_from_stream(
)
)
# If it looks like a zip use _guess_stream_info_from_zip rather than puremagic
cur_pos = file_stream.tell()
try:
header = file_stream.read(4)
file_stream.seek(cur_pos)
if header == b"PK\x03\x04":
zip_guess = _guess_stream_info_from_zip(file_stream)
if zip_guess:
guesses.append(zip_guess)
return guesses
finally:
file_stream.seek(cur_pos)
# Fall back to using puremagic
def _puremagic(
file_stream, filename_hint
) -> List[puremagic.main.PureMagicWithConfidence]:
@@ -120,3 +135,74 @@ def _guess_stream_info_from_stream(
guesses.append(StreamInfo(**kwargs))
return guesses
def _guess_stream_info_from_zip(file_stream: BinaryIO) -> Union[None, StreamInfo]:
    """
    Guess StreamInfo properties (mostly mimetype and extension) from a zip stream.

    The archive's table of contents is checked for marker entries that identify
    common zip-packaged formats (Office Open XML, Visio, XPS, EPUB-style
    `mimetype` entries, JAR). If none match, a generic zip guess is returned.

    Args:
    - file_stream: The stream to guess the StreamInfo from. Its position is
      restored before returning.

    Returns the single best guess, or None if the stream could not be opened as
    a zip archive.
    """
    cur_pos = file_stream.tell()
    try:
        with zipfile.ZipFile(file_stream) as z:
            # A set makes the repeated membership checks below O(1).
            table_of_contents = set(z.namelist())

            # Open Packaging Conventions (OPC) package
            if "[Content_Types].xml" in table_of_contents:
                # Word file
                if "word/document.xml" in table_of_contents:
                    return StreamInfo(
                        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                        extension=".docx",
                    )

                # Excel file
                if "xl/workbook.xml" in table_of_contents:
                    return StreamInfo(
                        mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        extension=".xlsx",
                    )

                # PowerPoint file
                if "ppt/presentation.xml" in table_of_contents:
                    return StreamInfo(
                        mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
                        extension=".pptx",
                    )

                # Visio file. Zip/OPC-packaged Visio drawings are .vsdx;
                # .vsd is the legacy binary format and is never a zip.
                if "visio/document.xml" in table_of_contents:
                    return StreamInfo(
                        mimetype="application/vnd.ms-visio.drawing",
                        extension=".vsdx",
                    )

                # XPS file
                if "FixedDocSeq.fdseq" in table_of_contents:
                    return StreamInfo(
                        mimetype="application/vnd.ms-xpsdocument",
                        extension=".xps",
                    )

            # EPUB, or similar: the archive declares its own type in a
            # top-level "mimetype" entry.
            if "mimetype" in table_of_contents:
                try:
                    _mimetype = z.read("mimetype").decode("ascii").strip()
                except UnicodeDecodeError:
                    # Not a usable declaration; fall through to the other checks.
                    _mimetype = ""
                if _mimetype:
                    _extension = mimetypes.guess_extension(_mimetype)
                    return StreamInfo(mimetype=_mimetype, extension=_extension)

            # JAR
            if "META-INF/MANIFEST.MF" in table_of_contents:
                return StreamInfo(mimetype="application/java-archive", extension=".jar")

            # If we made it this far, we couldn't identify the file
            return StreamInfo(mimetype="application/zip", extension=".zip")

    except zipfile.BadZipFile:
        return None
    finally:
        # Always leave the stream where we found it.
        file_stream.seek(cur_pos)

View File

@@ -5,26 +5,16 @@ import sys
import shutil
import os
import warnings
from typing import BinaryIO, Optional, Any
from typing import BinaryIO, Any, Union
def exiftool_metadata(
file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
file_stream: BinaryIO,
*,
exiftool_path: Union[str, None],
) -> Any: # Need a better type for json data
# Check if we have a valid pointer to exiftool
# Nothing to do
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warnings.warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)
# Nothing to do
return {}
# Run exiftool

View File

@@ -7,6 +7,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
olefile = None
try:
import olefile
except ImportError:
@@ -48,7 +49,7 @@ class OutlookMsgConverter(DocumentConverter):
# Brute force, check if we have an OLE file
cur_pos = file_stream.tell()
try:
if not olefile.isOleFile(file_stream):
if olefile and not olefile.isOleFile(file_stream):
return False
finally:
file_stream.seek(cur_pos)

View File

@@ -6,6 +6,7 @@ import re
import html
from typing import BinaryIO, Any
from operator import attrgetter
from ._html_converter import HtmlConverter
from ._llm_caption import llm_caption
@@ -160,10 +161,12 @@ class PptxConverter(DocumentConverter):
# Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
for subshape in shape.shapes:
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
for subshape in sorted_shapes:
get_shape_content(subshape, **kwargs)
for shape in slide.shapes:
sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left"))
for shape in sorted_shapes:
get_shape_content(shape, **kwargs)
md_content = md_content.strip()

View File

@@ -7,8 +7,6 @@ import openai
import pytest
import requests
import warnings
from markitdown import (
MarkItDown,
UnsupportedFormatException,
@@ -517,19 +515,6 @@ def test_exceptions() -> None:
reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
warnings.simplefilter("default")
with warnings.catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
warnings.resetwarnings()
which_exiftool = shutil.which("exiftool")
assert which_exiftool is not None