Compare commits
4 Commits
onenote
...
zip_format
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f17bc21c9d | ||
|
|
99d8e562db | ||
|
|
515fa854bf | ||
|
|
0229ff6cb7 |
@@ -1 +1,2 @@
|
|||||||
*
|
*
|
||||||
|
!packages/
|
||||||
|
|||||||
30
Dockerfile
30
Dockerfile
@@ -1,22 +1,32 @@
|
|||||||
FROM python:3.13-slim-bullseye
|
FROM python:3.13-slim-bullseye
|
||||||
|
|
||||||
USER root
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
ENV EXIFTOOL_PATH=/usr/bin/exiftool
|
||||||
ARG INSTALL_GIT=false
|
ENV FFMPEG_PATH=/usr/bin/ffmpeg
|
||||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
|
||||||
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Runtime dependency
|
# Runtime dependency
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
ffmpeg \
|
ffmpeg \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
exiftool
|
||||||
|
|
||||||
RUN pip install markitdown
|
ARG INSTALL_GIT=false
|
||||||
|
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
git; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
RUN rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
COPY . /app
|
||||||
|
RUN pip --no-cache-dir install \
|
||||||
|
/app/packages/markitdown[all] \
|
||||||
|
/app/packages/markitdown-sample-plugin
|
||||||
|
|
||||||
# Default USERID and GROUPID
|
# Default USERID and GROUPID
|
||||||
ARG USERID=10000
|
ARG USERID=nobody
|
||||||
ARG GROUPID=10000
|
ARG GROUPID=nogroup
|
||||||
|
|
||||||
USER $USERID:$GROUPID
|
USER $USERID:$GROUPID
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
|
|||||||
At present, MarkItDown supports:
|
At present, MarkItDown supports:
|
||||||
|
|
||||||
- PDF
|
- PDF
|
||||||
- PowerPoint
|
- PowerPoint (reading in top-to-bottom, left-to-right order)
|
||||||
- Word
|
- Word
|
||||||
- Excel
|
- Excel
|
||||||
- Images (EXIF metadata and OCR)
|
- Images (EXIF metadata and OCR)
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
__version__ = "0.1.0a1"
|
__version__ = "0.1.0a2"
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import mimetypes
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
import warnings
|
import warnings
|
||||||
import traceback
|
import traceback
|
||||||
@@ -138,9 +139,30 @@ class MarkItDown:
|
|||||||
self._llm_model = kwargs.get("llm_model")
|
self._llm_model = kwargs.get("llm_model")
|
||||||
self._exiftool_path = kwargs.get("exiftool_path")
|
self._exiftool_path = kwargs.get("exiftool_path")
|
||||||
self._style_map = kwargs.get("style_map")
|
self._style_map = kwargs.get("style_map")
|
||||||
|
|
||||||
if self._exiftool_path is None:
|
if self._exiftool_path is None:
|
||||||
self._exiftool_path = os.getenv("EXIFTOOL_PATH")
|
self._exiftool_path = os.getenv("EXIFTOOL_PATH")
|
||||||
|
|
||||||
|
# Still none? Check well-known paths
|
||||||
|
if self._exiftool_path is None:
|
||||||
|
candidate = shutil.which("exiftool")
|
||||||
|
if candidate:
|
||||||
|
candidate = os.path.abspath(candidate)
|
||||||
|
if any(
|
||||||
|
d == os.path.dirname(candidate)
|
||||||
|
for d in [
|
||||||
|
"/usr/bin",
|
||||||
|
"/usr/local/bin",
|
||||||
|
"/opt",
|
||||||
|
"/opt/bin",
|
||||||
|
"/opt/local/bin",
|
||||||
|
"/opt/homebrew/bin", "C:\\Windows\\System32",
|
||||||
|
"C:\\Program Files",
|
||||||
|
"C:\\Program Files (x86)",
|
||||||
|
]
|
||||||
|
):
|
||||||
|
self._exiftool_path = candidate
|
||||||
|
|
||||||
# Register converters for successful browsing operations
|
# Register converters for successful browsing operations
|
||||||
# Later registrations are tried first / take higher priority than earlier registrations
|
# Later registrations are tried first / take higher priority than earlier registrations
|
||||||
# To this end, the most specific converters should appear below the most generic converters
|
# To this end, the most specific converters should appear below the most generic converters
|
||||||
@@ -327,6 +349,17 @@ class MarkItDown:
|
|||||||
elif base_guess.extension is not None:
|
elif base_guess.extension is not None:
|
||||||
placeholder_filename = "placeholder" + base_guess.extension
|
placeholder_filename = "placeholder" + base_guess.extension
|
||||||
|
|
||||||
|
# Check if we have a seekable stream. If not, load the entire stream into memory.
|
||||||
|
if not stream.seekable():
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
while True:
|
||||||
|
chunk = stream.read(4096)
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
buffer.write(chunk)
|
||||||
|
buffer.seek(0)
|
||||||
|
stream = buffer
|
||||||
|
|
||||||
# Add guesses based on stream content
|
# Add guesses based on stream content
|
||||||
for guess in _guess_stream_info_from_stream(
|
for guess in _guess_stream_info_from_stream(
|
||||||
file_stream=stream, filename_hint=placeholder_filename
|
file_stream=stream, filename_hint=placeholder_filename
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
import puremagic
|
import puremagic
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
import zipfile
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass, asdict
|
from dataclasses import dataclass, asdict
|
||||||
from typing import Optional, BinaryIO, List, TypeVar, Type
|
from typing import Optional, BinaryIO, List, Union
|
||||||
|
|
||||||
# Mimetype substitutions table
|
# Mimetype substitutions table
|
||||||
MIMETYPE_SUBSTITUTIONS = {
|
MIMETYPE_SUBSTITUTIONS = {
|
||||||
@@ -74,6 +75,20 @@ def _guess_stream_info_from_stream(
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# If it looks like a zip use _guess_stream_info_from_zip rather than puremagic
|
||||||
|
cur_pos = file_stream.tell()
|
||||||
|
try:
|
||||||
|
header = file_stream.read(4)
|
||||||
|
file_stream.seek(cur_pos)
|
||||||
|
if header == b"PK\x03\x04":
|
||||||
|
zip_guess = _guess_stream_info_from_zip(file_stream)
|
||||||
|
if zip_guess:
|
||||||
|
guesses.append(zip_guess)
|
||||||
|
return guesses
|
||||||
|
finally:
|
||||||
|
file_stream.seek(cur_pos)
|
||||||
|
|
||||||
|
# Fall back to using puremagic
|
||||||
def _puremagic(
|
def _puremagic(
|
||||||
file_stream, filename_hint
|
file_stream, filename_hint
|
||||||
) -> List[puremagic.main.PureMagicWithConfidence]:
|
) -> List[puremagic.main.PureMagicWithConfidence]:
|
||||||
@@ -120,3 +135,74 @@ def _guess_stream_info_from_stream(
|
|||||||
guesses.append(StreamInfo(**kwargs))
|
guesses.append(StreamInfo(**kwargs))
|
||||||
|
|
||||||
return guesses
|
return guesses
|
||||||
|
|
||||||
|
|
||||||
|
def _guess_stream_info_from_zip(file_stream: BinaryIO) -> Union[None, StreamInfo]:
|
||||||
|
"""
|
||||||
|
Guess StreamInfo properties (mostly mimetype and extension) from a zip stream.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
- file_stream: The stream to guess the StreamInfo from.
|
||||||
|
|
||||||
|
Returns the single best guess, or None if no guess could be made.
|
||||||
|
"""
|
||||||
|
|
||||||
|
cur_pos = file_stream.tell()
|
||||||
|
try:
|
||||||
|
with zipfile.ZipFile(file_stream) as z:
|
||||||
|
table_of_contents = z.namelist()
|
||||||
|
|
||||||
|
# Open Packaging Conventions (OPC) file
|
||||||
|
if "[Content_Types].xml" in table_of_contents:
|
||||||
|
# Word file
|
||||||
|
if "word/document.xml" in table_of_contents:
|
||||||
|
return StreamInfo(
|
||||||
|
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
extension=".docx",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Excel file
|
||||||
|
if "xl/workbook.xml" in table_of_contents:
|
||||||
|
return StreamInfo(
|
||||||
|
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
|
extension=".xlsx",
|
||||||
|
)
|
||||||
|
|
||||||
|
# PowerPoint file
|
||||||
|
if "ppt/presentation.xml" in table_of_contents:
|
||||||
|
return StreamInfo(
|
||||||
|
mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||||
|
extension=".pptx",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Visio file
|
||||||
|
if "visio/document.xml" in table_of_contents:
|
||||||
|
return StreamInfo(
|
||||||
|
mimetype="application/vnd.ms-visio.drawing",
|
||||||
|
extension=".vsdx",
|
||||||
|
)
|
||||||
|
|
||||||
|
# XPS file
|
||||||
|
if "FixedDocSeq.fdseq" in table_of_contents:
|
||||||
|
return StreamInfo(
|
||||||
|
mimetype="application/vnd.ms-xpsdocument",
|
||||||
|
extension=".xps",
|
||||||
|
)
|
||||||
|
|
||||||
|
# EPUB, or similar
|
||||||
|
if "mimetype" in table_of_contents:
|
||||||
|
_mimetype = z.read("mimetype").decode("ascii").strip()
|
||||||
|
_extension = mimetypes.guess_extension(_mimetype)
|
||||||
|
return StreamInfo(mimetype=_mimetype, extension=_extension)
|
||||||
|
|
||||||
|
# JAR
|
||||||
|
if "META-INF/MANIFEST.MF" in table_of_contents:
|
||||||
|
return StreamInfo(mimetype="application/java-archive", extension=".jar")
|
||||||
|
|
||||||
|
# If we made it this far, we couldn't identify the file
|
||||||
|
return StreamInfo(mimetype="application/zip", extension=".zip")
|
||||||
|
|
||||||
|
except zipfile.BadZipFile:
|
||||||
|
return None
|
||||||
|
finally:
|
||||||
|
file_stream.seek(cur_pos)
|
||||||
|
|||||||
@@ -5,26 +5,16 @@ import sys
|
|||||||
import shutil
|
import shutil
|
||||||
import os
|
import os
|
||||||
import warnings
|
import warnings
|
||||||
from typing import BinaryIO, Optional, Any
|
from typing import BinaryIO, Any, Union
|
||||||
|
|
||||||
|
|
||||||
def exiftool_metadata(
|
def exiftool_metadata(
|
||||||
file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
|
file_stream: BinaryIO,
|
||||||
|
*,
|
||||||
|
exiftool_path: Union[str, None],
|
||||||
) -> Any: # Need a better type for json data
|
) -> Any: # Need a better type for json data
|
||||||
# Check if we have a valid pointer to exiftool
|
# Nothing to do
|
||||||
if not exiftool_path:
|
if not exiftool_path:
|
||||||
which_exiftool = shutil.which("exiftool")
|
|
||||||
if which_exiftool:
|
|
||||||
warnings.warn(
|
|
||||||
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
|
|
||||||
|
|
||||||
md = MarkItDown(exiftool_path="{which_exiftool}")
|
|
||||||
|
|
||||||
This warning will be removed in future releases.
|
|
||||||
""",
|
|
||||||
DeprecationWarning,
|
|
||||||
)
|
|
||||||
# Nothing to do
|
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
# Run exiftool
|
# Run exiftool
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
|||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
# Save reporting of any exceptions for later
|
# Save reporting of any exceptions for later
|
||||||
_dependency_exc_info = None
|
_dependency_exc_info = None
|
||||||
|
olefile = None
|
||||||
try:
|
try:
|
||||||
import olefile
|
import olefile
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -48,7 +49,7 @@ class OutlookMsgConverter(DocumentConverter):
|
|||||||
# Brute force, check if we have an OLE file
|
# Brute force, check if we have an OLE file
|
||||||
cur_pos = file_stream.tell()
|
cur_pos = file_stream.tell()
|
||||||
try:
|
try:
|
||||||
if not olefile.isOleFile(file_stream):
|
if olefile and not olefile.isOleFile(file_stream):
|
||||||
return False
|
return False
|
||||||
finally:
|
finally:
|
||||||
file_stream.seek(cur_pos)
|
file_stream.seek(cur_pos)
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import re
|
|||||||
import html
|
import html
|
||||||
|
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
|
from operator import attrgetter
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from ._llm_caption import llm_caption
|
from ._llm_caption import llm_caption
|
||||||
@@ -160,10 +161,12 @@ class PptxConverter(DocumentConverter):
|
|||||||
|
|
||||||
# Group Shapes
|
# Group Shapes
|
||||||
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
|
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
|
||||||
for subshape in shape.shapes:
|
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
|
||||||
|
for subshape in sorted_shapes:
|
||||||
get_shape_content(subshape, **kwargs)
|
get_shape_content(subshape, **kwargs)
|
||||||
|
|
||||||
for shape in slide.shapes:
|
sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left"))
|
||||||
|
for shape in sorted_shapes:
|
||||||
get_shape_content(shape, **kwargs)
|
get_shape_content(shape, **kwargs)
|
||||||
|
|
||||||
md_content = md_content.strip()
|
md_content = md_content.strip()
|
||||||
|
|||||||
@@ -7,8 +7,6 @@ import openai
|
|||||||
import pytest
|
import pytest
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
from markitdown import (
|
from markitdown import (
|
||||||
MarkItDown,
|
MarkItDown,
|
||||||
UnsupportedFormatException,
|
UnsupportedFormatException,
|
||||||
@@ -517,19 +515,6 @@ def test_exceptions() -> None:
|
|||||||
reason="do not run if exiftool is not installed",
|
reason="do not run if exiftool is not installed",
|
||||||
)
|
)
|
||||||
def test_markitdown_exiftool() -> None:
|
def test_markitdown_exiftool() -> None:
|
||||||
# Test the automatic discovery of exiftool throws a warning
|
|
||||||
# and is disabled
|
|
||||||
try:
|
|
||||||
warnings.simplefilter("default")
|
|
||||||
with warnings.catch_warnings(record=True) as w:
|
|
||||||
markitdown = MarkItDown()
|
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
|
||||||
assert len(w) == 1
|
|
||||||
assert w[0].category is DeprecationWarning
|
|
||||||
assert result.text_content.strip() == ""
|
|
||||||
finally:
|
|
||||||
warnings.resetwarnings()
|
|
||||||
|
|
||||||
which_exiftool = shutil.which("exiftool")
|
which_exiftool = shutil.which("exiftool")
|
||||||
assert which_exiftool is not None
|
assert which_exiftool is not None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user