Compare commits
4 Commits
onenote
...
zip_format
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f17bc21c9d | ||
|
|
99d8e562db | ||
|
|
515fa854bf | ||
|
|
0229ff6cb7 |
@@ -1 +1,2 @@
|
||||
*
|
||||
!packages/
|
||||
|
||||
30
Dockerfile
30
Dockerfile
@@ -1,22 +1,32 @@
|
||||
FROM python:3.13-slim-bullseye
|
||||
|
||||
USER root
|
||||
|
||||
ARG INSTALL_GIT=false
|
||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
|
||||
fi
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXIFTOOL_PATH=/usr/bin/exiftool
|
||||
ENV FFMPEG_PATH=/usr/bin/ffmpeg
|
||||
|
||||
# Runtime dependency
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ffmpeg \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
exiftool
|
||||
|
||||
RUN pip install markitdown
|
||||
ARG INSTALL_GIT=false
|
||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||
apt-get install -y --no-install-recommends \
|
||||
git; \
|
||||
fi
|
||||
|
||||
# Cleanup
|
||||
RUN rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY . /app
|
||||
RUN pip --no-cache-dir install \
|
||||
/app/packages/markitdown[all] \
|
||||
/app/packages/markitdown-sample-plugin
|
||||
|
||||
# Default USERID and GROUPID
|
||||
ARG USERID=10000
|
||||
ARG GROUPID=10000
|
||||
ARG USERID=nobody
|
||||
ARG GROUPID=nogroup
|
||||
|
||||
USER $USERID:$GROUPID
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
|
||||
At present, MarkItDown supports:
|
||||
|
||||
- PDF
|
||||
- PowerPoint
|
||||
- PowerPoint (reading in top-to-bottom, left-to-right order)
|
||||
- Word
|
||||
- Excel
|
||||
- Images (EXIF metadata and OCR)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.0a1"
|
||||
__version__ = "0.1.0a2"
|
||||
|
||||
@@ -3,6 +3,7 @@ import mimetypes
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import tempfile
|
||||
import warnings
|
||||
import traceback
|
||||
@@ -138,9 +139,30 @@ class MarkItDown:
|
||||
self._llm_model = kwargs.get("llm_model")
|
||||
self._exiftool_path = kwargs.get("exiftool_path")
|
||||
self._style_map = kwargs.get("style_map")
|
||||
|
||||
if self._exiftool_path is None:
|
||||
self._exiftool_path = os.getenv("EXIFTOOL_PATH")
|
||||
|
||||
# Still none? Check well-known paths
|
||||
if self._exiftool_path is None:
|
||||
candidate = shutil.which("exiftool")
|
||||
if candidate:
|
||||
candidate = os.path.abspath(candidate)
|
||||
if any(
|
||||
d == os.path.dirname(candidate)
|
||||
for d in [
|
||||
"/usr/bin",
|
||||
"/usr/local/bin",
|
||||
"/opt",
|
||||
"/opt/bin",
|
||||
"/opt/local/bin",
|
||||
"/opt/homebrew/bin",
"C:\\Windows\\System32",
|
||||
"C:\\Program Files",
|
||||
"C:\\Program Files (x86)",
|
||||
]
|
||||
):
|
||||
self._exiftool_path = candidate
|
||||
|
||||
# Register converters for successful browsing operations
|
||||
# Later registrations are tried first / take higher priority than earlier registrations
|
||||
# To this end, the most specific converters should appear below the most generic converters
|
||||
@@ -327,6 +349,17 @@ class MarkItDown:
|
||||
elif base_guess.extension is not None:
|
||||
placeholder_filename = "placeholder" + base_guess.extension
|
||||
|
||||
# Check if we have a seekable stream. If not, load the entire stream into memory.
|
||||
if not stream.seekable():
|
||||
buffer = io.BytesIO()
|
||||
while True:
|
||||
chunk = stream.read(4096)
|
||||
if not chunk:
|
||||
break
|
||||
buffer.write(chunk)
|
||||
buffer.seek(0)
|
||||
stream = buffer
|
||||
|
||||
# Add guesses based on stream content
|
||||
for guess in _guess_stream_info_from_stream(
|
||||
file_stream=stream, filename_hint=placeholder_filename
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import puremagic
|
||||
import mimetypes
|
||||
import zipfile
|
||||
import os
|
||||
from dataclasses import dataclass, asdict
|
||||
from typing import Optional, BinaryIO, List, TypeVar, Type
|
||||
from typing import Optional, BinaryIO, List, Union
|
||||
|
||||
# Mimetype substitutions table
|
||||
MIMETYPE_SUBSTITUTIONS = {
|
||||
@@ -74,6 +75,20 @@ def _guess_stream_info_from_stream(
|
||||
)
|
||||
)
|
||||
|
||||
# If it looks like a zip use _guess_stream_info_from_zip rather than puremagic
|
||||
cur_pos = file_stream.tell()
|
||||
try:
|
||||
header = file_stream.read(4)
|
||||
file_stream.seek(cur_pos)
|
||||
if header == b"PK\x03\x04":
|
||||
zip_guess = _guess_stream_info_from_zip(file_stream)
|
||||
if zip_guess:
|
||||
guesses.append(zip_guess)
|
||||
return guesses
|
||||
finally:
|
||||
file_stream.seek(cur_pos)
|
||||
|
||||
# Fall back to using puremagic
|
||||
def _puremagic(
|
||||
file_stream, filename_hint
|
||||
) -> List[puremagic.main.PureMagicWithConfidence]:
|
||||
@@ -120,3 +135,74 @@ def _guess_stream_info_from_stream(
|
||||
guesses.append(StreamInfo(**kwargs))
|
||||
|
||||
return guesses
|
||||
|
||||
|
||||
def _guess_stream_info_from_zip(file_stream: BinaryIO) -> Union[None, StreamInfo]:
    """
    Guess StreamInfo properties (mostly mimetype and extension) from a zip stream.

    The archive's table of contents is inspected for marker entries that
    identify well-known zip-based container formats. The position of
    file_stream is restored before returning, whatever the outcome.

    Args:
    - file_stream: The seekable binary stream to guess the StreamInfo from.

    Returns the single best guess, or None if the stream is not a readable
    zip archive. An archive that matches no known container format is reported
    as a plain "application/zip".
    """

    # Marker part -> (mimetype, extension) for Open Packaging Conventions
    # (OPC) containers. Order matters: the first marker found wins.
    opc_markers = (
        (
            "word/document.xml",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".docx",
        ),
        (
            "xl/workbook.xml",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ".xlsx",
        ),
        (
            "ppt/presentation.xml",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ".pptx",
        ),
        # Zip-packaged Visio drawings are .vsdx; .vsd is the legacy binary format.
        ("visio/document.xml", "application/vnd.ms-visio.drawing", ".vsdx"),
        ("FixedDocSeq.fdseq", "application/vnd.ms-xpsdocument", ".xps"),
    )

    cur_pos = file_stream.tell()
    try:
        with zipfile.ZipFile(file_stream) as z:
            # A set gives constant-time membership tests for the checks below.
            table_of_contents = frozenset(z.namelist())

            # Open Packaging Conventions (OPC) file
            if "[Content_Types].xml" in table_of_contents:
                for marker, opc_mimetype, opc_extension in opc_markers:
                    if marker in table_of_contents:
                        return StreamInfo(
                            mimetype=opc_mimetype, extension=opc_extension
                        )

            # EPUB, or similar: the archive declares its own type in a
            # "mimetype" member.
            if "mimetype" in table_of_contents:
                try:
                    _mimetype = z.read("mimetype").decode("ascii").strip()
                except (UnicodeDecodeError, RuntimeError, NotImplementedError):
                    # Non-ASCII content, an encrypted member, or an unsupported
                    # compression method: ignore the declaration and keep
                    # guessing instead of failing the whole detection.
                    _mimetype = ""

                if _mimetype:
                    # guess_extension returns None for unknown mimetypes
                    _extension = mimetypes.guess_extension(_mimetype)
                    return StreamInfo(mimetype=_mimetype, extension=_extension)

            # JAR
            if "META-INF/MANIFEST.MF" in table_of_contents:
                return StreamInfo(mimetype="application/java-archive", extension=".jar")

            # If we made it this far, we couldn't identify the file
            return StreamInfo(mimetype="application/zip", extension=".zip")

    except zipfile.BadZipFile:
        return None
    finally:
        # Leave the stream where we found it so later guessers/converters
        # can read it from the same position.
        file_stream.seek(cur_pos)
|
||||
|
||||
@@ -5,26 +5,16 @@ import sys
|
||||
import shutil
|
||||
import os
|
||||
import warnings
|
||||
from typing import BinaryIO, Optional, Any
|
||||
from typing import BinaryIO, Any, Union
|
||||
|
||||
|
||||
def exiftool_metadata(
|
||||
file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
|
||||
file_stream: BinaryIO,
|
||||
*,
|
||||
exiftool_path: Union[str, None],
|
||||
) -> Any: # Need a better type for json data
|
||||
# Check if we have a valid pointer to exiftool
|
||||
if not exiftool_path:
|
||||
which_exiftool = shutil.which("exiftool")
|
||||
if which_exiftool:
|
||||
warnings.warn(
|
||||
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
|
||||
|
||||
md = MarkItDown(exiftool_path="{which_exiftool}")
|
||||
|
||||
This warning will be removed in future releases.
|
||||
""",
|
||||
DeprecationWarning,
|
||||
)
|
||||
# Nothing to do
|
||||
if not exiftool_path:
|
||||
return {}
|
||||
|
||||
# Run exiftool
|
||||
|
||||
@@ -7,6 +7,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
olefile = None
|
||||
try:
|
||||
import olefile
|
||||
except ImportError:
|
||||
@@ -48,7 +49,7 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
# Brute force, check if we have an OLE file
|
||||
cur_pos = file_stream.tell()
|
||||
try:
|
||||
if not olefile.isOleFile(file_stream):
|
||||
if olefile and not olefile.isOleFile(file_stream):
|
||||
return False
|
||||
finally:
|
||||
file_stream.seek(cur_pos)
|
||||
|
||||
@@ -6,6 +6,7 @@ import re
|
||||
import html
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
from operator import attrgetter
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from ._llm_caption import llm_caption
|
||||
@@ -160,10 +161,12 @@ class PptxConverter(DocumentConverter):
|
||||
|
||||
# Group Shapes
|
||||
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
|
||||
for subshape in shape.shapes:
|
||||
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
|
||||
for subshape in sorted_shapes:
|
||||
get_shape_content(subshape, **kwargs)
|
||||
|
||||
for shape in slide.shapes:
|
||||
sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left"))
|
||||
for shape in sorted_shapes:
|
||||
get_shape_content(shape, **kwargs)
|
||||
|
||||
md_content = md_content.strip()
|
||||
|
||||
@@ -7,8 +7,6 @@ import openai
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
import warnings
|
||||
|
||||
from markitdown import (
|
||||
MarkItDown,
|
||||
UnsupportedFormatException,
|
||||
@@ -517,19 +515,6 @@ def test_exceptions() -> None:
|
||||
reason="do not run if exiftool is not installed",
|
||||
)
|
||||
def test_markitdown_exiftool() -> None:
|
||||
# Test the automatic discovery of exiftool throws a warning
|
||||
# and is disabled
|
||||
try:
|
||||
warnings.simplefilter("default")
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
||||
assert len(w) == 1
|
||||
assert w[0].category is DeprecationWarning
|
||||
assert result.text_content.strip() == ""
|
||||
finally:
|
||||
warnings.resetwarnings()
|
||||
|
||||
which_exiftool = shutil.which("exiftool")
|
||||
assert which_exiftool is not None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user