diff --git a/.dockerignore b/.dockerignore index f59ec20..319b932 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1 +1,2 @@ -* \ No newline at end of file +* +!packages/ diff --git a/Dockerfile b/Dockerfile index 0072d9e..c65bf9c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,32 @@ FROM python:3.13-slim-bullseye -USER root - -ARG INSTALL_GIT=false -RUN if [ "$INSTALL_GIT" = "true" ]; then \ - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \ - fi +ENV DEBIAN_FRONTEND=noninteractive +ENV EXIFTOOL_PATH=/usr/bin/exiftool +ENV FFMPEG_PATH=/usr/bin/ffmpeg # Runtime dependency RUN apt-get update && apt-get install -y --no-install-recommends \ ffmpeg \ - && rm -rf /var/lib/apt/lists/* + exiftool -RUN pip install markitdown +ARG INSTALL_GIT=false +RUN if [ "$INSTALL_GIT" = "true" ]; then \ + apt-get install -y --no-install-recommends \ + git; \ + fi + +# Cleanup +RUN rm -rf /var/lib/apt/lists/* + +WORKDIR /app +COPY . /app +RUN pip --no-cache-dir install \ + /app/packages/markitdown[all] \ + /app/packages/markitdown-sample-plugin # Default USERID and GROUPID -ARG USERID=10000 -ARG GROUPID=10000 +ARG USERID=nobody +ARG GROUPID=nogroup USER $USERID:$GROUPID diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f32b236..04015d7 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -327,6 +327,17 @@ class MarkItDown: elif base_guess.extension is not None: placeholder_filename = "placeholder" + base_guess.extension + # Check if we have a seekable stream. If not, load the entire stream into memory. + if not stream.seekable(): + buffer = io.BytesIO() + while True: + chunk = stream.read(4096) + if not chunk: + break + buffer.write(chunk) + buffer.seek(0) + stream = buffer + # Add guesses based on stream content for guess in _guess_stream_info_from_stream( file_stream=stream, filename_hint=placeholder_filename diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index 8a61b0c..8e20dc5 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -7,6 +7,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies # Save reporting of any exceptions for later _dependency_exc_info = None +olefile = None try: import olefile except ImportError: @@ -48,7 +49,7 @@ class OutlookMsgConverter(DocumentConverter): # Brute force, check if we have an OLE file cur_pos = file_stream.tell() try: - if not olefile.isOleFile(file_stream): + if olefile and not olefile.isOleFile(file_stream): return False finally: file_stream.seek(cur_pos)