feat(docker): improve dockerfile build (#220)
* refactor(docker): remove unnecessary root user
  The USER root directive isn't needed directly after FROM.
  Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>
* fix(docker): use generic nobody nogroup default instead of uid gid
  Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>
* fix(docker): build app from source locally instead of installing package
  Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>
* fix(docker): use correct files in dockerignore
  Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>
* chore(docker): don't install recommended packages with git
  Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>
* fix(docker): run apt as non-interactive
  Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>
* Update Dockerfile to new package structure, and fix streaming bugs.
---------
Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>
Co-authored-by: afourney <adamfo@microsoft.com>
This commit is contained in:
committed by
GitHub
parent
0229ff6cb7
commit
515fa854bf
@@ -1 +1,2 @@
|
|||||||
*
|
*
|
||||||
|
!packages/
|
||||||
|
|||||||
30
Dockerfile
30
Dockerfile
@@ -1,22 +1,32 @@
|
|||||||
FROM python:3.13-slim-bullseye
|
FROM python:3.13-slim-bullseye
|
||||||
|
|
||||||
USER root
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
ENV EXIFTOOL_PATH=/usr/bin/exiftool
|
||||||
ARG INSTALL_GIT=false
|
ENV FFMPEG_PATH=/usr/bin/ffmpeg
|
||||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
|
||||||
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Runtime dependency
|
# Runtime dependency
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
ffmpeg \
|
ffmpeg \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
exiftool
|
||||||
|
|
||||||
RUN pip install markitdown
|
ARG INSTALL_GIT=false
|
||||||
|
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
git; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
RUN rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
COPY . /app
|
||||||
|
RUN pip --no-cache-dir install \
|
||||||
|
/app/packages/markitdown[all] \
|
||||||
|
/app/packages/markitdown-sample-plugin
|
||||||
|
|
||||||
# Default USERID and GROUPID
|
# Default USERID and GROUPID
|
||||||
ARG USERID=10000
|
ARG USERID=nobody
|
||||||
ARG GROUPID=10000
|
ARG GROUPID=nogroup
|
||||||
|
|
||||||
USER $USERID:$GROUPID
|
USER $USERID:$GROUPID
|
||||||
|
|
||||||
|
|||||||
@@ -327,6 +327,17 @@ class MarkItDown:
|
|||||||
elif base_guess.extension is not None:
|
elif base_guess.extension is not None:
|
||||||
placeholder_filename = "placeholder" + base_guess.extension
|
placeholder_filename = "placeholder" + base_guess.extension
|
||||||
|
|
||||||
|
# Check if we have a seekable stream. If not, load the entire stream into memory.
|
||||||
|
if not stream.seekable():
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
while True:
|
||||||
|
chunk = stream.read(4096)
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
buffer.write(chunk)
|
||||||
|
buffer.seek(0)
|
||||||
|
stream = buffer
|
||||||
|
|
||||||
# Add guesses based on stream content
|
# Add guesses based on stream content
|
||||||
for guess in _guess_stream_info_from_stream(
|
for guess in _guess_stream_info_from_stream(
|
||||||
file_stream=stream, filename_hint=placeholder_filename
|
file_stream=stream, filename_hint=placeholder_filename
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
|||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
# Save reporting of any exceptions for later
|
# Save reporting of any exceptions for later
|
||||||
_dependency_exc_info = None
|
_dependency_exc_info = None
|
||||||
|
olefile = None
|
||||||
try:
|
try:
|
||||||
import olefile
|
import olefile
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -48,7 +49,7 @@ class OutlookMsgConverter(DocumentConverter):
|
|||||||
# Brute force, check if we have an OLE file
|
# Brute force, check if we have an OLE file
|
||||||
cur_pos = file_stream.tell()
|
cur_pos = file_stream.tell()
|
||||||
try:
|
try:
|
||||||
if not olefile.isOleFile(file_stream):
|
if olefile and not olefile.isOleFile(file_stream):
|
||||||
return False
|
return False
|
||||||
finally:
|
finally:
|
||||||
file_stream.seek(cur_pos)
|
file_stream.seek(cur_pos)
|
||||||
|
|||||||
Reference in New Issue
Block a user