feat(docker): improve dockerfile build (#220)

* refactor(docker): remove unnecessary root user

The USER root directive isn't needed directly after FROM

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): use generic nobody nogroup default instead of uid gid

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): build app from source locally instead of installing package

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): use correct files in dockerignore

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* chore(docker): don't install recommended packages with git

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* fix(docker): run apt as non-interactive

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>

* Update Dockerfile to new package structure, and fix streaming bugs.

---------

Signed-off-by: Sebastian Yaghoubi <sebastianyaghoubi@gmail.com>
Co-authored-by: afourney <adamfo@microsoft.com>
This commit is contained in:
Sebastian Yaghoubi
2025-03-07 20:07:40 -08:00
committed by GitHub
parent 0229ff6cb7
commit 515fa854bf
4 changed files with 35 additions and 12 deletions

View File

@@ -327,6 +327,17 @@ class MarkItDown:
elif base_guess.extension is not None:
placeholder_filename = "placeholder" + base_guess.extension
# Check if we have a seekable stream. If not, load the entire stream into memory.
if not stream.seekable():
buffer = io.BytesIO()
while True:
chunk = stream.read(4096)
if not chunk:
break
buffer.write(chunk)
buffer.seek(0)
stream = buffer
# Add guesses based on stream content
for guess in _guess_stream_info_from_stream(
file_stream=stream, filename_hint=placeholder_filename

View File

@@ -7,6 +7,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
olefile = None
try:
import olefile
except ImportError:
@@ -48,7 +49,7 @@ class OutlookMsgConverter(DocumentConverter):
# Brute force, check if we have an OLE file
cur_pos = file_stream.tell()
try:
if not olefile.isOleFile(file_stream):
if olefile and not olefile.isOleFile(file_stream):
return False
finally:
file_stream.seek(cur_pos)