Compare commits
4 Commits
onenote
...
zip_format
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f17bc21c9d | ||
|
|
99d8e562db | ||
|
|
515fa854bf | ||
|
|
0229ff6cb7 |
@@ -1 +1,2 @@
|
||||
*
|
||||
*
|
||||
!packages/
|
||||
|
||||
30
Dockerfile
30
Dockerfile
@@ -1,22 +1,32 @@
|
||||
FROM python:3.13-slim-bullseye
|
||||
|
||||
USER root
|
||||
|
||||
ARG INSTALL_GIT=false
|
||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
|
||||
fi
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXIFTOOL_PATH=/usr/bin/exiftool
|
||||
ENV FFMPEG_PATH=/usr/bin/ffmpeg
|
||||
|
||||
# Runtime dependency
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ffmpeg \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
exiftool
|
||||
|
||||
RUN pip install markitdown
|
||||
ARG INSTALL_GIT=false
|
||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||
apt-get install -y --no-install-recommends \
|
||||
git; \
|
||||
fi
|
||||
|
||||
# Cleanup
|
||||
RUN rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY . /app
|
||||
RUN pip --no-cache-dir install \
|
||||
/app/packages/markitdown[all] \
|
||||
/app/packages/markitdown-sample-plugin
|
||||
|
||||
# Default USERID and GROUPID
|
||||
ARG USERID=10000
|
||||
ARG GROUPID=10000
|
||||
ARG USERID=nobody
|
||||
ARG GROUPID=nogroup
|
||||
|
||||
USER $USERID:$GROUPID
|
||||
|
||||
|
||||
@@ -14,10 +14,9 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
|
||||
At present, MarkItDown supports:
|
||||
|
||||
- PDF
|
||||
- PowerPoint
|
||||
- PowerPoint (reading in top-to-bottom, left-to-right order)
|
||||
- Word
|
||||
- Excel
|
||||
- OneNote
|
||||
- Images (EXIF metadata and OCR)
|
||||
- Audio (EXIF metadata and speech transcription)
|
||||
- HTML
|
||||
@@ -83,7 +82,6 @@ At the moment, the following optional dependencies are available:
|
||||
* `[xls]` Installs dependencies for older Excel files
|
||||
* `[pdf]` Installs dependencies for PDF files
|
||||
* `[outlook]` Installs dependencies for Outlook messages
|
||||
* `[onenote]` Installs dependencies for OneNote .one files
|
||||
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
|
||||
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
|
||||
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
|
||||
|
||||
@@ -45,8 +45,7 @@ all = [
|
||||
"SpeechRecognition",
|
||||
"youtube-transcript-api",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-identity",
|
||||
"one-extract",
|
||||
"azure-identity"
|
||||
]
|
||||
pptx = ["python-pptx"]
|
||||
docx = ["mammoth"]
|
||||
@@ -54,7 +53,6 @@ xlsx = ["pandas", "openpyxl"]
|
||||
xls = ["pandas", "xlrd"]
|
||||
pdf = ["pdfminer.six"]
|
||||
outlook = ["olefile"]
|
||||
onenote = ["one-extract"]
|
||||
audio-transcription = ["pydub", "SpeechRecognition"]
|
||||
youtube-transcription = ["youtube-transcript-api"]
|
||||
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.0a1"
|
||||
__version__ = "0.1.0a2"
|
||||
|
||||
@@ -3,6 +3,7 @@ import mimetypes
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import tempfile
|
||||
import warnings
|
||||
import traceback
|
||||
@@ -30,7 +31,6 @@ from .converters import (
|
||||
BingSerpConverter,
|
||||
PdfConverter,
|
||||
DocxConverter,
|
||||
OneNoteConverter,
|
||||
XlsxConverter,
|
||||
XlsConverter,
|
||||
PptxConverter,
|
||||
@@ -139,9 +139,30 @@ class MarkItDown:
|
||||
self._llm_model = kwargs.get("llm_model")
|
||||
self._exiftool_path = kwargs.get("exiftool_path")
|
||||
self._style_map = kwargs.get("style_map")
|
||||
|
||||
# No exiftool path has been set yet: fall back to the EXIFTOOL_PATH
# environment variable (os.getenv returns None when it is unset).
if self._exiftool_path is None:
    self._exiftool_path = os.getenv("EXIFTOOL_PATH")

# Still none? Check well-known paths
if self._exiftool_path is None:
    candidate = shutil.which("exiftool")
    if candidate:
        candidate = os.path.abspath(candidate)
        # Accept the binary found on PATH only when it sits directly inside
        # one of these directories; anything else is ignored and the path
        # stays None.
        # NOTE(review): presumably this guards against running an arbitrary
        # `exiftool` that happens to be on PATH -- confirm intent.
        if os.path.dirname(candidate) in (
            "/usr/bin",
            "/usr/local/bin",
            "/opt",
            "/opt/bin",
            "/opt/local/bin",
            # These two entries were previously fused into a single bogus
            # path by implicit string-literal concatenation (missing comma).
            "/opt/homebrew/bin",
            "C:\\Windows\\System32",
            "C:\\Program Files",
            "C:\\Program Files (x86)",
        ):
            self._exiftool_path = candidate
|
||||
|
||||
# Register converters for successful browsing operations
|
||||
# Later registrations are tried first / take higher priority than earlier registrations
|
||||
# To this end, the most specific converters should appear below the most generic converters
|
||||
@@ -159,7 +180,6 @@ class MarkItDown:
|
||||
self.register_converter(YouTubeConverter())
|
||||
self.register_converter(BingSerpConverter())
|
||||
self.register_converter(DocxConverter())
|
||||
self.register_converter(OneNoteConverter())
|
||||
self.register_converter(XlsxConverter())
|
||||
self.register_converter(XlsConverter())
|
||||
self.register_converter(PptxConverter())
|
||||
@@ -329,6 +349,17 @@ class MarkItDown:
|
||||
elif base_guess.extension is not None:
|
||||
placeholder_filename = "placeholder" + base_guess.extension
|
||||
|
||||
# Check if we have a seekable stream. If not, load the entire stream into memory.
|
||||
if not stream.seekable():
|
||||
buffer = io.BytesIO()
|
||||
while True:
|
||||
chunk = stream.read(4096)
|
||||
if not chunk:
|
||||
break
|
||||
buffer.write(chunk)
|
||||
buffer.seek(0)
|
||||
stream = buffer
|
||||
|
||||
# Add guesses based on stream content
|
||||
for guess in _guess_stream_info_from_stream(
|
||||
file_stream=stream, filename_hint=placeholder_filename
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import puremagic
|
||||
import mimetypes
|
||||
import zipfile
|
||||
import os
|
||||
from dataclasses import dataclass, asdict
|
||||
from typing import Optional, BinaryIO, List, TypeVar, Type
|
||||
from typing import Optional, BinaryIO, List, Union
|
||||
|
||||
# Mimetype substitutions table
|
||||
MIMETYPE_SUBSTITUTIONS = {
|
||||
@@ -74,6 +75,20 @@ def _guess_stream_info_from_stream(
|
||||
)
|
||||
)
|
||||
|
||||
# If it looks like a zip use _guess_stream_info_from_zip rather than puremagic
|
||||
cur_pos = file_stream.tell()
|
||||
try:
|
||||
header = file_stream.read(4)
|
||||
file_stream.seek(cur_pos)
|
||||
if header == b"PK\x03\x04":
|
||||
zip_guess = _guess_stream_info_from_zip(file_stream)
|
||||
if zip_guess:
|
||||
guesses.append(zip_guess)
|
||||
return guesses
|
||||
finally:
|
||||
file_stream.seek(cur_pos)
|
||||
|
||||
# Fall back to using puremagic
|
||||
def _puremagic(
|
||||
file_stream, filename_hint
|
||||
) -> List[puremagic.main.PureMagicWithConfidence]:
|
||||
@@ -120,3 +135,74 @@ def _guess_stream_info_from_stream(
|
||||
guesses.append(StreamInfo(**kwargs))
|
||||
|
||||
return guesses
|
||||
|
||||
|
||||
def _guess_stream_info_from_zip(file_stream: BinaryIO) -> Union[None, StreamInfo]:
    """
    Guess StreamInfo properties (mostly mimetype and extension) from a zip stream.

    The archive's member names are inspected for marker files that identify
    well-known zip-based formats (Office documents, Visio, XPS, EPUB-style
    containers, JAR). The stream position is restored before returning,
    whether or not a guess could be made.

    Args:
    - file_stream: The seekable binary stream to guess the StreamInfo from.

    Returns the single best guess, or None if no guess could be made
    (i.e. the stream could not be opened as a zip archive).
    """

    cur_pos = file_stream.tell()
    try:
        with zipfile.ZipFile(file_stream) as z:
            # Several names are probed below; a set makes each probe O(1).
            table_of_contents = set(z.namelist())

            # Open Packaging Conventions (OPC) container
            if "[Content_Types].xml" in table_of_contents:
                # (marker member, mimetype, extension), checked in order.
                opc_formats = (
                    # Word file
                    (
                        "word/document.xml",
                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                        ".docx",
                    ),
                    # Excel file
                    (
                        "xl/workbook.xml",
                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        ".xlsx",
                    ),
                    # PowerPoint file
                    (
                        "ppt/presentation.xml",
                        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                        ".pptx",
                    ),
                    # Visio file. Zip-based Visio drawings are .vsdx; .vsd is
                    # the legacy binary format and is never a zip archive.
                    (
                        "visio/document.xml",
                        "application/vnd.ms-visio.drawing",
                        ".vsdx",
                    ),
                    # XPS file
                    (
                        "FixedDocSeq.fdseq",
                        "application/vnd.ms-xpsdocument",
                        ".xps",
                    ),
                )
                for marker, mimetype, extension in opc_formats:
                    if marker in table_of_contents:
                        return StreamInfo(mimetype=mimetype, extension=extension)

            # EPUB, or similar: the container declares its own type in a
            # member literally named "mimetype".
            if "mimetype" in table_of_contents:
                try:
                    _mimetype = z.read("mimetype").decode("ascii").strip()
                except UnicodeDecodeError:
                    # Not a usable declaration; keep looking instead of
                    # letting the decode error escape the guesser.
                    _mimetype = ""
                if _mimetype:
                    _extension = mimetypes.guess_extension(_mimetype)
                    return StreamInfo(mimetype=_mimetype, extension=_extension)

            # JAR
            if "META-INF/MANIFEST.MF" in table_of_contents:
                return StreamInfo(mimetype="application/java-archive", extension=".jar")

            # If we made it this far, we couldn't identify the file
            return StreamInfo(mimetype="application/zip", extension=".zip")

    except zipfile.BadZipFile:
        return None
    finally:
        # Always hand the stream back at the position we received it.
        file_stream.seek(cur_pos)
|
||||
|
||||
@@ -11,7 +11,6 @@ from ._ipynb_converter import IpynbConverter
|
||||
from ._bing_serp_converter import BingSerpConverter
|
||||
from ._pdf_converter import PdfConverter
|
||||
from ._docx_converter import DocxConverter
|
||||
from ._onenote_converter import OneNoteConverter
|
||||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||
from ._pptx_converter import PptxConverter
|
||||
from ._image_converter import ImageConverter
|
||||
@@ -30,7 +29,6 @@ __all__ = [
|
||||
"BingSerpConverter",
|
||||
"PdfConverter",
|
||||
"DocxConverter",
|
||||
"OneNoteConverter",
|
||||
"XlsxConverter",
|
||||
"XlsConverter",
|
||||
"PptxConverter",
|
||||
|
||||
@@ -5,26 +5,16 @@ import sys
|
||||
import shutil
|
||||
import os
|
||||
import warnings
|
||||
from typing import BinaryIO, Optional, Any
|
||||
from typing import BinaryIO, Any, Union
|
||||
|
||||
|
||||
def exiftool_metadata(
|
||||
file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
|
||||
file_stream: BinaryIO,
|
||||
*,
|
||||
exiftool_path: Union[str, None],
|
||||
) -> Any: # Need a better type for json data
|
||||
# Check if we have a valid pointer to exiftool
|
||||
# Nothing to do
|
||||
if not exiftool_path:
|
||||
which_exiftool = shutil.which("exiftool")
|
||||
if which_exiftool:
|
||||
warnings.warn(
|
||||
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
|
||||
|
||||
md = MarkItDown(exiftool_path="{which_exiftool}")
|
||||
|
||||
This warning will be removed in future releases.
|
||||
""",
|
||||
DeprecationWarning,
|
||||
)
|
||||
# Nothing to do
|
||||
return {}
|
||||
|
||||
# Run exiftool
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
import sys
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import one_extract
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = []
|
||||
|
||||
ACCEPTED_FILE_EXTENSIONS = [".one"]
|
||||
|
||||
|
||||
class OneNoteConverter(DocumentConverter):
    """
    Converts OneNote files to Markdown.

    Each notebook section is emitted under a level-1 heading and each page
    under a level-2 heading; the page content is passed through an
    HtmlConverter to obtain its Markdown.
    """

    def __init__(self):
        super().__init__()
        # Reused for every page. NOTE(review): page content is presumably
        # HTML, since it is handed to HtmlConverter -- confirm.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the extension or mimetype marks the stream as OneNote."""
        extension = (stream_info.extension or "").lower()
        mimetype = (stream_info.mimetype or "").lower()

        return extension in ACCEPTED_FILE_EXTENSIONS or any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Render every section and page of the notebook as Markdown.

        Raises MissingDependencyException (chained to the original import
        error) when the optional one_extract dependency failed to import.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            message = MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".one",
                feature="onenote",
            )
            # Re-attach the traceback captured at import time to the cause.
            cause = _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )
            raise MissingDependencyException(message) from cause

        # Perform the conversion
        pieces = []
        notebook = one_extract.Notebook(file_stream)
        for section in notebook.sections:
            pieces.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                pieces.append(f"\n\n## {page.name}\n")
                page_markdown = self._html_converter.convert_string(
                    page.content
                ).markdown.strip()
                pieces.append(page_markdown + "\n\n")

        return DocumentConverterResult(
            title=None,
            text_content="".join(pieces).strip(),
        )
|
||||
@@ -7,6 +7,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
olefile = None
|
||||
try:
|
||||
import olefile
|
||||
except ImportError:
|
||||
@@ -48,7 +49,7 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
# Brute force, check if we have an OLE file
|
||||
cur_pos = file_stream.tell()
|
||||
try:
|
||||
if not olefile.isOleFile(file_stream):
|
||||
if olefile and not olefile.isOleFile(file_stream):
|
||||
return False
|
||||
finally:
|
||||
file_stream.seek(cur_pos)
|
||||
|
||||
@@ -6,6 +6,7 @@ import re
|
||||
import html
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
from operator import attrgetter
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from ._llm_caption import llm_caption
|
||||
@@ -160,10 +161,12 @@ class PptxConverter(DocumentConverter):
|
||||
|
||||
# Group Shapes
|
||||
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
|
||||
for subshape in shape.shapes:
|
||||
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
|
||||
for subshape in sorted_shapes:
|
||||
get_shape_content(subshape, **kwargs)
|
||||
|
||||
for shape in slide.shapes:
|
||||
sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left"))
|
||||
for shape in sorted_shapes:
|
||||
get_shape_content(shape, **kwargs)
|
||||
|
||||
md_content = md_content.strip()
|
||||
|
||||
BIN
packages/markitdown/tests/test_files/test.one
vendored
BIN
packages/markitdown/tests/test_files/test.one
vendored
Binary file not shown.
@@ -7,8 +7,6 @@ import openai
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
import warnings
|
||||
|
||||
from markitdown import (
|
||||
MarkItDown,
|
||||
UnsupportedFormatException,
|
||||
@@ -517,19 +515,6 @@ def test_exceptions() -> None:
|
||||
reason="do not run if exiftool is not installed",
|
||||
)
|
||||
def test_markitdown_exiftool() -> None:
|
||||
# Test the automatic discovery of exiftool throws a warning
|
||||
# and is disabled
|
||||
try:
|
||||
warnings.simplefilter("default")
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
||||
assert len(w) == 1
|
||||
assert w[0].category is DeprecationWarning
|
||||
assert result.text_content.strip() == ""
|
||||
finally:
|
||||
warnings.resetwarnings()
|
||||
|
||||
which_exiftool = shutil.which("exiftool")
|
||||
assert which_exiftool is not None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user