1 Commits

Author SHA1 Message Date
Josh Bradley
33a0cd8efe small formatting change 2025-01-14 18:04:14 -05:00
6 changed files with 16 additions and 223 deletions

View File

@@ -17,7 +17,6 @@ RUN pip install markitdown
# Default USERID and GROUPID
ARG USERID=10000
ARG GROUPID=10000
USER $USERID:$GROUPID
ENTRYPOINT [ "markitdown" ]

View File

@@ -33,20 +33,12 @@ Or use `-o` to specify the output file:
markitdown path-to-file.pdf -o document.md
```
To use Document Intelligence conversion:
```bash
markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
```
You can also pipe content:
```bash
cat path-to-file.pdf | markitdown
```
For more information about setting up an Azure Document Intelligence resource, see [Create a Document Intelligence resource](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0).
### Python API
Basic usage in Python:
@@ -59,16 +51,6 @@ result = md.convert("test.xlsx")
print(result.text_content)
```
Document Intelligence conversion in Python:
```python
from markitdown import MarkItDown
md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
result = md.convert("test.pdf")
print(result.text_content)
```
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
```python

View File

@@ -42,8 +42,6 @@ dependencies = [
"pathvalidate",
"charset-normalizer",
"openai",
"azure-ai-documentintelligence",
"azure-identity"
]
[project.urls]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.0.1a5"
__version__ = "0.0.1a3"

View File

@@ -3,7 +3,6 @@
# SPDX-License-Identifier: MIT
import argparse
import sys
import shutil
from textwrap import dedent
from .__about__ import __version__
from ._markitdown import MarkItDown, DocumentConverterResult
@@ -52,50 +51,22 @@ def main():
help="show the version number and exit",
)
parser.add_argument("filename", nargs="?")
parser.add_argument(
"-o",
"--output",
help="Output file name. If not provided, output is written to stdout.",
)
parser.add_argument(
"-d",
"--use-docintel",
action="store_true",
help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
)
parser.add_argument(
"-e",
"--endpoint",
type=str,
help="Document Intelligence Endpoint. Required if using Document Intelligence.",
)
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
which_exiftool = shutil.which("exiftool")
if args.use_docintel:
if args.endpoint is None:
raise ValueError(
"Document Intelligence Endpoint is required when using Document Intelligence."
)
elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(
exiftool_path=which_exiftool, docintel_endpoint=args.endpoint
)
else:
markitdown = MarkItDown(exiftool_path=which_exiftool)
if args.filename is None:
markitdown = MarkItDown()
result = markitdown.convert_stream(sys.stdin.buffer)
_handle_output(args, result)
else:
markitdown = MarkItDown()
result = markitdown.convert(args.filename)
_handle_output(args, result)
_handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult):

View File

@@ -33,22 +33,6 @@ import requests
from bs4 import BeautifulSoup
from charset_normalizer import from_path
# Azure imports
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
AnalyzeDocumentRequest,
AnalyzeResult,
DocumentAnalysisFeature,
)
from azure.identity import DefaultAzureCredential
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved.
CONTENT_FORMAT = "markdown"
# Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv")
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
@@ -91,14 +75,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
# Explicitly cast options to the expected type if necessary
super().__init__(**options)
def convert_hn(
self,
n: int,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
"""Same as usual, but be sure to start with a new line"""
if not convert_as_inline:
if not re.search(r"^\n", text):
@@ -106,9 +83,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
def convert_a(
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
):
def convert_a(self, el: Any, text: str, convert_as_inline: bool):
"""Same as usual converter, but removes Javascript links and escapes URIs."""
prefix, suffix, text = markdownify.chomp(text) # type: ignore
if not text:
@@ -144,9 +119,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
else text
)
def convert_img(
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
) -> str:
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
"""Same as usual converter, but removes data URIs"""
alt = el.attrs.get("alt", None) or ""
@@ -231,7 +204,7 @@ class HtmlConverter(DocumentConverter):
return result
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
"""Helper function that converts an HTML string."""
"""Helper function that converts and HTML string."""
# Parse the string
soup = BeautifulSoup(html_content, "html.parser")
@@ -250,9 +223,6 @@ class HtmlConverter(DocumentConverter):
assert isinstance(webpage_text, str)
# remove leading and trailing \n
webpage_text = webpage_text.strip()
return DocumentConverterResult(
title=None if soup.title is None else soup.title.string,
text_content=webpage_text,
@@ -801,35 +771,6 @@ class PptxConverter(HtmlConverter):
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
"""
def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None
):
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed alt text for this image with less than 50 words."
image_base64 = base64.b64encode(image_blob).decode("utf-8")
data_uri = f"data:{content_type};base64,{image_base64}"
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": data_uri,
},
},
{"type": "text", "text": prompt},
],
}
]
response = llm_client.chat.completions.create(
model=llm_model, messages=messages
)
return response.choices[0].message.content
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PPTX
extension = kwargs.get("file_extension", "")
@@ -850,38 +791,17 @@ class PptxConverter(HtmlConverter):
# Pictures
if self._is_picture(shape):
# https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
llm_description = None
alt_text = None
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
try:
llm_description = self._get_llm_description(
llm_client,
llm_model,
shape.image.blob,
shape.image.content_type,
)
except Exception:
# Unable to describe with LLM
pass
if not llm_description:
try:
alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
"descr", ""
)
except Exception:
# Unable to get alt text
pass
alt_text = ""
try:
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
except Exception:
pass
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += (
"\n!["
+ (llm_description or alt_text or shape.name)
+ (alt_text if alt_text else shape.name)
+ "]("
+ filename
+ ")\n"
@@ -1398,74 +1318,6 @@ class ZipConverter(DocumentConverter):
)
class DocumentIntelligenceConverter(DocumentConverter):
    """DocumentConverter that extracts text through Azure Document Intelligence.

    A ``DocumentIntelligenceClient`` is created once at construction time and
    reused for every ``convert`` call.
    """

    def __init__(
        self,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
        """Create the Document Intelligence client.

        Args:
            endpoint: Document Intelligence endpoint passed to the client.
            api_version: Service API version passed to the client.
        """
        self.endpoint = endpoint
        self.api_version = api_version
        # Authentication is delegated to DefaultAzureCredential.
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=endpoint,
            api_version=api_version,
            credential=DefaultAzureCredential(),
        )

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Analyze a local file with the ``prebuilt-layout`` model.

        Args:
            local_path: Path of the file whose bytes are sent for analysis.
            **kwargs: Only ``file_extension`` is read; it decides whether the
                file is handled and which analysis features are requested.

        Returns:
            ``None`` when the extension is not in the supported list;
            otherwise a ``DocumentConverterResult`` with ``title=None`` and the
            returned content with ``<!-- ... -->`` comments removed.
        """
        ext = kwargs.get("file_extension", "").lower()

        # Bail out early on extensions this converter does not handle.
        handled_extensions = (
            ".pdf",
            ".docx",
            ".xlsx",
            ".pptx",
            ".html",
            ".jpeg",
            ".jpg",
            ".png",
            ".bmp",
            ".tiff",
            ".heif",
        )
        if ext not in handled_extensions:
            return None

        # The service receives the document as raw bytes.
        with open(local_path, "rb") as source:
            payload = source.read()

        # Certain document analysis features are not available for some
        # file types (.xlsx, .pptx, .html), so none are requested for those.
        analysis_features = (
            []
            if ext in (".xlsx", ".pptx", ".html")
            else [
                DocumentAnalysisFeature.FORMULAS,  # formula extraction
                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # high resolution OCR
                DocumentAnalysisFeature.STYLE_FONT,  # font style extraction
            ]
        )

        # TODO: replace CONTENT_FORMAT with "ContentFormat.MARKDOWN" when the
        # SDK import bug is fixed.
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=AnalyzeDocumentRequest(bytes_source=payload),
            features=analysis_features,
            output_content_format=CONTENT_FORMAT,
        )
        analysis: AnalyzeResult = poller.result()

        # Drop the comments that Document Intelligence embeds in its output.
        cleaned = re.sub(r"<!--.*?-->", "", analysis.content, flags=re.DOTALL)
        return DocumentConverterResult(
            title=None,
            text_content=cleaned,
        )
class FileConversionException(Exception):
    """Error type for file-conversion failures.

    Derives from ``Exception`` rather than ``BaseException`` so that ordinary
    ``except Exception`` handlers can catch it; ``BaseException`` is meant for
    interpreter-level exits such as ``KeyboardInterrupt`` and ``SystemExit``.
    Handlers that name this class or ``BaseException`` are unaffected.
    """
@@ -1485,7 +1337,6 @@ class MarkItDown:
llm_model: Optional[str] = None,
style_map: Optional[str] = None,
exiftool_path: Optional[str] = None,
docintel_endpoint: Optional[str] = None,
# Deprecated
mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None,
@@ -1555,12 +1406,6 @@ class MarkItDown:
self.register_page_converter(ZipConverter())
self.register_page_converter(OutlookMsgConverter())
# Register Document Intelligence converter at the top of the stack if endpoint is provided
if docintel_endpoint is not None:
self.register_page_converter(
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
)
def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs
@@ -1763,8 +1608,6 @@ class MarkItDown:
ext = ext.strip()
if ext == "":
return
if ext in extensions:
return
# if ext not in extensions:
extensions.append(ext)