@@ -49,6 +49,9 @@ Source = "https://github.com/microsoft/markitdown"
|
|||||||
[tool.hatch.version]
|
[tool.hatch.version]
|
||||||
path = "src/markitdown/__about__.py"
|
path = "src/markitdown/__about__.py"
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
markitdown = "markitdown.__main__:main"
|
||||||
|
|
||||||
[tool.hatch.envs.types]
|
[tool.hatch.envs.types]
|
||||||
extra-dependencies = [
|
extra-dependencies = [
|
||||||
"mypy>=1.0.0",
|
"mypy>=1.0.0",
|
||||||
|
|||||||
42
src/markitdown/__main__.py
Normal file
42
src/markitdown/__main__.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: MIT
|
||||||
|
import sys
|
||||||
|
from ._markitdown import MarkItDown
|
||||||
|
|
||||||
|
|
||||||
|
# Usage text shown when the command is invoked with the wrong number of
# arguments. Built once at import time; always ends with a single newline.
_USAGE = (
    """
SYNTAX:

    markitdown <OPTIONAL: FILENAME>
    If FILENAME is empty, markitdown reads from stdin.

EXAMPLE:

    markitdown example.pdf

    OR

    cat example.pdf | markitdown

    OR

    markitdown < example.pdf
""".strip()
    + "\n"
)


def main():
    """Command-line entry point: convert one input and print its text content.

    The input is chosen from ``sys.argv``:

    * no argument   -> the binary stdin stream is passed to ``convert_stream``;
    * one argument  -> that argument is passed to ``convert``;
    * anything else -> the usage text is written to stderr.

    Returns:
        int: ``0`` after a successful conversion, ``2`` when the argument
        count is wrong (the same status argparse uses for usage errors).
        The value is meant to be handed to ``sys.exit`` so that a bad
        invocation is visible to the calling shell instead of exiting 0.
    """
    argc = len(sys.argv)

    # Guard clause: reject bad invocations before constructing a converter.
    if argc not in (1, 2):
        sys.stderr.write(_USAGE)
        return 2

    markitdown = MarkItDown()
    if argc == 1:
        # Read raw bytes from stdin; the converter receives the binary buffer.
        result = markitdown.convert_stream(sys.stdin.buffer)
    else:
        result = markitdown.convert(sys.argv[1])

    print(result.text_content)
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so `python -m markitdown` reports usage errors.
    sys.exit(main())
|
||||||
@@ -11,6 +11,7 @@ import shutil
|
|||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import traceback
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||||
|
|
||||||
@@ -913,7 +914,9 @@ class MarkItDown:
|
|||||||
# Get extension alternatives from the path and puremagic
|
# Get extension alternatives from the path and puremagic
|
||||||
base, ext = os.path.splitext(path)
|
base, ext = os.path.splitext(path)
|
||||||
self._append_ext(extensions, ext)
|
self._append_ext(extensions, ext)
|
||||||
self._append_ext(extensions, self._guess_ext_magic(path))
|
|
||||||
|
for g in self._guess_ext_magic(path):
|
||||||
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
return self._convert(path, extensions, **kwargs)
|
return self._convert(path, extensions, **kwargs)
|
||||||
@@ -940,7 +943,8 @@ class MarkItDown:
|
|||||||
fh.close()
|
fh.close()
|
||||||
|
|
||||||
# Use puremagic to check for more extension options
|
# Use puremagic to check for more extension options
|
||||||
self._append_ext(extensions, self._guess_ext_magic(temp_path))
|
for g in self._guess_ext_magic(temp_path):
|
||||||
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
result = self._convert(temp_path, extensions, **kwargs)
|
result = self._convert(temp_path, extensions, **kwargs)
|
||||||
@@ -1032,10 +1036,10 @@ class MarkItDown:
|
|||||||
_kwargs["mlm_model"] = self._mlm_model
|
_kwargs["mlm_model"] = self._mlm_model
|
||||||
|
|
||||||
# If we hit an error log it and keep trying
|
# If we hit an error log it and keep trying
|
||||||
# try:
|
try:
|
||||||
res = converter.convert(local_path, **_kwargs)
|
res = converter.convert(local_path, **_kwargs)
|
||||||
# except Exception:
|
except Exception:
|
||||||
# error_trace = ("\n\n" + traceback.format_exc()).strip()
|
error_trace = ("\n\n" + traceback.format_exc()).strip()
|
||||||
|
|
||||||
if res is not None:
|
if res is not None:
|
||||||
# Normalize the content
|
# Normalize the content
|
||||||
@@ -1074,10 +1078,15 @@ class MarkItDown:
|
|||||||
# Use puremagic to guess
|
# Use puremagic to guess
|
||||||
try:
|
try:
|
||||||
guesses = puremagic.magic_file(path)
|
guesses = puremagic.magic_file(path)
|
||||||
if len(guesses) > 0:
|
extensions = list()
|
||||||
ext = guesses[0].extension.strip()
|
for g in guesses:
|
||||||
|
ext = g.extension.strip()
|
||||||
if len(ext) > 0:
|
if len(ext) > 0:
|
||||||
return ext
|
if not ext.startswith("."):
|
||||||
|
ext = "." + ext
|
||||||
|
if ext not in extensions:
|
||||||
|
extensions.append(ext)
|
||||||
|
return extensions
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
pass
|
pass
|
||||||
except IsADirectoryError:
|
except IsADirectoryError:
|
||||||
|
|||||||
Reference in New Issue
Block a user