added conversion path for file object in central class
This commit is contained in:
@@ -10,6 +10,7 @@ from typing import Any, List, Optional, Union
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
|
from io import BufferedIOBase, TextIOBase
|
||||||
|
|
||||||
# File-format detection
|
# File-format detection
|
||||||
import puremagic
|
import puremagic
|
||||||
@@ -174,11 +175,11 @@ class MarkItDown:
|
|||||||
warn("Plugins converters are already enabled.", RuntimeWarning)
|
warn("Plugins converters are already enabled.", RuntimeWarning)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
self, source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase], **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
- source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
|
- source: can be a string representing a path either as string pathlib path object or url, a requests.response object, or a file object (TextIO or BinaryIO)
|
||||||
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -211,7 +212,7 @@ class MarkItDown:
|
|||||||
base, ext = os.path.splitext(path)
|
base, ext = os.path.splitext(path)
|
||||||
self._append_ext(extensions, ext)
|
self._append_ext(extensions, ext)
|
||||||
|
|
||||||
for g in self._guess_ext_magic(path):
|
for g in self._guess_ext_magic(source=path):
|
||||||
self._append_ext(extensions, g)
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
# Create the ConverterInput object
|
# Create the ConverterInput object
|
||||||
@@ -220,6 +221,23 @@ class MarkItDown:
|
|||||||
# Convert
|
# Convert
|
||||||
return self._convert(input, extensions, **kwargs)
|
return self._convert(input, extensions, **kwargs)
|
||||||
|
|
||||||
|
def convert_file_object(
    self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
) -> DocumentConverterResult:  # TODO: deal with kwargs
    """
    Convert an already-open file object.

    Args:
        - file_object: a binary (BufferedIOBase) or text (TextIOBase) file object to convert
        - file_extension (keyword, optional): extension to try first; when absent or None,
          only the extensions guessed by self._guess_ext_magic are tried

    Returns:
        Whatever self._convert returns for the prepared input and extension list.
    """
    # Candidate extensions in priority order: the caller's hint (if any) comes first.
    candidate_exts = []
    hinted_ext = kwargs.get("file_extension")
    if hinted_ext is not None:
        candidate_exts.append(hinted_ext)

    # Add whatever the magic-based guesser proposes for this file object.
    for guess in self._guess_ext_magic(source=file_object):
        self._append_ext(candidate_exts, guess)

    # Wrap the file object for the converters, then run the conversion.
    converter_input = ConverterInput(input_type="object", file_object=file_object)
    return self._convert(converter_input, candidate_exts, **kwargs)
|
||||||
|
|
||||||
# TODO what should stream's type be?
|
# TODO what should stream's type be?
|
||||||
def convert_stream(
|
def convert_stream(
|
||||||
self, stream: Any, **kwargs: Any
|
self, stream: Any, **kwargs: Any
|
||||||
@@ -242,7 +260,7 @@ class MarkItDown:
|
|||||||
fh.close()
|
fh.close()
|
||||||
|
|
||||||
# Use puremagic to check for more extension options
|
# Use puremagic to check for more extension options
|
||||||
for g in self._guess_ext_magic(temp_path):
|
for g in self._guess_ext_magic(source=temp_path):
|
||||||
self._append_ext(extensions, g)
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
# Create the ConverterInput object
|
# Create the ConverterInput object
|
||||||
@@ -301,7 +319,7 @@ class MarkItDown:
|
|||||||
fh.close()
|
fh.close()
|
||||||
|
|
||||||
# Use puremagic to check for more extension options
|
# Use puremagic to check for more extension options
|
||||||
for g in self._guess_ext_magic(temp_path):
|
for g in self._guess_ext_magic(source=temp_path):
|
||||||
self._append_ext(extensions, g)
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
# Create the ConverterInput object
|
# Create the ConverterInput object
|
||||||
@@ -393,18 +411,22 @@ class MarkItDown:
|
|||||||
# if ext not in extensions:
|
# if ext not in extensions:
|
||||||
extensions.append(ext)
|
extensions.append(ext)
|
||||||
|
|
||||||
def _guess_ext_magic(self, path):
|
def _guess_ext_magic(self, source):
|
||||||
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
|
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
|
||||||
# Use puremagic to guess
|
# Use puremagic to guess
|
||||||
try:
|
try:
|
||||||
guesses = puremagic.magic_file(path)
|
guesses = None
|
||||||
|
|
||||||
|
# Guess extensions for filepaths
|
||||||
|
if isinstance(source, str):
|
||||||
|
guesses = puremagic.magic_file(source)
|
||||||
|
|
||||||
# Fix for: https://github.com/microsoft/markitdown/issues/222
|
# Fix for: https://github.com/microsoft/markitdown/issues/222
|
||||||
# If there are no guesses, then try again after trimming leading ASCII whitespaces.
|
# If there are no guesses, then try again after trimming leading ASCII whitespaces.
|
||||||
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
|
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
|
||||||
# (space, tab, newline, carriage return, vertical tab, form feed).
|
# (space, tab, newline, carriage return, vertical tab, form feed).
|
||||||
if len(guesses) == 0:
|
if len(guesses) == 0:
|
||||||
with open(path, "rb") as file:
|
with open(source, "rb") as file:
|
||||||
while True:
|
while True:
|
||||||
char = file.read(1)
|
char = file.read(1)
|
||||||
if not char: # End of file
|
if not char: # End of file
|
||||||
@@ -417,6 +439,10 @@ class MarkItDown:
|
|||||||
except puremagic.main.PureError:
|
except puremagic.main.PureError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Guess extensions for file objects
|
||||||
|
elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase):
|
||||||
|
guesses = puremagic.magic_stream(source)
|
||||||
|
|
||||||
extensions = list()
|
extensions = list()
|
||||||
for g in guesses:
|
for g in guesses:
|
||||||
ext = g.extension.strip()
|
ext = g.extension.strip()
|
||||||
|
|||||||
Reference in New Issue
Block a user