Added support to use Pathlib (#93)
* Add support for Path objects in MarkItDown conversion methods
* Remove unnecessary blank line in test_markitdown_exiftool function
* Remove unnecessary blank line in test_markitdown_exiftool function
* Remove pathlib path in test file

---------

Co-authored-by: afourney <adamfo@microsoft.com>
Co-authored-by: gagb <gagb@users.noreply.github.com>
This commit is contained in:
@@ -15,6 +15,7 @@ import traceback
|
|||||||
import zipfile
|
import zipfile
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
from pathlib import Path
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||||
from warnings import warn, resetwarnings, catch_warnings
|
from warnings import warn, resetwarnings, catch_warnings
|
||||||
|
|
||||||
@@ -1286,11 +1287,11 @@ class MarkItDown:
|
|||||||
self.register_page_converter(ZipConverter())
|
self.register_page_converter(ZipConverter())
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response], **kwargs: Any
|
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
- source: can be a string representing a path or url, or a requests.response object
|
- source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
|
||||||
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -1307,10 +1308,14 @@ class MarkItDown:
|
|||||||
# Request response
|
# Request response
|
||||||
elif isinstance(source, requests.Response):
|
elif isinstance(source, requests.Response):
|
||||||
return self.convert_response(source, **kwargs)
|
return self.convert_response(source, **kwargs)
|
||||||
|
elif isinstance(source, Path):
|
||||||
|
return self.convert_local(source, **kwargs)
|
||||||
|
|
||||||
def convert_local(
|
def convert_local(
|
||||||
self, path: str, **kwargs: Any
|
self, path: Union[str, Path], **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
|
if isinstance(path, Path):
|
||||||
|
path = str(path)
|
||||||
# Prepare a list of extensions to try (in order of priority)
|
# Prepare a list of extensions to try (in order of priority)
|
||||||
ext = kwargs.get("file_extension")
|
ext = kwargs.get("file_extension")
|
||||||
extensions = [ext] if ext is not None else []
|
extensions = [ext] if ext is not None else []
|
||||||
|
|||||||
Reference in New Issue
Block a user