Added support to use Pathlib (#93)
* Add support for Path objects in MarkItDown conversion methods
* Remove unnecessary blank line in test_markitdown_exiftool function
* Remove unnecessary blank line in test_markitdown_exiftool function
* Remove pathlib path in test file

---------

Co-authored-by: afourney <adamfo@microsoft.com>
Co-authored-by: gagb <gagb@users.noreply.github.com>
This commit is contained in:
@@ -15,6 +15,7 @@ import traceback
|
||||
import zipfile
|
||||
from xml.dom import minidom
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from pathlib import Path
|
||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||
from warnings import warn, resetwarnings, catch_warnings
|
||||
|
||||
@@ -1286,11 +1287,11 @@ class MarkItDown:
|
||||
self.register_page_converter(ZipConverter())
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response], **kwargs: Any
|
||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||
"""
|
||||
Args:
|
||||
- source: can be a string representing a path or url, or a requests.response object
|
||||
- source: can be a local path (given either as a string or as a pathlib.Path object), a url string, or a requests.Response object
|
||||
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
||||
"""
|
||||
|
||||
@@ -1307,10 +1308,14 @@ class MarkItDown:
|
||||
# Request response
|
||||
elif isinstance(source, requests.Response):
|
||||
return self.convert_response(source, **kwargs)
|
||||
elif isinstance(source, Path):
|
||||
return self.convert_local(source, **kwargs)
|
||||
|
||||
def convert_local(
|
||||
self, path: str, **kwargs: Any
|
||||
self, path: Union[str, Path], **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||
if isinstance(path, Path):
|
||||
path = str(path)
|
||||
# Prepare a list of extensions to try (in order of priority)
|
||||
ext = kwargs.get("file_extension")
|
||||
extensions = [ext] if ext is not None else []
|
||||
|
||||
Reference in New Issue
Block a user