Added support to use Pathlib (#93)

* Add support for Path objects in MarkItDown conversion methods

* Remove unnecessary blank line in test_markitdown_exiftool function

* Remove unnecessary blank line in test_markitdown_exiftool function

* remove pathlib path in test file

---------

Co-authored-by: afourney <adamfo@microsoft.com>
Co-authored-by: gagb <gagb@users.noreply.github.com>
This commit is contained in:
SigireddyBalasai
2024-12-21 03:42:48 +05:30
committed by GitHub
parent 7e6c36c5d4
commit 5276616ba1

View File

@@ -15,6 +15,7 @@ import traceback
import zipfile
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings
@@ -1286,11 +1287,11 @@ class MarkItDown:
self.register_page_converter(ZipConverter())
def convert(
self, source: Union[str, requests.Response], **kwargs: Any
self, source: Union[str, requests.Response, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs
"""
Args:
- source: can be a string representing a path or url, or a requests.response object
- source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
"""
@@ -1307,10 +1308,14 @@ class MarkItDown:
# Request response
elif isinstance(source, requests.Response):
return self.convert_response(source, **kwargs)
elif isinstance(source, Path):
return self.convert_local(source, **kwargs)
def convert_local(
self, path: str, **kwargs: Any
self, path: Union[str, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs
if isinstance(path, Path):
path = str(path)
# Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else []