Added support for pathlib Path objects (#93)

* Add support for Path objects in MarkItDown conversion methods

* Remove unnecessary blank line in test_markitdown_exiftool function

* Remove unnecessary blank line in test_markitdown_exiftool function

* Remove pathlib path in test file

---------

Co-authored-by: afourney <adamfo@microsoft.com>
Co-authored-by: gagb <gagb@users.noreply.github.com>
This commit is contained in:
SigireddyBalasai
2024-12-21 03:42:48 +05:30
committed by GitHub
parent 7e6c36c5d4
commit 5276616ba1

View File

@@ -15,6 +15,7 @@ import traceback
import zipfile import zipfile
from xml.dom import minidom from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings from warnings import warn, resetwarnings, catch_warnings
@@ -1286,11 +1287,11 @@ class MarkItDown:
self.register_page_converter(ZipConverter()) self.register_page_converter(ZipConverter())
def convert( def convert(
self, source: Union[str, requests.Response], **kwargs: Any self, source: Union[str, requests.Response, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
""" """
Args: Args:
- source: can be a string representing a path or url, or a requests.response object - source: can be a path (given either as a string or as a pathlib.Path object), a url string, or a requests.Response object
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
""" """
@@ -1307,10 +1308,14 @@ class MarkItDown:
# Request response # Request response
elif isinstance(source, requests.Response): elif isinstance(source, requests.Response):
return self.convert_response(source, **kwargs) return self.convert_response(source, **kwargs)
elif isinstance(source, Path):
return self.convert_local(source, **kwargs)
def convert_local( def convert_local(
self, path: str, **kwargs: Any self, path: Union[str, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
if isinstance(path, Path):
path = str(path)
# Prepare a list of extensions to try (in order of priority) # Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension") ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else [] extensions = [ext] if ext is not None else []