modify ext guesser

This commit is contained in:
Kenny Zhang
2025-02-20 16:47:37 -05:00
parent 395ce2d301
commit b8b3897952

View File

@@ -10,7 +10,7 @@ from typing import Any, List, Optional, Union
from pathlib import Path from pathlib import Path
from urllib.parse import urlparse from urllib.parse import urlparse
from warnings import warn from warnings import warn
from io import BufferedIOBase, TextIOBase from io import BufferedIOBase, TextIOBase, BytesIO
# File-format detection # File-format detection
import puremagic import puremagic
@@ -416,7 +416,7 @@ class MarkItDown:
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes.""" """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
# Use puremagic to guess # Use puremagic to guess
try: try:
guesses = None guesses = []
# Guess extensions for filepaths # Guess extensions for filepaths
if isinstance(source, str): if isinstance(source, str):
@@ -440,8 +440,9 @@ class MarkItDown:
except puremagic.main.PureError: except puremagic.main.PureError:
pass pass
# Guess extensions for file objects # Guess extensions for file objects. Note that puremagic's magic_stream function requires a BytesIO-like file source
elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase): # TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not currently work)
elif isinstance(source, BufferedIOBase):
guesses = puremagic.magic_stream(source) guesses = puremagic.magic_stream(source)
extensions = list() extensions = list()