close file object after using

This commit is contained in:
Kenny Zhang
2025-02-20 13:54:51 -05:00
parent 808401a331
commit 395ce2d301
13 changed files with 23 additions and 4 deletions

View File

@@ -182,7 +182,6 @@ class MarkItDown:
- source: can be a string representing a local path or a URL, a pathlib.Path object, a requests.Response object, or a file object (TextIO or BinaryIO)
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
"""
# Local path or url
if isinstance(source, str):
if (
@@ -198,6 +197,9 @@ class MarkItDown:
return self.convert_response(source, **kwargs)
elif isinstance(source, Path):
return self.convert_local(source, **kwargs)
# File object
elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase):
return self.convert_file_object(source, **kwargs)
def convert_local(
self, path: Union[str, Path], **kwargs: Any
@@ -341,7 +343,6 @@ class MarkItDown:
self, input: ConverterInput, extensions: List[Union[str, None]], **kwargs
) -> DocumentConverterResult:
error_trace = ""
# Create a copy of the page_converters list, sorted by priority.
# We do this with each call to _convert because the priority of converters may change between calls.
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
@@ -442,7 +443,7 @@ class MarkItDown:
# Guess extensions for file objects
elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase):
guesses = puremagic.magic_stream(source)
extensions = list()
for g in guesses:
ext = g.extension.strip()

View File

@@ -39,6 +39,7 @@ class BingSerpConverter(DocumentConverter):
soup = None
file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(file_obj.read(), "html.parser")
file_obj.close()
# Clean up some formatting
for tptt in soup.find_all(class_="tptt"):

View File

@@ -63,6 +63,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
# Get the bytestring from the converter input
file_obj = input.read_file(mode='rb')
file_bytes = file_obj.read()
file_obj.close()
# Certain document analysis features are not available for office file types (.xlsx, .pptx, .html, .docx)
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:

View File

@@ -31,6 +31,7 @@ class DocxConverter(HtmlConverter):
style_map = kwargs.get("style_map", None)
file_obj = input.read_file(mode="rb")
result = mammoth.convert_to_html(file_obj, style_map=style_map)
file_obj.close()
html_content = result.value
result = self._convert(html_content)

View File

@@ -25,6 +25,7 @@ class HtmlConverter(DocumentConverter):
result = None
file_obj = input.read_file(mode="rt", encoding="utf-8")
result = self._convert(file_obj.read())
file_obj.close()
return result

View File

@@ -73,6 +73,7 @@ class ImageConverter(MediaConverter):
content_type = "image/jpeg"
image_file = input.read_file(mode="rb")
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
image_file.close()
data_uri = f"data:{content_type};base64,{image_base64}"
messages = [

View File

@@ -30,6 +30,7 @@ class IpynbConverter(DocumentConverter):
result = None
file_obj = input.read_file(mode="rt", encoding="utf-8")
notebook_content = json.load(file_obj)
file_obj.close()
result = self._convert(notebook_content)
return result

View File

@@ -26,8 +26,9 @@ class OutlookMsgConverter(DocumentConverter):
return None
try:
file_obj = input.read_file(mode="rt", encoding="utf-8")
file_obj = input.read_file(mode="rb")
msg = olefile.OleFileIO(file_obj)
# Extract email metadata
md_content = "# Email Message\n\n"
@@ -51,6 +52,7 @@ class OutlookMsgConverter(DocumentConverter):
md_content += body
msg.close()
file_obj.close()
return DocumentConverterResult(
title=headers.get("Subject"), text_content=md_content.strip()

View File

@@ -25,6 +25,8 @@ class PdfConverter(DocumentConverter):
output = StringIO()
file_obj = input.read_file(mode="rb")
pdfminer.high_level.extract_text_to_fp(file_obj, output)
file_obj.close()
return DocumentConverterResult(
title=None,
text_content=output.getvalue(),

View File

@@ -59,6 +59,8 @@ class PptxConverter(HtmlConverter):
file_obj = input.read_file(mode="rb")
presentation = pptx.Presentation(file_obj)
file_obj.close()
slide_num = 0
for slide in presentation.slides:
slide_num += 1

View File

@@ -31,6 +31,7 @@ class WikipediaConverter(DocumentConverter):
soup = None
file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(file_obj.read(), "html.parser")
file_obj.close()
# Remove javascript and style blocks
for script in soup(["script", "style"]):

View File

@@ -25,6 +25,8 @@ class XlsxConverter(HtmlConverter):
file_obj = input.read_file(mode="rb")
sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
file_obj.close()
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
@@ -50,6 +52,8 @@ class XlsConverter(HtmlConverter):
file_obj = input.read_file(mode="rb")
sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
file_obj.close()
md_content = ""
for s in sheets:
md_content += f"## {s}\n"

View File

@@ -41,6 +41,7 @@ class YouTubeConverter(DocumentConverter):
soup = None
file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(file_obj.read(), "html.parser")
file_obj.close()
# Read the meta tags
assert soup.title is not None and soup.title.string is not None