Close file objects after use
This commit is contained in:
@@ -182,7 +182,6 @@ class MarkItDown:
|
||||
- source: can be a local path or URL given as a string, a pathlib.Path object, a requests.Response object, or a file object (TextIO or BinaryIO)
|
||||
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
||||
"""
|
||||
|
||||
# Local path or url
|
||||
if isinstance(source, str):
|
||||
if (
|
||||
@@ -198,6 +197,9 @@ class MarkItDown:
|
||||
return self.convert_response(source, **kwargs)
|
||||
elif isinstance(source, Path):
|
||||
return self.convert_local(source, **kwargs)
|
||||
# File object
|
||||
elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase):
|
||||
return self.convert_file_object(source, **kwargs)
|
||||
|
||||
def convert_local(
|
||||
self, path: Union[str, Path], **kwargs: Any
|
||||
@@ -341,7 +343,6 @@ class MarkItDown:
|
||||
self, input: ConverterInput, extensions: List[Union[str, None]], **kwargs
|
||||
) -> DocumentConverterResult:
|
||||
error_trace = ""
|
||||
|
||||
# Create a copy of the page_converters list, sorted by priority.
|
||||
# We do this with each call to _convert because the priority of converters may change between calls.
|
||||
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
||||
@@ -442,7 +443,7 @@ class MarkItDown:
|
||||
# Guess extensions for file objects
|
||||
elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase):
|
||||
guesses = puremagic.magic_stream(source)
|
||||
|
||||
|
||||
extensions = list()
|
||||
for g in guesses:
|
||||
ext = g.extension.strip()
|
||||
|
||||
@@ -39,6 +39,7 @@ class BingSerpConverter(DocumentConverter):
|
||||
soup = None
|
||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||
file_obj.close()
|
||||
|
||||
# Clean up some formatting
|
||||
for tptt in soup.find_all(class_="tptt"):
|
||||
|
||||
@@ -63,6 +63,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||
# Get the bytestring from the converter input
|
||||
file_obj = input.read_file(mode='rb')
|
||||
file_bytes = file_obj.read()
|
||||
file_obj.close()
|
||||
|
||||
# Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
|
||||
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
|
||||
|
||||
@@ -31,6 +31,7 @@ class DocxConverter(HtmlConverter):
|
||||
style_map = kwargs.get("style_map", None)
|
||||
file_obj = input.read_file(mode="rb")
|
||||
result = mammoth.convert_to_html(file_obj, style_map=style_map)
|
||||
file_obj.close()
|
||||
html_content = result.value
|
||||
result = self._convert(html_content)
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@ class HtmlConverter(DocumentConverter):
|
||||
result = None
|
||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||
result = self._convert(file_obj.read())
|
||||
file_obj.close()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@@ -73,6 +73,7 @@ class ImageConverter(MediaConverter):
|
||||
content_type = "image/jpeg"
|
||||
image_file = input.read_file(mode="rb")
|
||||
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
|
||||
image_file.close()
|
||||
data_uri = f"data:{content_type};base64,{image_base64}"
|
||||
|
||||
messages = [
|
||||
|
||||
@@ -30,6 +30,7 @@ class IpynbConverter(DocumentConverter):
|
||||
result = None
|
||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||
notebook_content = json.load(file_obj)
|
||||
file_obj.close()
|
||||
result = self._convert(notebook_content)
|
||||
|
||||
return result
|
||||
|
||||
@@ -26,8 +26,9 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
return None
|
||||
|
||||
try:
|
||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||
file_obj = input.read_file(mode="rb")
|
||||
msg = olefile.OleFileIO(file_obj)
|
||||
|
||||
# Extract email metadata
|
||||
md_content = "# Email Message\n\n"
|
||||
|
||||
@@ -51,6 +52,7 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
md_content += body
|
||||
|
||||
msg.close()
|
||||
file_obj.close()
|
||||
|
||||
return DocumentConverterResult(
|
||||
title=headers.get("Subject"), text_content=md_content.strip()
|
||||
|
||||
@@ -25,6 +25,8 @@ class PdfConverter(DocumentConverter):
|
||||
output = StringIO()
|
||||
file_obj = input.read_file(mode="rb")
|
||||
pdfminer.high_level.extract_text_to_fp(file_obj, output)
|
||||
file_obj.close()
|
||||
|
||||
return DocumentConverterResult(
|
||||
title=None,
|
||||
text_content=output.getvalue(),
|
||||
|
||||
@@ -59,6 +59,8 @@ class PptxConverter(HtmlConverter):
|
||||
|
||||
file_obj = input.read_file(mode="rb")
|
||||
presentation = pptx.Presentation(file_obj)
|
||||
file_obj.close()
|
||||
|
||||
slide_num = 0
|
||||
for slide in presentation.slides:
|
||||
slide_num += 1
|
||||
|
||||
@@ -31,6 +31,7 @@ class WikipediaConverter(DocumentConverter):
|
||||
soup = None
|
||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||
file_obj.close()
|
||||
|
||||
# Remove javascript and style blocks
|
||||
for script in soup(["script", "style"]):
|
||||
|
||||
@@ -25,6 +25,8 @@ class XlsxConverter(HtmlConverter):
|
||||
|
||||
file_obj = input.read_file(mode="rb")
|
||||
sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
|
||||
file_obj.close()
|
||||
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
md_content += f"## {s}\n"
|
||||
@@ -50,6 +52,8 @@ class XlsConverter(HtmlConverter):
|
||||
|
||||
file_obj = input.read_file(mode="rb")
|
||||
sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
|
||||
file_obj.close()
|
||||
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
md_content += f"## {s}\n"
|
||||
|
||||
@@ -41,6 +41,7 @@ class YouTubeConverter(DocumentConverter):
|
||||
soup = None
|
||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||
file_obj.close()
|
||||
|
||||
# Read the meta tags
|
||||
assert soup.title is not None and soup.title.string is not None
|
||||
|
||||
Reference in New Issue
Block a user