close file object after using
This commit is contained in:
@@ -182,7 +182,6 @@ class MarkItDown:
|
|||||||
- source: can be a path (given as a string or a pathlib.Path object), a URL string, a requests.Response object, or a file object (TextIO or BinaryIO)
|
- source: can be a path (given as a string or a pathlib.Path object), a URL string, a requests.Response object, or a file object (TextIO or BinaryIO)
|
||||||
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Local path or url
|
# Local path or url
|
||||||
if isinstance(source, str):
|
if isinstance(source, str):
|
||||||
if (
|
if (
|
||||||
@@ -198,6 +197,9 @@ class MarkItDown:
|
|||||||
return self.convert_response(source, **kwargs)
|
return self.convert_response(source, **kwargs)
|
||||||
elif isinstance(source, Path):
|
elif isinstance(source, Path):
|
||||||
return self.convert_local(source, **kwargs)
|
return self.convert_local(source, **kwargs)
|
||||||
|
# File object
|
||||||
|
elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase):
|
||||||
|
return self.convert_file_object(source, **kwargs)
|
||||||
|
|
||||||
def convert_local(
|
def convert_local(
|
||||||
self, path: Union[str, Path], **kwargs: Any
|
self, path: Union[str, Path], **kwargs: Any
|
||||||
@@ -341,7 +343,6 @@ class MarkItDown:
|
|||||||
self, input: ConverterInput, extensions: List[Union[str, None]], **kwargs
|
self, input: ConverterInput, extensions: List[Union[str, None]], **kwargs
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
error_trace = ""
|
error_trace = ""
|
||||||
|
|
||||||
# Create a copy of the page_converters list, sorted by priority.
|
# Create a copy of the page_converters list, sorted by priority.
|
||||||
# We do this with each call to _convert because the priority of converters may change between calls.
|
# We do this with each call to _convert because the priority of converters may change between calls.
|
||||||
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ class BingSerpConverter(DocumentConverter):
|
|||||||
soup = None
|
soup = None
|
||||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
# Clean up some formatting
|
# Clean up some formatting
|
||||||
for tptt in soup.find_all(class_="tptt"):
|
for tptt in soup.find_all(class_="tptt"):
|
||||||
|
|||||||
@@ -63,6 +63,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||||||
# Get the bytestring from the converter input
|
# Get the bytestring from the converter input
|
||||||
file_obj = input.read_file(mode='rb')
|
file_obj = input.read_file(mode='rb')
|
||||||
file_bytes = file_obj.read()
|
file_bytes = file_obj.read()
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
# Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
|
# Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
|
||||||
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
|
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ class DocxConverter(HtmlConverter):
|
|||||||
style_map = kwargs.get("style_map", None)
|
style_map = kwargs.get("style_map", None)
|
||||||
file_obj = input.read_file(mode="rb")
|
file_obj = input.read_file(mode="rb")
|
||||||
result = mammoth.convert_to_html(file_obj, style_map=style_map)
|
result = mammoth.convert_to_html(file_obj, style_map=style_map)
|
||||||
|
file_obj.close()
|
||||||
html_content = result.value
|
html_content = result.value
|
||||||
result = self._convert(html_content)
|
result = self._convert(html_content)
|
||||||
|
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ class HtmlConverter(DocumentConverter):
|
|||||||
result = None
|
result = None
|
||||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
result = self._convert(file_obj.read())
|
result = self._convert(file_obj.read())
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -73,6 +73,7 @@ class ImageConverter(MediaConverter):
|
|||||||
content_type = "image/jpeg"
|
content_type = "image/jpeg"
|
||||||
image_file = input.read_file(mode="rb")
|
image_file = input.read_file(mode="rb")
|
||||||
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
|
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
|
||||||
|
image_file.close()
|
||||||
data_uri = f"data:{content_type};base64,{image_base64}"
|
data_uri = f"data:{content_type};base64,{image_base64}"
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ class IpynbConverter(DocumentConverter):
|
|||||||
result = None
|
result = None
|
||||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
notebook_content = json.load(file_obj)
|
notebook_content = json.load(file_obj)
|
||||||
|
file_obj.close()
|
||||||
result = self._convert(notebook_content)
|
result = self._convert(notebook_content)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -26,8 +26,9 @@ class OutlookMsgConverter(DocumentConverter):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
file_obj = input.read_file(mode="rb")
|
||||||
msg = olefile.OleFileIO(file_obj)
|
msg = olefile.OleFileIO(file_obj)
|
||||||
|
|
||||||
# Extract email metadata
|
# Extract email metadata
|
||||||
md_content = "# Email Message\n\n"
|
md_content = "# Email Message\n\n"
|
||||||
|
|
||||||
@@ -51,6 +52,7 @@ class OutlookMsgConverter(DocumentConverter):
|
|||||||
md_content += body
|
md_content += body
|
||||||
|
|
||||||
msg.close()
|
msg.close()
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=headers.get("Subject"), text_content=md_content.strip()
|
title=headers.get("Subject"), text_content=md_content.strip()
|
||||||
|
|||||||
@@ -25,6 +25,8 @@ class PdfConverter(DocumentConverter):
|
|||||||
output = StringIO()
|
output = StringIO()
|
||||||
file_obj = input.read_file(mode="rb")
|
file_obj = input.read_file(mode="rb")
|
||||||
pdfminer.high_level.extract_text_to_fp(file_obj, output)
|
pdfminer.high_level.extract_text_to_fp(file_obj, output)
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
title=None,
|
||||||
text_content=output.getvalue(),
|
text_content=output.getvalue(),
|
||||||
|
|||||||
@@ -59,6 +59,8 @@ class PptxConverter(HtmlConverter):
|
|||||||
|
|
||||||
file_obj = input.read_file(mode="rb")
|
file_obj = input.read_file(mode="rb")
|
||||||
presentation = pptx.Presentation(file_obj)
|
presentation = pptx.Presentation(file_obj)
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
slide_num = 0
|
slide_num = 0
|
||||||
for slide in presentation.slides:
|
for slide in presentation.slides:
|
||||||
slide_num += 1
|
slide_num += 1
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ class WikipediaConverter(DocumentConverter):
|
|||||||
soup = None
|
soup = None
|
||||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
# Remove javascript and style blocks
|
# Remove javascript and style blocks
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
|
|||||||
@@ -25,6 +25,8 @@ class XlsxConverter(HtmlConverter):
|
|||||||
|
|
||||||
file_obj = input.read_file(mode="rb")
|
file_obj = input.read_file(mode="rb")
|
||||||
sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
|
sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
md_content += f"## {s}\n"
|
md_content += f"## {s}\n"
|
||||||
@@ -50,6 +52,8 @@ class XlsConverter(HtmlConverter):
|
|||||||
|
|
||||||
file_obj = input.read_file(mode="rb")
|
file_obj = input.read_file(mode="rb")
|
||||||
sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
|
sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
md_content += f"## {s}\n"
|
md_content += f"## {s}\n"
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ class YouTubeConverter(DocumentConverter):
|
|||||||
soup = None
|
soup = None
|
||||||
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
file_obj = input.read_file(mode="rt", encoding="utf-8")
|
||||||
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
soup = BeautifulSoup(file_obj.read(), "html.parser")
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
# Read the meta tags
|
# Read the meta tags
|
||||||
assert soup.title is not None and soup.title.string is not None
|
assert soup.title is not None and soup.title.string is not None
|
||||||
|
|||||||
Reference in New Issue
Block a user