From 395ce2d30138db905ea4e71fb2e41e91642d4463 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Thu, 20 Feb 2025 13:54:51 -0500 Subject: [PATCH] close file object after using --- packages/markitdown/src/markitdown/_markitdown.py | 7 ++++--- .../src/markitdown/converters/_bing_serp_converter.py | 1 + .../src/markitdown/converters/_doc_intel_converter.py | 1 + .../src/markitdown/converters/_docx_converter.py | 1 + .../src/markitdown/converters/_html_converter.py | 1 + .../src/markitdown/converters/_image_converter.py | 1 + .../src/markitdown/converters/_ipynb_converter.py | 1 + .../src/markitdown/converters/_outlook_msg_converter.py | 4 +++- .../markitdown/src/markitdown/converters/_pdf_converter.py | 2 ++ .../src/markitdown/converters/_pptx_converter.py | 2 ++ .../src/markitdown/converters/_wikipedia_converter.py | 1 + .../src/markitdown/converters/_xlsx_converter.py | 4 ++++ .../src/markitdown/converters/_youtube_converter.py | 1 + 13 files changed, 23 insertions(+), 4 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index bcac863..c1eb939 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -182,7 +182,6 @@ class MarkItDown: - source: can be a string representing a path either as string pathlib path object or url, a requests.response object, or a file object (TextIO or BinaryIO) - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) """ - # Local path or url if isinstance(source, str): if ( @@ -198,6 +197,9 @@ class MarkItDown: return self.convert_response(source, **kwargs) elif isinstance(source, Path): return self.convert_local(source, **kwargs) + # File object + elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase): + return self.convert_file_object(source, **kwargs) def convert_local( self, path: Union[str, Path], **kwargs: Any @@ -341,7 +343,6 @@ class MarkItDown: self, input: ConverterInput, extensions: List[Union[str, None]], **kwargs ) -> DocumentConverterResult: error_trace = "" - # Create a copy of the page_converters list, sorted by priority. # We do this with each call to _convert because the priority of converters may change between calls. # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. @@ -442,7 +443,7 @@ class MarkItDown: # Guess extensions for file objects elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase): guesses = puremagic.magic_stream(source) - + extensions = list() for g in guesses: ext = g.extension.strip() diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index 892f7e4..36b9a01 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -39,6 +39,7 @@ class BingSerpConverter(DocumentConverter): soup = None file_obj = input.read_file(mode="rt", encoding="utf-8") soup = BeautifulSoup(file_obj.read(), "html.parser") + file_obj.close() # Clean up some formatting for tptt in soup.find_all(class_="tptt"): diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index fd30a74..f411d89 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -63,6 +63,7 @@ class DocumentIntelligenceConverter(DocumentConverter): # Get the bytestring from the converter input file_obj = input.read_file(mode='rb') file_bytes = file_obj.read() + file_obj.close() # Certain document analysis features are not availiable for office filetypes (.xlsx, .pptx, .html, .docx) if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]: diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index c8f7c10..b97aa75 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -31,6 +31,7 @@ class DocxConverter(HtmlConverter): style_map = kwargs.get("style_map", None) file_obj = input.read_file(mode="rb") result = mammoth.convert_to_html(file_obj, style_map=style_map) + file_obj.close() html_content = result.value result = self._convert(html_content) diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 8ac882d..62ec150 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -25,6 +25,7 @@ class HtmlConverter(DocumentConverter): result = None file_obj = input.read_file(mode="rt", encoding="utf-8") result = self._convert(file_obj.read()) + file_obj.close() return result diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index 1c1056a..197f5cf 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -73,6 +73,7 @@ class ImageConverter(MediaConverter): content_type = "image/jpeg" image_file = input.read_file(mode="rb") image_base64 = base64.b64encode(image_file.read()).decode("utf-8") + image_file.close() data_uri = f"data:{content_type};base64,{image_base64}" messages = [ diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py index aa3a887..90c26a5 100644 --- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py +++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py @@ -30,6 +30,7 @@ class IpynbConverter(DocumentConverter): result = None file_obj = input.read_file(mode="rt", encoding="utf-8") notebook_content = json.load(file_obj) + file_obj.close() result = self._convert(notebook_content) return result diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index f0c33fe..cb61926 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -26,8 +26,9 @@ class OutlookMsgConverter(DocumentConverter): return None try: - file_obj = input.read_file(mode="rt", encoding="utf-8") + file_obj = input.read_file(mode="rb") msg = olefile.OleFileIO(file_obj) + # Extract email metadata md_content = "# Email Message\n\n" @@ -51,6 +52,7 @@ class OutlookMsgConverter(DocumentConverter): md_content += body msg.close() + file_obj.close() return DocumentConverterResult( title=headers.get("Subject"), text_content=md_content.strip() diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 870d6bf..d512eb3 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -25,6 +25,8 @@ class PdfConverter(DocumentConverter): output = StringIO() file_obj = input.read_file(mode="rb") pdfminer.high_level.extract_text_to_fp(file_obj, output) + file_obj.close() + return DocumentConverterResult( title=None, text_content=output.getvalue(), diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index a5ee72d..07aa7b3 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -59,6 +59,8 @@ class PptxConverter(HtmlConverter): file_obj = input.read_file(mode="rb") presentation = pptx.Presentation(file_obj) + file_obj.close() + slide_num = 0 for slide in presentation.slides: slide_num += 1 diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index e3b98ca..af2a30e 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -31,6 +31,7 @@ class WikipediaConverter(DocumentConverter): soup = None file_obj = input.read_file(mode="rt", encoding="utf-8") soup = BeautifulSoup(file_obj.read(), "html.parser") + file_obj.close() # Remove javascript and style blocks for script in soup(["script", "style"]): diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index df80a47..18d930f 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -25,6 +25,8 @@ class XlsxConverter(HtmlConverter): file_obj = input.read_file(mode="rb") sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl") + file_obj.close() + md_content = "" for s in sheets: md_content += f"## {s}\n" @@ -50,6 +52,8 @@ class XlsConverter(HtmlConverter): file_obj = input.read_file(mode="rb") sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd") + file_obj.close() + md_content = "" for s in sheets: md_content += f"## {s}\n" diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index 9bcc2b0..da6f6fc 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -41,6 +41,7 @@ class YouTubeConverter(DocumentConverter): soup = None file_obj = input.read_file(mode="rt", encoding="utf-8") soup = BeautifulSoup(file_obj.read(), "html.parser") + file_obj.close() # Read the meta tags assert soup.title is not None and soup.title.string is not None