feat: outlook ".msg" file converter (#196)
* feat: outlook .msg converter * add test, adjust docstring
This commit is contained in:
committed by
GitHub
parent
4678c8a2a4
commit
d248621ba4
@@ -21,6 +21,7 @@ from warnings import warn, resetwarnings, catch_warnings
|
||||
|
||||
import mammoth
|
||||
import markdownify
|
||||
import olefile
|
||||
import pandas as pd
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
@@ -1077,6 +1078,79 @@ class ImageConverter(MediaConverter):
|
||||
return response.choices[0].message.content
|
||||
|
||||
|
||||
class OutlookMsgConverter(DocumentConverter):
|
||||
"""Converts Outlook .msg files to markdown by extracting email metadata and content.
|
||||
|
||||
Uses the olefile package to parse the .msg file structure and extract:
|
||||
- Email headers (From, To, Subject)
|
||||
- Email body content
|
||||
"""
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a MSG file
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".msg":
|
||||
return None
|
||||
|
||||
try:
|
||||
msg = olefile.OleFileIO(local_path)
|
||||
# Extract email metadata
|
||||
md_content = "# Email Message\n\n"
|
||||
|
||||
# Get headers
|
||||
headers = {
|
||||
"From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
|
||||
"To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
|
||||
"Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
|
||||
}
|
||||
|
||||
# Add headers to markdown
|
||||
for key, value in headers.items():
|
||||
if value:
|
||||
md_content += f"**{key}:** {value}\n"
|
||||
|
||||
md_content += "\n## Content\n\n"
|
||||
|
||||
# Get email body
|
||||
body = self._get_stream_data(msg, "__substg1.0_1000001F")
|
||||
if body:
|
||||
md_content += body
|
||||
|
||||
msg.close()
|
||||
|
||||
return DocumentConverterResult(
|
||||
title=headers.get("Subject"), text_content=md_content.strip()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise FileConversionException(
|
||||
f"Could not convert MSG file '{local_path}': {str(e)}"
|
||||
)
|
||||
|
||||
def _get_stream_data(
|
||||
self, msg: olefile.OleFileIO, stream_path: str
|
||||
) -> Union[str, None]:
|
||||
"""Helper to safely extract and decode stream data from the MSG file."""
|
||||
try:
|
||||
if msg.exists(stream_path):
|
||||
data = msg.openstream(stream_path).read()
|
||||
# Try UTF-16 first (common for .msg files)
|
||||
try:
|
||||
return data.decode("utf-16-le").strip()
|
||||
except UnicodeDecodeError:
|
||||
# Fall back to UTF-8
|
||||
try:
|
||||
return data.decode("utf-8").strip()
|
||||
except UnicodeDecodeError:
|
||||
# Last resort - ignore errors
|
||||
return data.decode("utf-8", errors="ignore").strip()
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
class ZipConverter(DocumentConverter):
|
||||
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
||||
|
||||
@@ -1286,6 +1360,7 @@ class MarkItDown:
|
||||
self.register_page_converter(IpynbConverter())
|
||||
self.register_page_converter(PdfConverter())
|
||||
self.register_page_converter(ZipConverter())
|
||||
self.register_page_converter(OutlookMsgConverter())
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||
|
||||
Reference in New Issue
Block a user