feat: outlook ".msg" file converter (#196)
* feat: outlook .msg converter * add test, adjust docstring
This commit is contained in:
committed by
GitHub
parent
4678c8a2a4
commit
d248621ba4
@@ -35,6 +35,7 @@ dependencies = [
|
||||
"pdfminer.six",
|
||||
"puremagic",
|
||||
"pydub",
|
||||
"olefile",
|
||||
"youtube-transcript-api",
|
||||
"SpeechRecognition",
|
||||
"pathvalidate",
|
||||
|
||||
@@ -21,6 +21,7 @@ from warnings import warn, resetwarnings, catch_warnings
|
||||
|
||||
import mammoth
|
||||
import markdownify
|
||||
import olefile
|
||||
import pandas as pd
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
@@ -1077,6 +1078,79 @@ class ImageConverter(MediaConverter):
|
||||
return response.choices[0].message.content
|
||||
|
||||
|
||||
class OutlookMsgConverter(DocumentConverter):
|
||||
"""Converts Outlook .msg files to markdown by extracting email metadata and content.
|
||||
|
||||
Uses the olefile package to parse the .msg file structure and extract:
|
||||
- Email headers (From, To, Subject)
|
||||
- Email body content
|
||||
"""
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a MSG file
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".msg":
|
||||
return None
|
||||
|
||||
try:
|
||||
msg = olefile.OleFileIO(local_path)
|
||||
# Extract email metadata
|
||||
md_content = "# Email Message\n\n"
|
||||
|
||||
# Get headers
|
||||
headers = {
|
||||
"From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
|
||||
"To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
|
||||
"Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
|
||||
}
|
||||
|
||||
# Add headers to markdown
|
||||
for key, value in headers.items():
|
||||
if value:
|
||||
md_content += f"**{key}:** {value}\n"
|
||||
|
||||
md_content += "\n## Content\n\n"
|
||||
|
||||
# Get email body
|
||||
body = self._get_stream_data(msg, "__substg1.0_1000001F")
|
||||
if body:
|
||||
md_content += body
|
||||
|
||||
msg.close()
|
||||
|
||||
return DocumentConverterResult(
|
||||
title=headers.get("Subject"), text_content=md_content.strip()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise FileConversionException(
|
||||
f"Could not convert MSG file '{local_path}': {str(e)}"
|
||||
)
|
||||
|
||||
def _get_stream_data(
|
||||
self, msg: olefile.OleFileIO, stream_path: str
|
||||
) -> Union[str, None]:
|
||||
"""Helper to safely extract and decode stream data from the MSG file."""
|
||||
try:
|
||||
if msg.exists(stream_path):
|
||||
data = msg.openstream(stream_path).read()
|
||||
# Try UTF-16 first (common for .msg files)
|
||||
try:
|
||||
return data.decode("utf-16-le").strip()
|
||||
except UnicodeDecodeError:
|
||||
# Fall back to UTF-8
|
||||
try:
|
||||
return data.decode("utf-8").strip()
|
||||
except UnicodeDecodeError:
|
||||
# Last resort - ignore errors
|
||||
return data.decode("utf-8", errors="ignore").strip()
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
class ZipConverter(DocumentConverter):
|
||||
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
||||
|
||||
@@ -1286,6 +1360,7 @@ class MarkItDown:
|
||||
self.register_page_converter(IpynbConverter())
|
||||
self.register_page_converter(PdfConverter())
|
||||
self.register_page_converter(ZipConverter())
|
||||
self.register_page_converter(OutlookMsgConverter())
|
||||
|
||||
def convert(
|
||||
self, source: Union[str, requests.Response, Path], **kwargs: Any
|
||||
|
||||
BIN
tests/test_files/test_outlook_msg.msg
vendored
Normal file
BIN
tests/test_files/test_outlook_msg.msg
vendored
Normal file
Binary file not shown.
@@ -63,6 +63,15 @@ DOCX_TEST_STRINGS = [
|
||||
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
|
||||
]
|
||||
|
||||
MSG_TEST_STRINGS = [
|
||||
"# Email Message",
|
||||
"**From:** test.sender@example.com",
|
||||
"**To:** test.recipient@example.com",
|
||||
"**Subject:** Test Email Message",
|
||||
"## Content",
|
||||
"This is the body of the test email message",
|
||||
]
|
||||
|
||||
DOCX_COMMENT_TEST_STRINGS = [
|
||||
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
|
||||
"49e168b7-d2ae-407f-a055-2167576f39a1",
|
||||
@@ -232,6 +241,10 @@ def test_markitdown_local() -> None:
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
|
||||
validate_strings(result, CSV_CP932_TEST_STRINGS)
|
||||
|
||||
# Test MSG (Outlook email) processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
|
||||
validate_strings(result, MSG_TEST_STRINGS)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
skip_exiftool,
|
||||
|
||||
Reference in New Issue
Block a user