* feat: Add CSV to Markdown table converter

- Add new CsvConverter class to convert CSV files to Markdown tables
- Support text/csv and application/csv MIME types
- Preserve table structure with headers and data rows
- Handle edge cases like empty cells and mismatched columns
- Fix Azure Document Intelligence dependency handling
- Register CsvConverter in MarkItDown class

----

Thanks also to @benny123tw who submitted a very similar PR in #1171
This commit is contained in:
@@ -41,6 +41,7 @@ from .converters import (
|
|||||||
ZipConverter,
|
ZipConverter,
|
||||||
EpubConverter,
|
EpubConverter,
|
||||||
DocumentIntelligenceConverter,
|
DocumentIntelligenceConverter,
|
||||||
|
CsvConverter,
|
||||||
)
|
)
|
||||||
|
|
||||||
from ._base_converter import DocumentConverter, DocumentConverterResult
|
from ._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
@@ -194,6 +195,7 @@ class MarkItDown:
|
|||||||
self.register_converter(PdfConverter())
|
self.register_converter(PdfConverter())
|
||||||
self.register_converter(OutlookMsgConverter())
|
self.register_converter(OutlookMsgConverter())
|
||||||
self.register_converter(EpubConverter())
|
self.register_converter(EpubConverter())
|
||||||
|
self.register_converter(CsvConverter())
|
||||||
|
|
||||||
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
||||||
docintel_endpoint = kwargs.get("docintel_endpoint")
|
docintel_endpoint = kwargs.get("docintel_endpoint")
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ from ._doc_intel_converter import (
|
|||||||
DocumentIntelligenceFileType,
|
DocumentIntelligenceFileType,
|
||||||
)
|
)
|
||||||
from ._epub_converter import EpubConverter
|
from ._epub_converter import EpubConverter
|
||||||
|
from ._csv_converter import CsvConverter
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"PlainTextConverter",
|
"PlainTextConverter",
|
||||||
@@ -43,4 +44,5 @@ __all__ = [
|
|||||||
"DocumentIntelligenceConverter",
|
"DocumentIntelligenceConverter",
|
||||||
"DocumentIntelligenceFileType",
|
"DocumentIntelligenceFileType",
|
||||||
"EpubConverter",
|
"EpubConverter",
|
||||||
|
"CsvConverter",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -0,0 +1,79 @@
|
|||||||
|
import sys
|
||||||
|
import csv
|
||||||
|
import io
|
||||||
|
from typing import BinaryIO, Any
|
||||||
|
from charset_normalizer import from_bytes
|
||||||
|
from ._html_converter import HtmlConverter
|
||||||
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
|
|
||||||
|
# MIME type prefixes that identify a stream as CSV. They are matched as
# prefixes so that parameters after the type (e.g. "; charset=...") still match.
ACCEPTED_MIME_TYPE_PREFIXES = ["text/csv", "application/csv"]

# File extensions (lower-case, including the leading dot) treated as CSV.
ACCEPTED_FILE_EXTENSIONS = [
    ".csv",
]
|
||||||
|
|
||||||
|
|
||||||
|
class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown tables.

    The first CSV record becomes the table header. Every following record is
    padded with empty cells or truncated so it has exactly as many columns as
    the header.
    """

    def __init__(self):
        super().__init__()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream looks like CSV.

        A stream is accepted if its extension is listed in
        ACCEPTED_FILE_EXTENSIONS or its MIME type starts with one of
        ACCEPTED_MIME_TYPE_PREFIXES. Both checks are case-insensitive and
        `file_stream` itself is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple, which checks every prefix in one call.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Read the whole stream as CSV and render it as a Markdown table.

        The bytes are decoded with `stream_info.charset` when it is set;
        otherwise the encoding is guessed with charset_normalizer. An input
        with no records yields an empty Markdown string.
        """
        # Read the file content
        raw = file_stream.read()
        if stream_info.charset:
            content = raw.decode(stream_info.charset)
        else:
            best_match = from_bytes(raw).best()
            if best_match is None:
                # best() returns None when no plausible encoding was found;
                # str(None) would turn the document into the text "None".
                content = raw.decode("utf-8", errors="replace")
            else:
                content = str(best_match)

        # Parse CSV content. newline="" keeps line endings untranslated, as the
        # csv module expects, so "\r"-only files and line breaks inside quoted
        # fields are handled by the csv parser itself.
        rows = list(csv.reader(io.StringIO(content, newline="")))

        if not rows:
            return DocumentConverterResult(markdown="")

        header = rows[0]
        width = len(header)

        # Header row, then the separator row that makes it a Markdown table
        markdown_table = [
            self._format_row(header),
            self._format_row(["---"] * width),
        ]

        # Data rows: pad short rows with empty cells, truncate long ones
        for row in rows[1:]:
            cells = (row + [""] * (width - len(row)))[:width]
            markdown_table.append(self._format_row(cells))

        return DocumentConverterResult(markdown="\n".join(markdown_table))

    @staticmethod
    def _escape_cell(cell: str) -> str:
        """Make one CSV field safe to place inside a Markdown table cell."""
        # A table row must stay on one line, so embedded line breaks from
        # quoted CSV fields are collapsed to a single space.
        flattened = cell.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
        # An unescaped pipe would be read as a column delimiter.
        return flattened.replace("|", "\\|")

    @classmethod
    def _format_row(cls, cells: list) -> str:
        """Render a list of cell strings as one Markdown table row."""
        return "| " + " | ".join(cls._escape_cell(cell) for cell in cells) + " |"
|
||||||
@@ -1,8 +1,7 @@
|
|||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
from typing import BinaryIO, Any, List, Optional, Union
|
||||||
from typing import BinaryIO, Any, List
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
@@ -26,6 +25,28 @@ except ImportError:
|
|||||||
# Preserve the error and stack trace for later
|
# Preserve the error and stack trace for later
|
||||||
_dependency_exc_info = sys.exc_info()
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
# Define these types for type hinting when the package is not available
|
||||||
|
class AzureKeyCredential:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class TokenCredential:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class DocumentIntelligenceClient:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class AnalyzeDocumentRequest:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class AnalyzeResult:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class DocumentAnalysisFeature:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class DefaultAzureCredential:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||||
# This constant is a temporary fix until the bug is resolved.
|
# This constant is a temporary fix until the bug is resolved.
|
||||||
|
|||||||
@@ -144,10 +144,11 @@ GENERAL_TEST_VECTORS = [
|
|||||||
charset="cp932",
|
charset="cp932",
|
||||||
url=None,
|
url=None,
|
||||||
must_include=[
|
must_include=[
|
||||||
"名前,年齢,住所",
|
"| 名前 | 年齢 | 住所 |",
|
||||||
"佐藤太郎,30,東京",
|
"| --- | --- | --- |",
|
||||||
"三木英子,25,大阪",
|
"| 佐藤太郎 | 30 | 東京 |",
|
||||||
"髙橋淳,35,名古屋",
|
"| 三木英子 | 25 | 大阪 |",
|
||||||
|
"| 髙橋淳 | 35 | 名古屋 |",
|
||||||
],
|
],
|
||||||
must_not_include=[],
|
must_not_include=[],
|
||||||
),
|
),
|
||||||
|
|||||||
Reference in New Issue
Block a user