Add CSV to Markdown table conversion - fixes #1144 (#1176)

* feat: Add CSV to Markdown table converter
  - Add new CsvConverter class to convert CSV files to Markdown tables
  - Support text/csv and application/csv MIME types
  - Preserve table structure with headers and data rows
  - Handle edge cases like empty cells and mismatched columns
  - Fix Azure Document Intelligence dependency handling
  - Register CsvConverter in MarkItDown class

  ----

  Thanks also to @benny123tw who submitted a very similar PR in #1171
2025-04-13 21:19:00 +05:00
parent 3fcd48cdfc
commit 8576f1d915
5 changed files with 111 additions and 6 deletions
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -41,6 +41,7 @@ from .converters import (
    ZipConverter,
    EpubConverter,
    DocumentIntelligenceConverter,
    CsvConverter,
 )
 from ._base_converter import DocumentConverter, DocumentConverterResult
@@ -194,6 +195,7 @@ class MarkItDown:
            self.register_converter(PdfConverter())
            self.register_converter(OutlookMsgConverter())
            self.register_converter(EpubConverter())
            self.register_converter(CsvConverter())
            # Register Document Intelligence converter at the top of the stack if endpoint is provided
            docintel_endpoint = kwargs.get("docintel_endpoint")
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -22,6 +22,7 @@ from ._doc_intel_converter import (
    DocumentIntelligenceFileType,
 )
 from ._epub_converter import EpubConverter
 from ._csv_converter import CsvConverter
 __all__ = [
    "PlainTextConverter",
@@ -43,4 +44,5 @@ __all__ = [
    "DocumentIntelligenceConverter",
    "DocumentIntelligenceFileType",
    "EpubConverter",
    "CsvConverter",
 ]
--- a/packages/markitdown/src/markitdown/converters/_csv_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py
@@ -0,0 +1,79 @@
 import sys
 import csv
 import io
 from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/csv",
    "application/csv",
 ]
 ACCEPTED_FILE_EXTENSIONS = [".csv"]
 class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown tables.

    The first CSV row becomes the table header. Every following row is
    padded with empty cells or truncated so that it has exactly as many
    columns as the header.
    """

    def __init__(self):
        super().__init__()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream's extension is in
        ACCEPTED_FILE_EXTENSIONS or its MIME type starts with one of
        ACCEPTED_MIME_TYPE_PREFIXES (both compared case-insensitively).
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        # str.startswith accepts a tuple of candidates, so no explicit loop is needed.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Read the whole stream, parse it as CSV and render a Markdown table.

        The bytes are decoded with ``stream_info.charset`` when it is set;
        otherwise the encoding is detected with charset_normalizer. An input
        with no rows produces an empty Markdown string.
        """
        raw = file_stream.read()
        if stream_info.charset:
            content = raw.decode(stream_info.charset)
        else:
            # best() returns None when no plausible encoding is found; calling
            # str() on that would inject the literal text "None" into the
            # output, so fall back to a lossy UTF-8 decode instead.
            match = from_bytes(raw).best()
            if match is not None:
                content = str(match)
            else:
                content = raw.decode("utf-8", errors="replace")

        # Parse CSV content
        rows = list(csv.reader(io.StringIO(content)))
        if not rows:
            return DocumentConverterResult(markdown="")

        header = rows[0]
        width = len(header)

        # Header row, then the separator row required by Markdown tables
        markdown_table = [
            self._format_row(header),
            self._format_row(["---"] * width),
        ]

        # Data rows: pad short rows with empty cells, drop surplus cells
        for row in rows[1:]:
            cells = (row + [""] * (width - len(row)))[:width]
            markdown_table.append(self._format_row(cells))

        return DocumentConverterResult(markdown="\n".join(markdown_table))

    @staticmethod
    def _format_row(cells: list) -> str:
        """
        Render one list of cell strings as a single Markdown table line.

        A literal ``|`` inside a cell is backslash-escaped and embedded line
        breaks (possible in quoted CSV fields) are replaced by ``<br>``,
        because either one would otherwise split the cell or the row.
        """
        escaped = (
            cell.replace("|", "\\|")
            .replace("\r\n", "<br>")
            .replace("\r", "<br>")
            .replace("\n", "<br>")
            for cell in cells
        )
        return "| " + " | ".join(escaped) + " |"
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -1,8 +1,7 @@
 import sys
 import re
 import os
-
+from typing import BinaryIO, Any, List, Optional, Union
 from typing import BinaryIO, Any, List
 from enum import Enum
 from ._html_converter import HtmlConverter
@@ -26,6 +25,28 @@ except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()
    # Define these types for type hinting when the package is not available
    class AzureKeyCredential:
        pass
    class TokenCredential:
        pass
    class DocumentIntelligenceClient:
        pass
    class AnalyzeDocumentRequest:
        pass
    class AnalyzeResult:
        pass
    class DocumentAnalysisFeature:
        pass
    class DefaultAzureCredential:
        pass
 # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
 # This constant is a temporary fix until the bug is resolved.
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -144,10 +144,11 @@ GENERAL_TEST_VECTORS = [
        charset="cp932",
        url=None,
        must_include=[
-            "名前,年齢,住所",
+            "| 名前 | 年齢 | 住所 |",
-            "佐藤太郎,30,東京",
+            "| --- | --- | --- |",
-            "三木英子,25,大阪",
+            "| 佐藤太郎 | 30 | 東京 |",
-            "髙橋淳,35,名古屋",
+            "| 三木英子 | 25 | 大阪 |",
            "| 髙橋淳 | 35 | 名古屋 |",
        ],
        must_not_include=[],
    ),