From 8576f1d9153d43d7d7d8f3854ed771bcee281175 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Turd=C4=B1bek?= <110396648+erinshek@users.noreply.github.com> Date: Sun, 13 Apr 2025 21:19:00 +0500 Subject: [PATCH] Add CSV to Markdown table conversion - fixes #1144 (#1176) * feat: Add CSV to Markdown table converter - Add new CsvConverter class to convert CSV files to Markdown tables - Support text/csv and application/csv MIME types - Preserve table structure with headers and data rows - Handle edge cases like empty cells and mismatched columns - Fix Azure Document Intelligence dependency handling - Register CsvConverter in MarkItDown class ---- Thanks also to @benny123tw who submitted a very similar PR in #1171 --- .../markitdown/src/markitdown/_markitdown.py | 2 + .../src/markitdown/converters/__init__.py | 2 + .../markitdown/converters/_csv_converter.py | 79 +++++++++++++++++++ .../converters/_doc_intel_converter.py | 25 +++++- packages/markitdown/tests/_test_vectors.py | 9 ++- 5 files changed, 111 insertions(+), 6 deletions(-) create mode 100644 packages/markitdown/src/markitdown/converters/_csv_converter.py diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 54a0dc8..682902b 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -41,6 +41,7 @@ from .converters import ( ZipConverter, EpubConverter, DocumentIntelligenceConverter, + CsvConverter, ) from ._base_converter import DocumentConverter, DocumentConverterResult @@ -194,6 +195,7 @@ class MarkItDown: self.register_converter(PdfConverter()) self.register_converter(OutlookMsgConverter()) self.register_converter(EpubConverter()) + self.register_converter(CsvConverter()) # Register Document Intelligence converter at the top of the stack if endpoint is provided docintel_endpoint = kwargs.get("docintel_endpoint") diff --git a/packages/markitdown/src/markitdown/converters/__init__.py 
b/packages/markitdown/src/markitdown/converters/__init__.py index c68d0c3..e4437a5 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -22,6 +22,7 @@ from ._doc_intel_converter import ( DocumentIntelligenceFileType, ) from ._epub_converter import EpubConverter +from ._csv_converter import CsvConverter __all__ = [ "PlainTextConverter", @@ -43,4 +44,5 @@ __all__ = [ "DocumentIntelligenceConverter", "DocumentIntelligenceFileType", "EpubConverter", + "CsvConverter", ] diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py new file mode 100644 index 0000000..7162889 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py @@ -0,0 +1,79 @@ +import sys +import csv +import io +from typing import BinaryIO, Any +from charset_normalizer import from_bytes +from ._html_converter import HtmlConverter +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/csv", + "application/csv", +] +ACCEPTED_FILE_EXTENSIONS = [".csv"] + + +class CsvConverter(DocumentConverter): + """ + Converts CSV files to Markdown tables. 
+ """ + + def __init__(self): + super().__init__() + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + # Read the file content + if stream_info.charset: + content = file_stream.read().decode(stream_info.charset) + else: + content = str(from_bytes(file_stream.read()).best()) + + # Parse CSV content + reader = csv.reader(io.StringIO(content)) + rows = list(reader) + + if not rows: + return DocumentConverterResult(markdown="") + + # Create markdown table + markdown_table = [] + + # Add header row + markdown_table.append("| " + " | ".join(rows[0]) + " |") + + # Add separator row + markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |") + + # Add data rows + for row in rows[1:]: + # Make sure row has the same number of columns as header + while len(row) < len(rows[0]): + row.append("") + # Truncate if row has more columns than header + row = row[: len(rows[0])] + markdown_table.append("| " + " | ".join(row) + " |") + + result = "\n".join(markdown_table) + + return DocumentConverterResult(markdown=result) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 5f4069b..d2dce91 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -1,8 +1,7 @@ import sys import re import os - -from typing import BinaryIO, Any, List +from typing import 
BinaryIO, Any, List, Optional, Union from enum import Enum from ._html_converter import HtmlConverter @@ -26,6 +25,28 @@ except ImportError: # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() + # Define these types for type hinting when the package is not available + class AzureKeyCredential: + pass + + class TokenCredential: + pass + + class DocumentIntelligenceClient: + pass + + class AnalyzeDocumentRequest: + pass + + class AnalyzeResult: + pass + + class DocumentAnalysisFeature: + pass + + class DefaultAzureCredential: + pass + # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. # This constant is a temporary fix until the bug is resolved. diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 4a7b54a..74fa9bd 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -144,10 +144,11 @@ GENERAL_TEST_VECTORS = [ charset="cp932", url=None, must_include=[ - "名前,年齢,住所", - "佐藤太郎,30,東京", - "三木英子,25,大阪", - "髙橋淳,35,名古屋", + "| 名前 | 年齢 | 住所 |", + "| --- | --- | --- |", + "| 佐藤太郎 | 30 | 東京 |", + "| 三木英子 | 25 | 大阪 |", + "| 髙橋淳 | 35 | 名古屋 |", ], must_not_include=[], ),