From 6ebef5af0cc672619c4127a7d1019dbce174c603 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 13 Dec 2024 11:06:11 -0800 Subject: [PATCH 1/3] CLI usage instructions Plus added a PyPI badge --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 5034d03..6b62356 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # MarkItDown +[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) + The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.) It presently supports: @@ -23,6 +25,16 @@ result = markitdown.convert("test.xlsx") print(result.text_content) ``` +To use this as a command-line utility, install it and then run it like this: + +```bash +markitdown path-to-file.pdf +``` +This will output Markdown to standard output. You can save it like this: +```bash +markitdown path-to-file.pdf > document.md +``` + ## Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a From 33ce17954dea8a0a127d96817b6d1dac8e50fb9b Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Fri, 13 Dec 2024 11:09:03 -0800 Subject: [PATCH 2/3] Note about piping --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 6b62356..851611e 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,10 @@ This will output Markdown to standard output. 
You can save it like this: ```bash markitdown path-to-file.pdf > document.md ``` +You can pipe content to standard input by omitting the argument: +```bash +cat path-to-file.pdf | markitdown +``` ## Contributing From 52b723724c33b76cf3a2ee1e4d636ee81312e388 Mon Sep 17 00:00:00 2001 From: Divyansh Singh <40380293+brc-dd@users.noreply.github.com> Date: Sun, 15 Dec 2024 10:37:15 +0530 Subject: [PATCH 3/3] Fix character decoding issues with text-like files --- pyproject.toml | 1 + src/markitdown/_markitdown.py | 5 ++--- tests/test_files/test_mskanji.csv | 4 ++++ tests/test_markitdown.py | 13 +++++++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 tests/test_files/test_mskanji.csv diff --git a/pyproject.toml b/pyproject.toml index 74df032..756380a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "youtube-transcript-api", "SpeechRecognition", "pathvalidate", + "charset-normalizer", ] [project.urls] diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..25786f6 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -26,6 +26,7 @@ import pptx import puremagic import requests from bs4 import BeautifulSoup +from charset_normalizer import from_path # Optional Transcription support try: @@ -161,9 +162,7 @@ class PlainTextConverter(DocumentConverter): elif "text/" not in content_type.lower(): return None - text_content = "" - with open(local_path, "rt", encoding="utf-8") as fh: - text_content = fh.read() + text_content = str(from_path(local_path).best()) return DocumentConverterResult( title=None, text_content=text_content, diff --git a/tests/test_files/test_mskanji.csv b/tests/test_files/test_mskanji.csv new file mode 100644 index 0000000..d67f5a3 --- /dev/null +++ b/tests/test_files/test_mskanji.csv @@ -0,0 +1,4 @@ +名前,年齢,住所 +佐藤太郎,30,東京 +三木英子,25,大阪 +髙橋淳,35,名古屋 diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 94fd886..ac08820 100644 
--- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -87,6 +87,13 @@ SERP_TEST_EXCLUDES = [ "data:image/svg+xml,%3Csvg%20width%3D", ] +CSV_CP932_TEST_STRINGS = [ + "名前,年齢,住所", + "佐藤太郎,30,東京", + "三木英子,25,大阪", + "髙橋淳,35,名古屋", +] + @pytest.mark.skipif( skip_remote, @@ -164,6 +171,12 @@ def test_markitdown_local() -> None: for test_string in SERP_TEST_STRINGS: assert test_string in text_content + ## Test non-UTF-8 encoding + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) + text_content = result.text_content.replace("\\", "") + for test_string in CSV_CP932_TEST_STRINGS: + assert test_string in text_content + @pytest.mark.skipif( skip_exiftool,