diff --git a/README.md b/README.md index 865d5a5..f0f2e83 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # MarkItDown +[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) + The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.) It presently supports: @@ -28,7 +30,6 @@ or from the source pip install -e . ``` - # Usage The API is simple: @@ -40,6 +41,25 @@ result = markitdown.convert("test.xlsx") print(result.text_content) ``` +To use this as a command-line utility, install it and then run it like this: + +```bash +markitdown path-to-file.pdf +``` + +This will output Markdown to standard output. You can save it like this: + +```bash +markitdown path-to-file.pdf > document.md +``` + +You can pipe content to standard input by omitting the argument: + +```bash +cat path-to-file.pdf | markitdown +``` + + You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client. 
```python diff --git a/pyproject.toml b/pyproject.toml index 74df032..756380a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "youtube-transcript-api", "SpeechRecognition", "pathvalidate", + "charset-normalizer", ] [project.urls] diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 40c2307..1ba39eb 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -27,6 +27,7 @@ import pptx import puremagic import requests from bs4 import BeautifulSoup +from charset_normalizer import from_path # Optional Transcription support try: @@ -162,9 +163,7 @@ class PlainTextConverter(DocumentConverter): elif "text/" not in content_type.lower(): return None - text_content = "" - with open(local_path, "rt", encoding="utf-8") as fh: - text_content = fh.read() + text_content = str(from_path(local_path).best()) return DocumentConverterResult( title=None, text_content=text_content, diff --git a/tests/test_files/test_mskanji.csv b/tests/test_files/test_mskanji.csv new file mode 100644 index 0000000..d67f5a3 --- /dev/null +++ b/tests/test_files/test_mskanji.csv @@ -0,0 +1,4 @@ +名前,年齢,住所 +佐藤太郎,30,東京 +三木英子,25,大阪 +髙橋淳,35,名古屋 diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 15dd3de..9aaa37e 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -87,6 +87,13 @@ SERP_TEST_EXCLUDES = [ "data:image/svg+xml,%3Csvg%20width%3D", ] +CSV_CP932_TEST_STRINGS = [ + "名前,年齢,住所", + "佐藤太郎,30,東京", + "三木英子,25,大阪", + "髙橋淳,35,名古屋", +] + @pytest.mark.skipif( skip_remote, @@ -170,6 +177,12 @@ def test_markitdown_local() -> None: for test_string in SERP_TEST_STRINGS: assert test_string in text_content + ## Test non-UTF-8 encoding + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) + text_content = result.text_content.replace("\\", "") + for test_string in CSV_CP932_TEST_STRINGS: + assert test_string in text_content + @pytest.mark.skipif( 
skip_exiftool,