Merge branch 'main' into main

2024-12-16 13:51:39 -08:00
parent a55c3d525c ed91e8b534
commit 9e6a19987b
5 changed files with 41 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # MarkItDown

+[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
+
 The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.)

 It presently supports:
@@ -28,7 +30,6 @@ or from the source
 pip install -e .
 ```

-
 # Usage
 The API is simple:

@@ -40,6 +41,25 @@ result = markitdown.convert("test.xlsx")
 print(result.text_content)
 ```

+To use this as a command-line utility, install it and then run it like this:
+
+```bash
+markitdown path-to-file.pdf
+```
+
+This will output Markdown to standard output. You can save it like this:
+
+```bash
+markitdown path-to-file.pdf > document.md
+```
+
+You can pipe content to standard input by omitting the argument:
+
+```bash
+cat path-to-file.pdf | markitdown
+```
+
+
 You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the `mlm_client` and `mlm_model` parameters to the MarkItDown object, according to your specific client.

 ```python
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
  "youtube-transcript-api",
  "SpeechRecognition",
  "pathvalidate",
+  "charset-normalizer",
 ]

 [project.urls]
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -27,6 +27,7 @@ import pptx
 import puremagic
 import requests
 from bs4 import BeautifulSoup
+from charset_normalizer import from_path

 # Optional Transcription support
 try:
@@ -162,9 +163,7 @@ class PlainTextConverter(DocumentConverter):
        elif "text/" not in content_type.lower():
            return None

-        text_content = ""
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            text_content = fh.read()
+        text_content = str(from_path(local_path).best())
        return DocumentConverterResult(
            title=None,
            text_content=text_content,
--- a/tests/test_files/test_mskanji.csv
+++ b/tests/test_files/test_mskanji.csv
@@ -0,0 +1,4 @@
+<EFBFBD><EFBFBD><EFBFBD>O,<EFBFBD>N<EFBFBD><EFBFBD>,<EFBFBD>Z<EFBFBD><EFBFBD>
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Y,30,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+<EFBFBD>O<EFBFBD>؉p<EFBFBD>q,25,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>~,35,<EFBFBD><EFBFBD><EFBFBD>É<EFBFBD>
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -87,6 +87,13 @@ SERP_TEST_EXCLUDES = [
    "data:image/svg+xml,%3Csvg%20width%3D",
 ]

+CSV_CP932_TEST_STRINGS = [
+    "名前,年齢,住所",
+    "佐藤太郎,30,東京",
+    "三木英子,25,大阪",
+    "髙橋淳,35,名古屋",
+]
+

@pytest.mark.skipif(
    skip_remote,
@@ -170,6 +177,12 @@ def test_markitdown_local() -> None:
    for test_string in SERP_TEST_STRINGS:
        assert test_string in text_content

+    ## Test non-UTF-8 encoding
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
+    text_content = result.text_content.replace("\\", "")
+    for test_string in CSV_CP932_TEST_STRINGS:
+        assert test_string in text_content
+

@pytest.mark.skipif(
    skip_exiftool,