Merge branch 'main' into main
This commit is contained in:
22
README.md
22
README.md
@@ -1,5 +1,7 @@
|
|||||||
# MarkItDown
|
# MarkItDown
|
||||||
|
|
||||||
|
[](https://pypi.org/project/markitdown/)
|
||||||
|
|
||||||
The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing or text analysis).
|
The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing or text analysis).
|
||||||
|
|
||||||
It presently supports:
|
It presently supports:
|
||||||
@@ -28,7 +30,6 @@ or from the source
|
|||||||
pip install -e .
|
pip install -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
# Usage
|
# Usage
|
||||||
The API is simple:
|
The API is simple:
|
||||||
|
|
||||||
@@ -40,6 +41,25 @@ result = markitdown.convert("test.xlsx")
|
|||||||
print(result.text_content)
|
print(result.text_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
To use this as a command-line utility, install it and then run it like this:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
markitdown path-to-file.pdf
|
||||||
|
```
|
||||||
|
|
||||||
|
This will output Markdown to standard output. You can save it like this:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
markitdown path-to-file.pdf > document.md
|
||||||
|
```
|
||||||
|
|
||||||
|
You can pipe content to standard input by omitting the argument:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cat path-to-file.pdf | markitdown
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the mlm_client and mlm_model parameters to the MarkItDown object, according to your specific client.
|
You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the mlm_client and mlm_model parameters to the MarkItDown object, according to your specific client.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ dependencies = [
|
|||||||
"youtube-transcript-api",
|
"youtube-transcript-api",
|
||||||
"SpeechRecognition",
|
"SpeechRecognition",
|
||||||
"pathvalidate",
|
"pathvalidate",
|
||||||
|
"charset-normalizer",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ import pptx
|
|||||||
import puremagic
|
import puremagic
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from charset_normalizer import from_path
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
try:
|
try:
|
||||||
@@ -162,9 +163,7 @@ class PlainTextConverter(DocumentConverter):
|
|||||||
elif "text/" not in content_type.lower():
|
elif "text/" not in content_type.lower():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
text_content = ""
|
text_content = str(from_path(local_path).best())
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
|
||||||
text_content = fh.read()
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
title=None,
|
||||||
text_content=text_content,
|
text_content=text_content,
|
||||||
|
|||||||
4
tests/test_files/test_mskanji.csv
Normal file
4
tests/test_files/test_mskanji.csv
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
名前,年齢,住所
|
||||||
|
佐藤太郎,30,東京
|
||||||
|
三木英子,25,大阪
|
||||||
|
髙橋淳,35,名古屋
|
||||||
|
@@ -87,6 +87,13 @@ SERP_TEST_EXCLUDES = [
|
|||||||
"data:image/svg+xml,%3Csvg%20width%3D",
|
"data:image/svg+xml,%3Csvg%20width%3D",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
CSV_CP932_TEST_STRINGS = [
|
||||||
|
"名前,年齢,住所",
|
||||||
|
"佐藤太郎,30,東京",
|
||||||
|
"三木英子,25,大阪",
|
||||||
|
"髙橋淳,35,名古屋",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_remote,
|
skip_remote,
|
||||||
@@ -170,6 +177,12 @@ def test_markitdown_local() -> None:
|
|||||||
for test_string in SERP_TEST_STRINGS:
|
for test_string in SERP_TEST_STRINGS:
|
||||||
assert test_string in text_content
|
assert test_string in text_content
|
||||||
|
|
||||||
|
## Test non-UTF-8 encoding
|
||||||
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
|
||||||
|
text_content = result.text_content.replace("\\", "")
|
||||||
|
for test_string in CSV_CP932_TEST_STRINGS:
|
||||||
|
assert test_string in text_content
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_exiftool,
|
skip_exiftool,
|
||||||
|
|||||||
Reference in New Issue
Block a user