5 Commits

Author SHA1 Message Date
Adam Fourney
8eaf5a1da9 Clean up README.md 2025-03-05 21:35:08 -08:00
afourney
38c924793c Bump version (#1095) 2025-03-05 21:30:56 -08:00
afourney
b9526d5e47 Bump version. (#1075) 2025-02-28 07:30:46 -08:00
Hieu Lam
519fe172aa Unable to convert HTML to Markdown (#1072)
* feat: fix issue where functions inherited from `markdownify.MarkdownConverter` don't accept `current_tags`, leading to an error; accept `**kwargs`, and also set a default value for `convert_as_inline`
2025-02-28 00:57:41 -08:00
Adam Fourney
abe9752438 Bumped version 2025-02-10 16:01:17 -08:00
4 changed files with 24 additions and 42 deletions

View File

@@ -87,42 +87,6 @@ print(result.text_content)
docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
```
<details>
<summary>Batch Processing Multiple Files</summary>
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
```python convert.py
# Batch-convert every supported file in the current directory to Markdown.
# Each input file produces a sibling "<base name>.md"; inputs are not modified.
import os

from markitdown import MarkItDown
from openai import OpenAI

# The OpenAI client and model name are handed to MarkItDown at construction.
client = OpenAI(api_key="your-api-key-here")
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")

# str.endswith accepts a tuple, so one call checks every extension.
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]

for file in files_to_convert:
    print(f"\nConverting {file}...")
    try:
        md_file = os.path.splitext(file)[0] + '.md'
        result = md.convert(file)
        # Write as UTF-8 explicitly: the default text encoding is
        # locale-dependent and may not be able to represent the converted text.
        with open(md_file, 'w', encoding='utf-8') as f:
            f.write(result.text_content)
        print(f"Successfully converted {file} to {md_file}")
    except Exception as e:
        # Best-effort batch: report the failure and continue with the next file.
        print(f"Error converting {file}: {e}")

print("\nAll conversions completed!")
```
1. Save the script above as `convert.py`.
2. Place the script in the same directory as your files.
3. Install the required packages (for example, `openai`).
4. Run the script:
```bash
python convert.py
```
Note that original files will remain unchanged and new markdown files are created with the same base name.
</details>
## Contributing

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.0.1a3"
__version__ = "0.0.1"

View File

@@ -3,6 +3,7 @@
# SPDX-License-Identifier: MIT
import argparse
import sys
import shutil
from textwrap import dedent
from .__about__ import __version__
from ._markitdown import MarkItDown, DocumentConverterResult
@@ -74,6 +75,8 @@ def main():
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
which_exiftool = shutil.which("exiftool")
if args.use_docintel:
if args.endpoint is None:
raise ValueError(
@@ -81,9 +84,11 @@ def main():
)
elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(docintel_endpoint=args.endpoint)
markitdown = MarkItDown(
exiftool_path=which_exiftool, docintel_endpoint=args.endpoint
)
else:
markitdown = MarkItDown()
markitdown = MarkItDown(exiftool_path=which_exiftool)
if args.filename is None:
result = markitdown.convert_stream(sys.stdin.buffer)

View File

@@ -91,7 +91,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
# Explicitly cast options to the expected type if necessary
super().__init__(**options)
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
def convert_hn(
self,
n: int,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
"""Same as usual, but be sure to start with a new line"""
if not convert_as_inline:
if not re.search(r"^\n", text):
@@ -99,7 +106,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
def convert_a(self, el: Any, text: str, convert_as_inline: bool):
def convert_a(
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
):
"""Same as usual converter, but removes Javascript links and escapes URIs."""
prefix, suffix, text = markdownify.chomp(text) # type: ignore
if not text:
@@ -135,7 +144,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
else text
)
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
def convert_img(
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
) -> str:
"""Same as usual converter, but removes data URIs"""
alt = el.attrs.get("alt", None) or ""
@@ -1752,6 +1763,8 @@ class MarkItDown:
ext = ext.strip()
if ext == "":
return
if ext in extensions:
return
# if ext not in extensions:
extensions.append(ext)