Cleanup and refactor, in preparation for plugin support. (#318)

* Work started moving converters to individual files.
* Significant cleanup and refactor.
* Moved everything to a packages subfolder.
* Added sample plugin.
* Added instructions to the README.md
* Bumped version, and added a note about compatibility.
This commit is contained in:
afourney
2025-02-10 15:21:44 -08:00
committed by GitHub
parent 73ba69d8cd
commit c73afcffea
60 changed files with 2755 additions and 1901 deletions

View File

@@ -0,0 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
# Version string of this package (PEP 440 alpha pre-release form).
__version__ = "0.0.1a2"

View File

@@ -0,0 +1,13 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
from ._plugin import __plugin_interface_version__, register_converters, RtfConverter
from .__about__ import __version__
# Names exported by `from <package> import *`; all are imported above.
__all__ = [
    "__version__",
    "__plugin_interface_version__",
    "register_converters",
    "RtfConverter",
]

View File

@@ -0,0 +1,39 @@
from typing import Union
from striprtf.striprtf import rtf_to_text
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult
# The version of the plugin interface that this plugin uses
__plugin_interface_version__ = 1
def register_converters(markitdown: MarkItDown, **kwargs):
    """
    Plugin entry point that attaches this plugin's converters to ``markitdown``.

    Called during construction of MarkItDown instances to register converters
    provided by plugins. Extra keyword arguments are accepted but not used here.
    """
    # This plugin contributes exactly one converter: an RtfConverter instance
    rtf_converter = RtfConverter()
    markitdown.register_converter(rtf_converter)
class RtfConverter(DocumentConverter):
    """
    Converts an RTF file to text in the simplest possible way.

    The RTF markup is stripped with ``striprtf``'s ``rtf_to_text``; no title
    is extracted.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """
        Convert the RTF file at ``local_path``.

        Args:
            local_path: Path of the file to read.
            **kwargs: Conversion options; only ``file_extension`` is consulted.

        Returns:
            ``None`` when ``file_extension`` is not ``.rtf`` (compared
            case-insensitively), otherwise a ``DocumentConverterResult`` whose
            ``text_content`` is the stripped text and whose ``title`` is ``None``.
        """
        # Bail if not an RTF file. The `or ""` also covers an explicit
        # file_extension=None, which would otherwise fail on .lower().
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".rtf":
            return None

        # Read the RTF file
        # NOTE(review): opened with the platform's default text encoding;
        # confirm that is intended for the RTF inputs this plugin will see.
        with open(local_path, "r") as f:
            rtf = f.read()

        # Return the result
        return DocumentConverterResult(
            title=None,
            text_content=rtf_to_text(rtf),
        )