* Updated DocumentConverter interface
* Updated all DocumentConverter classes
* Added support for various new audio files.
* Updated sample plugin to new DocumentConverter interface.
* Updated project README with notes about changes, and use-cases.
* Updated DocumentConverter documentation.
* Move priority to outside DocumentConverter, allowing them to be reprioritized, and keeping the DocumentConverter interface simple.

---------

Co-authored-by: Kenny Zhang <kzhang678@gmail.com>
79 lines
2.2 KiB
Python
79 lines
2.2 KiB
Python
import sys
|
|
import io
|
|
|
|
from typing import BinaryIO, Any
|
|
|
|
|
|
from ._html_converter import HtmlConverter
|
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
|
from .._stream_info import StreamInfo
|
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
|
|
|
|
|
# Try loading optional (but in this case, required) dependencies.
# An import failure is not raised here; it is recorded so that importing this
# module always succeeds and the error can be reported later, when a
# conversion is actually attempted.
_dependency_exc_info = None
try:
    import pdfminer
    import pdfminer.high_level
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()
|
|
|
|
|
|
# MIME type prefixes that identify a PDF stream. They are compared against the
# lower-cased MIME type with str.startswith, so parameters after the type
# (anything following the prefix) do not prevent a match.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/pdf",
    "application/x-pdf",
]

# File extensions that identify a PDF. They are compared against the
# lower-cased extension, so entries must be lowercase and include the dot.
ACCEPTED_FILE_EXTENSIONS = [".pdf"]
|
|
|
|
|
|
class PdfConverter(DocumentConverter):
    """
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Report whether the stream described by ``stream_info`` looks like a PDF.

        The decision uses only ``stream_info``; ``file_stream`` is not read.

        Returns:
            True when the lower-cased extension is in ACCEPTED_FILE_EXTENSIONS
            or the lower-cased MIME type starts with one of
            ACCEPTED_MIME_TYPE_PREFIXES; False otherwise.
        """
        # Either field may be None; normalize to a lower-cased string.
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple, so one call tests every prefix.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Extract the text of a PDF stream and wrap it in a DocumentConverterResult.

        Returns:
            A DocumentConverterResult whose ``markdown`` is the text returned
            by ``pdfminer.high_level.extract_text`` for ``file_stream``.

        Raises:
            MissingDependencyException: if importing pdfminer failed when this
                module was loaded. The original ImportError (with its
                traceback) is attached as the cause.
        """
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".pdf",
                    feature="pdf",
                )
            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        assert isinstance(file_stream, io.IOBase)  # for mypy
        return DocumentConverterResult(
            markdown=pdfminer.high_level.extract_text(file_stream),
        )
|