2 Commits

Author SHA1 Message Date
afourney
cd6aa41361 Adjust warning filters and update dependencies (#1143)
Adjusts warning filters to be more contextual
Updates dependencies for magika and youtube-transcript-api
Updates the version to 0.1.0a5 in __about__.py
2025-03-19 22:09:14 -07:00
afourney
716f74dcb9 Consider anything with a charset as plain text-convertible. (#1142) 2025-03-19 20:46:35 -07:00
5 changed files with 35 additions and 35 deletions

View File

@@ -27,7 +27,7 @@ dependencies = [
"beautifulsoup4",
"requests",
"markdownify",
"magika>=0.6.1rc3",
"magika~=0.6.1",
"charset-normalizer",
]
@@ -42,7 +42,7 @@ all = [
"olefile",
"pydub",
"SpeechRecognition",
"youtube-transcript-api",
"youtube-transcript-api~=1.0.0",
"azure-ai-documentintelligence",
"azure-identity"
]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.0a4"
__version__ = "0.1.0a5"

View File

@@ -17,12 +17,16 @@ except ImportError:
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/",
"application/json",
"application/markdown",
]
# Mimetypes to ignore (commonly confused extensions)
IGNORE_MIME_TYPE_PREFIXES = [
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
ACCEPTED_FILE_EXTENSIONS = [
".txt",
".text",
".md",
".markdown",
".json",
".jsonl",
]
@@ -38,9 +42,14 @@ class PlainTextConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
for prefix in IGNORE_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return False
# If we have a charset, we can safely assume it's text
# With Magika in the earlier stages, this handles most cases
if stream_info.charset is not None:
return True
# Otherwise, check the mimetype and extension
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):

View File

@@ -7,20 +7,14 @@ from .._exceptions import MissingDependencyException
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
# Suppress some deprecation warnings from the speech_recognition library
# Suppress some warnings on library import
import warnings
warnings.filterwarnings(
"ignore", category=DeprecationWarning, module="speech_recognition"
)
warnings.filterwarnings(
"ignore",
category=SyntaxWarning,
module="pydub", # TODO: Migrate away from pydub
)
import speech_recognition as sr
import pydub
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=SyntaxWarning)
import speech_recognition as sr
import pydub
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()

View File

@@ -4,22 +4,21 @@ import time
import io
import re
import bs4
import warnings
from typing import Any, BinaryIO, Optional, Dict, List, Union
from urllib.parse import parse_qs, urlparse, unquote
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
# Optional YouTube transcription support
try:
warnings.filterwarnings(
"ignore",
category=SyntaxWarning,
module="youtube_transcript_api", # Patch submitted to youtube-transcript-api
)
from youtube_transcript_api import YouTubeTranscriptApi
# Suppress some warnings on library import
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=SyntaxWarning)
# Patch submitted upstream to fix the SyntaxWarning
from youtube_transcript_api import YouTubeTranscriptApi
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
except ModuleNotFoundError:
@@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
webpage_text += f"\n### Description\n{description}\n"
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
@@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
)
# Retry the transcript fetching operation
transcript = self._retry_operation(
lambda: YouTubeTranscriptApi.get_transcript(
lambda: ytt_api.fetch(
video_id, languages=youtube_transcript_languages
),
retries=3, # Retry 3 times
@@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
)
if transcript:
transcript_text = " ".join(
[part["text"] for part in transcript]
[part.text for part in transcript]
) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
# formatter.format_transcript(transcript)
except Exception as e:
print(f"Error fetching transcript: {e}")
if transcript_text: