Adjust warning filters and update dependencies (#1143)

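Scopes warning filters to the optional-library imports (speech_recognition, pydub, youtube_transcript_api) by applying them inside warnings.catch_warnings(), instead of installing global filters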
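Updates dependencies: pins magika to ~=0.6.1 and youtube-transcript-api to ~=1.0.0, and migrates the YouTube converter to the 1.0 API (YouTubeTranscriptApi().fetch, attribute access on transcript parts)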
Bumps the version from 0.1.0a4 to 0.1.0a5 in __about__.py
This commit is contained in:
afourney
2025-03-19 22:09:14 -07:00
committed by GitHub
parent 716f74dcb9
commit cd6aa41361
4 changed files with 19 additions and 28 deletions

View File

@@ -27,7 +27,7 @@ dependencies = [
"beautifulsoup4", "beautifulsoup4",
"requests", "requests",
"markdownify", "markdownify",
"magika>=0.6.1rc3", "magika~=0.6.1",
"charset-normalizer", "charset-normalizer",
] ]
@@ -42,7 +42,7 @@ all = [
"olefile", "olefile",
"pydub", "pydub",
"SpeechRecognition", "SpeechRecognition",
"youtube-transcript-api", "youtube-transcript-api~=1.0.0",
"azure-ai-documentintelligence", "azure-ai-documentintelligence",
"azure-identity" "azure-identity"
] ]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.1.0a4" __version__ = "0.1.0a5"

View File

@@ -7,19 +7,13 @@ from .._exceptions import MissingDependencyException
# Save reporting of any exceptions for later # Save reporting of any exceptions for later
_dependency_exc_info = None _dependency_exc_info = None
try: try:
# Suppress some deprecation warnings from the speech_recognition library # Suppress some warnings on library import
import warnings import warnings
warnings.filterwarnings( with warnings.catch_warnings():
"ignore", category=DeprecationWarning, module="speech_recognition" warnings.filterwarnings("ignore", category=DeprecationWarning)
) warnings.filterwarnings("ignore", category=SyntaxWarning)
warnings.filterwarnings(
"ignore",
category=SyntaxWarning,
module="pydub", # TODO: Migrate away from pydub
)
import speech_recognition as sr import speech_recognition as sr
import pydub import pydub
except ImportError: except ImportError:
# Preserve the error and stack trace for later # Preserve the error and stack trace for later

View File

@@ -4,21 +4,20 @@ import time
import io import io
import re import re
import bs4 import bs4
import warnings
from typing import Any, BinaryIO, Optional, Dict, List, Union from typing import Any, BinaryIO, Optional, Dict, List, Union
from urllib.parse import parse_qs, urlparse, unquote from urllib.parse import parse_qs, urlparse, unquote
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
# Optional YouTube transcription support # Optional YouTube transcription support
try: try:
warnings.filterwarnings( # Suppress some warnings on library import
"ignore", import warnings
category=SyntaxWarning,
module="youtube_transcript_api", # Patch submitted to youtube-transcript-api with warnings.catch_warnings():
) warnings.filterwarnings("ignore", category=SyntaxWarning)
# Patch submitted upstream to fix the SyntaxWarning
from youtube_transcript_api import YouTubeTranscriptApi from youtube_transcript_api import YouTubeTranscriptApi
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
@@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
webpage_text += f"\n### Description\n{description}\n" webpage_text += f"\n### Description\n{description}\n"
if IS_YOUTUBE_TRANSCRIPT_CAPABLE: if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = "" transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore params = parse_qs(parsed_url.query) # type: ignore
@@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
) )
# Retry the transcript fetching operation # Retry the transcript fetching operation
transcript = self._retry_operation( transcript = self._retry_operation(
lambda: YouTubeTranscriptApi.get_transcript( lambda: ytt_api.fetch(
video_id, languages=youtube_transcript_languages video_id, languages=youtube_transcript_languages
), ),
retries=3, # Retry 3 times retries=3, # Retry 3 times
@@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
) )
if transcript: if transcript:
transcript_text = " ".join( transcript_text = " ".join(
[part["text"] for part in transcript] [part.text for part in transcript]
) # type: ignore ) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
# formatter.format_transcript(transcript)
except Exception as e: except Exception as e:
print(f"Error fetching transcript: {e}") print(f"Error fetching transcript: {e}")
if transcript_text: if transcript_text: