Adjust warning filters and update dependencies (#1143)

Scopes warning filters to the affected library imports using warnings.catch_warnings(), instead of installing them globally
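Updates dependency constraints for magika (~=0.6.1) and youtube-transcript-api (~=1.0.0), and migrates the YouTube converter to the instance-based fetch() API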
Updates the version to 0.1.0a5 in __about__.py
This commit is contained in:
afourney
2025-03-19 22:09:14 -07:00
committed by GitHub
parent 716f74dcb9
commit cd6aa41361
4 changed files with 19 additions and 28 deletions

View File

@@ -27,7 +27,7 @@ dependencies = [
"beautifulsoup4",
"requests",
"markdownify",
"magika>=0.6.1rc3",
"magika~=0.6.1",
"charset-normalizer",
]
@@ -42,7 +42,7 @@ all = [
"olefile",
"pydub",
"SpeechRecognition",
"youtube-transcript-api",
"youtube-transcript-api~=1.0.0",
"azure-ai-documentintelligence",
"azure-identity"
]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.0a4"
__version__ = "0.1.0a5"

View File

@@ -7,20 +7,14 @@ from .._exceptions import MissingDependencyException
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
# Suppress some deprecation warnings from the speech_recognition library
# Suppress some warnings on library import
import warnings
warnings.filterwarnings(
"ignore", category=DeprecationWarning, module="speech_recognition"
)
warnings.filterwarnings(
"ignore",
category=SyntaxWarning,
module="pydub", # TODO: Migrate away from pydub
)
import speech_recognition as sr
import pydub
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=SyntaxWarning)
import speech_recognition as sr
import pydub
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()

View File

@@ -4,22 +4,21 @@ import time
import io
import re
import bs4
import warnings
from typing import Any, BinaryIO, Optional, Dict, List, Union
from urllib.parse import parse_qs, urlparse, unquote
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
# Optional YouTube transcription support
try:
warnings.filterwarnings(
"ignore",
category=SyntaxWarning,
module="youtube_transcript_api", # Patch submitted to youtube-transcript-api
)
from youtube_transcript_api import YouTubeTranscriptApi
# Suppress some warnings on library import
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=SyntaxWarning)
# Patch submitted upstream to fix the SyntaxWarning
from youtube_transcript_api import YouTubeTranscriptApi
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
except ModuleNotFoundError:
@@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
webpage_text += f"\n### Description\n{description}\n"
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
@@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
)
# Retry the transcript fetching operation
transcript = self._retry_operation(
lambda: YouTubeTranscriptApi.get_transcript(
lambda: ytt_api.fetch(
video_id, languages=youtube_transcript_languages
),
retries=3, # Retry 3 times
@@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
)
if transcript:
transcript_text = " ".join(
[part["text"] for part in transcript]
[part.text for part in transcript]
) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
# formatter.format_transcript(transcript)
except Exception as e:
print(f"Error fetching transcript: {e}")
if transcript_text: