Adjust warning filters and update dependencies (#1143)
Adjusts warning filters to be more contextual. Updates dependencies for magika and youtube-transcript-api. Updates the version to 0.1.0a5 in __about__.py.
This commit is contained in:
@@ -27,7 +27,7 @@ dependencies = [
|
||||
"beautifulsoup4",
|
||||
"requests",
|
||||
"markdownify",
|
||||
"magika>=0.6.1rc3",
|
||||
"magika~=0.6.1",
|
||||
"charset-normalizer",
|
||||
]
|
||||
|
||||
@@ -42,7 +42,7 @@ all = [
|
||||
"olefile",
|
||||
"pydub",
|
||||
"SpeechRecognition",
|
||||
"youtube-transcript-api",
|
||||
"youtube-transcript-api~=1.0.0",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-identity"
|
||||
]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.0a4"
|
||||
__version__ = "0.1.0a5"
|
||||
|
||||
@@ -7,20 +7,14 @@ from .._exceptions import MissingDependencyException
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
# Suppress some deprecation warnings from the speech_recognition library
|
||||
# Suppress some warnings on library import
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings(
|
||||
"ignore", category=DeprecationWarning, module="speech_recognition"
|
||||
)
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
category=SyntaxWarning,
|
||||
module="pydub", # TODO: Migrate away from pydub
|
||||
)
|
||||
import speech_recognition as sr
|
||||
|
||||
import pydub
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
||||
import speech_recognition as sr
|
||||
import pydub
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
@@ -4,22 +4,21 @@ import time
|
||||
import io
|
||||
import re
|
||||
import bs4
|
||||
import warnings
|
||||
from typing import Any, BinaryIO, Optional, Dict, List, Union
|
||||
from urllib.parse import parse_qs, urlparse, unquote
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
# Optional YouTube transcription support
|
||||
try:
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
category=SyntaxWarning,
|
||||
module="youtube_transcript_api", # Patch submitted to youtube-transcript-api
|
||||
)
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
# Suppress some warnings on library import
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
||||
# Patch submitted upstream to fix the SyntaxWarning
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
|
||||
except ModuleNotFoundError:
|
||||
@@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
|
||||
webpage_text += f"\n### Description\n{description}\n"
|
||||
|
||||
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
|
||||
ytt_api = YouTubeTranscriptApi()
|
||||
transcript_text = ""
|
||||
parsed_url = urlparse(stream_info.url) # type: ignore
|
||||
params = parse_qs(parsed_url.query) # type: ignore
|
||||
@@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
|
||||
)
|
||||
# Retry the transcript fetching operation
|
||||
transcript = self._retry_operation(
|
||||
lambda: YouTubeTranscriptApi.get_transcript(
|
||||
lambda: ytt_api.fetch(
|
||||
video_id, languages=youtube_transcript_languages
|
||||
),
|
||||
retries=3, # Retry 3 times
|
||||
@@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
|
||||
)
|
||||
if transcript:
|
||||
transcript_text = " ".join(
|
||||
[part["text"] for part in transcript]
|
||||
[part.text for part in transcript]
|
||||
) # type: ignore
|
||||
# Alternative formatting:
|
||||
# formatter = TextFormatter()
|
||||
# formatter.format_transcript(transcript)
|
||||
except Exception as e:
|
||||
print(f"Error fetching transcript: {e}")
|
||||
if transcript_text:
|
||||
|
||||
Reference in New Issue
Block a user