Adjust warning filters and update dependencies (#1143)
Adjusts warning filters to be more contextual. Updates dependencies for magika and youtube-transcript-api. Updates the version to 0.1.0a5 in __about__.py.
This commit is contained in:
@@ -27,7 +27,7 @@ dependencies = [
|
|||||||
"beautifulsoup4",
|
"beautifulsoup4",
|
||||||
"requests",
|
"requests",
|
||||||
"markdownify",
|
"markdownify",
|
||||||
"magika>=0.6.1rc3",
|
"magika~=0.6.1",
|
||||||
"charset-normalizer",
|
"charset-normalizer",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -42,7 +42,7 @@ all = [
|
|||||||
"olefile",
|
"olefile",
|
||||||
"pydub",
|
"pydub",
|
||||||
"SpeechRecognition",
|
"SpeechRecognition",
|
||||||
"youtube-transcript-api",
|
"youtube-transcript-api~=1.0.0",
|
||||||
"azure-ai-documentintelligence",
|
"azure-ai-documentintelligence",
|
||||||
"azure-identity"
|
"azure-identity"
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
__version__ = "0.1.0a4"
|
__version__ = "0.1.0a5"
|
||||||
|
|||||||
@@ -7,20 +7,14 @@ from .._exceptions import MissingDependencyException
|
|||||||
# Save reporting of any exceptions for later
|
# Save reporting of any exceptions for later
|
||||||
_dependency_exc_info = None
|
_dependency_exc_info = None
|
||||||
try:
|
try:
|
||||||
# Suppress some deprecation warnings from the speech_recognition library
|
# Suppress some warnings on library import
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
warnings.filterwarnings(
|
with warnings.catch_warnings():
|
||||||
"ignore", category=DeprecationWarning, module="speech_recognition"
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||||
)
|
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
||||||
warnings.filterwarnings(
|
import speech_recognition as sr
|
||||||
"ignore",
|
import pydub
|
||||||
category=SyntaxWarning,
|
|
||||||
module="pydub", # TODO: Migrate away from pydub
|
|
||||||
)
|
|
||||||
import speech_recognition as sr
|
|
||||||
|
|
||||||
import pydub
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# Preserve the error and stack trace for later
|
# Preserve the error and stack trace for later
|
||||||
_dependency_exc_info = sys.exc_info()
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|||||||
@@ -4,22 +4,21 @@ import time
|
|||||||
import io
|
import io
|
||||||
import re
|
import re
|
||||||
import bs4
|
import bs4
|
||||||
import warnings
|
|
||||||
from typing import Any, BinaryIO, Optional, Dict, List, Union
|
from typing import Any, BinaryIO, Optional, Dict, List, Union
|
||||||
from urllib.parse import parse_qs, urlparse, unquote
|
from urllib.parse import parse_qs, urlparse, unquote
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
from ._markdownify import _CustomMarkdownify
|
|
||||||
|
|
||||||
# Optional YouTube transcription support
|
# Optional YouTube transcription support
|
||||||
try:
|
try:
|
||||||
warnings.filterwarnings(
|
# Suppress some warnings on library import
|
||||||
"ignore",
|
import warnings
|
||||||
category=SyntaxWarning,
|
|
||||||
module="youtube_transcript_api", # Patch submitted to youtube-transcript-api
|
with warnings.catch_warnings():
|
||||||
)
|
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
||||||
from youtube_transcript_api import YouTubeTranscriptApi
|
# Patch submitted upstream to fix the SyntaxWarning
|
||||||
|
from youtube_transcript_api import YouTubeTranscriptApi
|
||||||
|
|
||||||
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
|
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
|
||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
@@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
|
|||||||
webpage_text += f"\n### Description\n{description}\n"
|
webpage_text += f"\n### Description\n{description}\n"
|
||||||
|
|
||||||
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
|
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
|
||||||
|
ytt_api = YouTubeTranscriptApi()
|
||||||
transcript_text = ""
|
transcript_text = ""
|
||||||
parsed_url = urlparse(stream_info.url) # type: ignore
|
parsed_url = urlparse(stream_info.url) # type: ignore
|
||||||
params = parse_qs(parsed_url.query) # type: ignore
|
params = parse_qs(parsed_url.query) # type: ignore
|
||||||
@@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
|
|||||||
)
|
)
|
||||||
# Retry the transcript fetching operation
|
# Retry the transcript fetching operation
|
||||||
transcript = self._retry_operation(
|
transcript = self._retry_operation(
|
||||||
lambda: YouTubeTranscriptApi.get_transcript(
|
lambda: ytt_api.fetch(
|
||||||
video_id, languages=youtube_transcript_languages
|
video_id, languages=youtube_transcript_languages
|
||||||
),
|
),
|
||||||
retries=3, # Retry 3 times
|
retries=3, # Retry 3 times
|
||||||
@@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
|
|||||||
)
|
)
|
||||||
if transcript:
|
if transcript:
|
||||||
transcript_text = " ".join(
|
transcript_text = " ".join(
|
||||||
[part["text"] for part in transcript]
|
[part.text for part in transcript]
|
||||||
) # type: ignore
|
) # type: ignore
|
||||||
# Alternative formatting:
|
|
||||||
# formatter = TextFormatter()
|
|
||||||
# formatter.format_transcript(transcript)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error fetching transcript: {e}")
|
print(f"Error fetching transcript: {e}")
|
||||||
if transcript_text:
|
if transcript_text:
|
||||||
|
|||||||
Reference in New Issue
Block a user