From cd6aa41361d47e3a1eaefc3a176e5d3ca7ab9994 Mon Sep 17 00:00:00 2001 From: afourney Date: Wed, 19 Mar 2025 22:09:14 -0700 Subject: [PATCH] Adjust warning filters and update dependencies (#1143) Adjusts warning filters to be more contextual Updates dependencies for magika and youtube-transcript-api Updates the version to 0.1.0a5 in __about__.py --- packages/markitdown/pyproject.toml | 4 ++-- .../markitdown/src/markitdown/__about__.py | 2 +- .../converters/_transcribe_audio.py | 18 +++++---------- .../converters/_youtube_converter.py | 23 ++++++++----------- 4 files changed, 19 insertions(+), 28 deletions(-) diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index bd38193..9136108 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -27,7 +27,7 @@ dependencies = [ "beautifulsoup4", "requests", "markdownify", - "magika>=0.6.1rc3", + "magika~=0.6.1", "charset-normalizer", ] @@ -42,7 +42,7 @@ all = [ "olefile", "pydub", "SpeechRecognition", - "youtube-transcript-api", + "youtube-transcript-api~=1.0.0", "azure-ai-documentintelligence", "azure-identity" ] diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index e54f3bc..21790a2 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.1.0a4" +__version__ = "0.1.0a5" diff --git a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py index 4a9a521..d558e46 100644 --- a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py +++ b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py @@ -7,20 +7,14 @@ from .._exceptions import MissingDependencyException # Save reporting of any exceptions for later _dependency_exc_info = None try: - # Suppress some deprecation warnings from the speech_recognition library + # Suppress some warnings on library import import warnings - warnings.filterwarnings( - "ignore", category=DeprecationWarning, module="speech_recognition" - ) - warnings.filterwarnings( - "ignore", - category=SyntaxWarning, - module="pydub", # TODO: Migrate away from pydub - ) - import speech_recognition as sr - - import pydub + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + warnings.filterwarnings("ignore", category=SyntaxWarning) + import speech_recognition as sr + import pydub except ImportError: # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index 83ad57b..b5a014c 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -4,22 +4,21 @@ import time import io import re import bs4 -import warnings from typing import Any, BinaryIO, Optional, Dict, List, Union from urllib.parse import parse_qs, urlparse, unquote from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo -from ._markdownify import _CustomMarkdownify # Optional YouTube transcription support try: - warnings.filterwarnings( - "ignore", - category=SyntaxWarning, - module="youtube_transcript_api", # Patch submitted to youtube-transcript-api - ) - from youtube_transcript_api import YouTubeTranscriptApi + # Suppress some warnings on library import + import warnings + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=SyntaxWarning) + # Patch submitted upstream to fix the SyntaxWarning + from youtube_transcript_api import YouTubeTranscriptApi IS_YOUTUBE_TRANSCRIPT_CAPABLE = True except ModuleNotFoundError: @@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter): webpage_text += f"\n### Description\n{description}\n" if IS_YOUTUBE_TRANSCRIPT_CAPABLE: + ytt_api = YouTubeTranscriptApi() transcript_text = "" parsed_url = urlparse(stream_info.url) # type: ignore params = parse_qs(parsed_url.query) # type: ignore @@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter): ) # Retry the transcript fetching operation transcript = self._retry_operation( - lambda: YouTubeTranscriptApi.get_transcript( + lambda: ytt_api.fetch( video_id, languages=youtube_transcript_languages ), retries=3, # Retry 3 times @@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter): ) if transcript: transcript_text = " ".join( - [part["text"] for part in transcript] + [part.text for part in transcript] ) # type: ignore - # Alternative formatting: - # formatter = TextFormatter() - # formatter.format_transcript(transcript) except Exception as e: print(f"Error fetching transcript: {e}") if transcript_text: