From cd6aa41361d47e3a1eaefc3a176e5d3ca7ab9994 Mon Sep 17 00:00:00 2001
From: afourney <adamfo@microsoft.com>
Date: Wed, 19 Mar 2025 22:09:14 -0700
Subject: [PATCH] Adjust warning filters and update dependencies (#1143)

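Scopes the import-time warning filters with warnings.catch_warnings() so they apply only while importing speech_recognition, pydub and youtube_transcript_api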
Pins magika to ~=0.6.1 and youtube-transcript-api to ~=1.0.0, and switches the YouTube converter to the instance-based YouTubeTranscriptApi().fetch() API
Updates the version to 0.1.0a5 in __about__.py
---
 packages/markitdown/pyproject.toml            |  4 ++--
 .../markitdown/src/markitdown/__about__.py    |  2 +-
 .../converters/_transcribe_audio.py           | 18 +++++----------
 .../converters/_youtube_converter.py          | 23 ++++++++-----------
 4 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
index bd38193..9136108 100644
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -27,7 +27,7 @@ dependencies = [
   "beautifulsoup4",
   "requests",
   "markdownify",
-  "magika>=0.6.1rc3",
+  "magika~=0.6.1",
   "charset-normalizer",
 ]
 
@@ -42,7 +42,7 @@ all = [
   "olefile",
   "pydub",
   "SpeechRecognition",
-  "youtube-transcript-api",
+  "youtube-transcript-api~=1.0.0",
   "azure-ai-documentintelligence",
   "azure-identity"
 ]
diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
index e54f3bc..21790a2 100644
--- a/packages/markitdown/src/markitdown/__about__.py
+++ b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.0a4"
+__version__ = "0.1.0a5"
diff --git a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
index 4a9a521..d558e46 100644
--- a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
+++ b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
@@ -7,20 +7,14 @@ from .._exceptions import MissingDependencyException
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
-    # Suppress some deprecation warnings from the speech_recognition library
+    # Suppress some warnings on library import
     import warnings
 
-    warnings.filterwarnings(
-        "ignore", category=DeprecationWarning, module="speech_recognition"
-    )
-    warnings.filterwarnings(
-        "ignore",
-        category=SyntaxWarning,
-        module="pydub",  # TODO: Migrate away from pydub
-    )
-    import speech_recognition as sr
-
-    import pydub
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=DeprecationWarning)
+        warnings.filterwarnings("ignore", category=SyntaxWarning)
+        import speech_recognition as sr
+        import pydub
 except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
index 83ad57b..b5a014c 100644
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -4,22 +4,21 @@ import time
 import io
 import re
 import bs4
-import warnings
 from typing import Any, BinaryIO, Optional, Dict, List, Union
 from urllib.parse import parse_qs, urlparse, unquote
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
-from ._markdownify import _CustomMarkdownify
 
 # Optional YouTube transcription support
 try:
-    warnings.filterwarnings(
-        "ignore",
-        category=SyntaxWarning,
-        module="youtube_transcript_api",  # Patch submitted to youtube-transcript-api
-    )
-    from youtube_transcript_api import YouTubeTranscriptApi
+    # Suppress some warnings on library import
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=SyntaxWarning)
+        # Patch submitted upstream to fix the SyntaxWarning
+        from youtube_transcript_api import YouTubeTranscriptApi
 
     IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
 except ModuleNotFoundError:
@@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
             webpage_text += f"\n### Description\n{description}\n"
 
         if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
+            ytt_api = YouTubeTranscriptApi()
             transcript_text = ""
             parsed_url = urlparse(stream_info.url)  # type: ignore
             params = parse_qs(parsed_url.query)  # type: ignore
@@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
                     )
                     # Retry the transcript fetching operation
                     transcript = self._retry_operation(
-                        lambda: YouTubeTranscriptApi.get_transcript(
+                        lambda: ytt_api.fetch(
                             video_id, languages=youtube_transcript_languages
                         ),
                         retries=3,  # Retry 3 times
@@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
                     )
                     if transcript:
                         transcript_text = " ".join(
-                            [part["text"] for part in transcript]
+                            [part.text for part in transcript]
                         )  # type: ignore
-                    # Alternative formatting:
-                    # formatter = TextFormatter()
-                    # formatter.format_transcript(transcript)
                 except Exception as e:
                     print(f"Error fetching transcript: {e}")
             if transcript_text: