Adjust warning filters and update dependencies (#1143)

Adjusts warning filters to be more contextual. Updates dependencies for magika and youtube-transcript-api. Updates the version to 0.1.0a5 in __about__.py.
Consider anything with a charset as plain text-convertible. (#1142)
2025-03-19 22:09:14 -07:00 · 2025-03-19 20:46:35 -07:00
5 changed files with 35 additions and 35 deletions
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -27,7 +27,7 @@ dependencies = [
  "beautifulsoup4",
  "requests",
  "markdownify",
-  "magika>=0.6.1rc3",
+  "magika~=0.6.1",
  "charset-normalizer",
 ]

@@ -42,7 +42,7 @@ all = [
  "olefile",
  "pydub",
  "SpeechRecognition",
-  "youtube-transcript-api",
+  "youtube-transcript-api~=1.0.0",
  "azure-ai-documentintelligence",
  "azure-identity"
 ]
--- a/packages/markitdown/src/markitdown/__about__.py
+++ b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.0a4"
+__version__ = "0.1.0a5"
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -17,12 +17,16 @@ except ImportError:
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/",
    "application/json",
+    "application/markdown",
 ]

-# Mimetypes to ignore (commonly confused extensions)
-IGNORE_MIME_TYPE_PREFIXES = [
-    "text/vnd.in3d.spot",  # .spo wich is confused with xls, doc, etc.
-    "text/vnd.graphviz",  # .dot which is confused with xls, doc, etc.
+ACCEPTED_FILE_EXTENSIONS = [
+    ".txt",
+    ".text",
+    ".md",
+    ".markdown",
+    ".json",
+    ".jsonl",
 ]


@@ -38,9 +42,14 @@ class PlainTextConverter(DocumentConverter):
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

-        for prefix in IGNORE_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return False
+        # If we have a charset, we can safely assume it's text
+        # With Magika in the earlier stages, this handles most cases
+        if stream_info.charset is not None:
+            return True
+
+        # Otherwise, check the mimetype and extension
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
--- a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
+++ b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
@@ -7,20 +7,14 @@ from .._exceptions import MissingDependencyException
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
-    # Suppress some deprecation warnings from the speech_recognition library
+    # Suppress some warnings on library import
    import warnings

-    warnings.filterwarnings(
-        "ignore", category=DeprecationWarning, module="speech_recognition"
-    )
-    warnings.filterwarnings(
-        "ignore",
-        category=SyntaxWarning,
-        module="pydub",  # TODO: Migrate away from pydub
-    )
-    import speech_recognition as sr
-
-    import pydub
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=DeprecationWarning)
+        warnings.filterwarnings("ignore", category=SyntaxWarning)
+        import speech_recognition as sr
+        import pydub
 except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -4,22 +4,21 @@ import time
 import io
 import re
 import bs4
-import warnings
 from typing import Any, BinaryIO, Optional, Dict, List, Union
 from urllib.parse import parse_qs, urlparse, unquote

 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
-from ._markdownify import _CustomMarkdownify

 # Optional YouTube transcription support
 try:
-    warnings.filterwarnings(
-        "ignore",
-        category=SyntaxWarning,
-        module="youtube_transcript_api",  # Patch submitted to youtube-transcript-api
-    )
-    from youtube_transcript_api import YouTubeTranscriptApi
+    # Suppress some warnings on library import
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=SyntaxWarning)
+        # Patch submitted upstream to fix the SyntaxWarning
+        from youtube_transcript_api import YouTubeTranscriptApi

    IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
 except ModuleNotFoundError:
@@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
            webpage_text += f"\n### Description\n{description}\n"

        if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
+            ytt_api = YouTubeTranscriptApi()
            transcript_text = ""
            parsed_url = urlparse(stream_info.url)  # type: ignore
            params = parse_qs(parsed_url.query)  # type: ignore
@@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
                    )
                    # Retry the transcript fetching operation
                    transcript = self._retry_operation(
-                        lambda: YouTubeTranscriptApi.get_transcript(
+                        lambda: ytt_api.fetch(
                            video_id, languages=youtube_transcript_languages
                        ),
                        retries=3,  # Retry 3 times
@@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
                    )
                    if transcript:
                        transcript_text = " ".join(
-                            [part["text"] for part in transcript]
+                            [part.text for part in transcript]
                        )  # type: ignore
-                    # Alternative formatting:
-                    # formatter = TextFormatter()
-                    # formatter.format_transcript(transcript)
                except Exception as e:
                    print(f"Error fetching transcript: {e}")
            if transcript_text:
Author	SHA1	Message	Date
afourney	cd6aa41361	Adjust warning filters and update dependencies (#1143). Adjusts warning filters to be more contextual. Updates dependencies for magika and youtube-transcript-api. Updates the version to 0.1.0a5 in __about__.py.	2025-03-19 22:09:14 -07:00
afourney	716f74dcb9	Consider anything with a charset as plain text-convertible. (#1142)	2025-03-19 20:46:35 -07:00