From 56f7579ce2f74eff2b75c5337b7ad85d64baa480 Mon Sep 17 00:00:00 2001 From: JoshClark-git <73681126+JoshClark-git@users.noreply.github.com> Date: Wed, 21 May 2025 10:17:57 -0700 Subject: [PATCH] FIX YouTube transcript errors (#1241) * FIX YouTube transcript errors * Fixed formatting. --------- Co-authored-by: Josh Co-authored-by: afourney --- .../converters/_youtube_converter.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index e08c4cb..c96e8f4 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -151,9 +151,14 @@ class YouTubeConverter(DocumentConverter): params = parse_qs(parsed_url.query) # type: ignore if "v" in params and params["v"][0]: video_id = str(params["v"][0]) + transcript_list = ytt_api.list(video_id) + languages = ["en"] + for transcript in transcript_list: + languages.append(transcript.language_code) + break try: youtube_transcript_languages = kwargs.get( - "youtube_transcript_languages", ("en",) + "youtube_transcript_languages", languages ) # Retry the transcript fetching operation transcript = self._retry_operation( @@ -163,12 +168,23 @@ class YouTubeConverter(DocumentConverter): retries=3, # Retry 3 times delay=2, # 2 seconds delay between retries ) + if transcript: transcript_text = " ".join( [part.text for part in transcript] ) # type: ignore except Exception as e: - print(f"Error fetching transcript: {e}") + # No transcript available + if len(languages) == 1: + print(f"Error fetching transcript: {e}") + else: + # Translate transcript into first kwarg + transcript = ( + transcript_list.find_transcript(languages) + .translate(youtube_transcript_languages[0]) + .fetch() + ) + transcript_text = " ".join([part.text for part in transcript]) if transcript_text: webpage_text += f"\n### Transcript\n{transcript_text}\n"