Support specifying YouTube transcript language

This commit is contained in:
narumi
2024-12-16 13:16:00 +08:00
parent 81e3f24acd
commit 695100d5d8

View File

@@ -344,8 +344,11 @@ class YouTubeConverter(DocumentConverter):
assert isinstance(params["v"][0], str)
video_id = str(params["v"][0])
try:
youtube_transcript_languages = kwargs.get(
"youtube_transcript_languages", ("en",)
)
# Must be a single transcript.
transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
@@ -1003,7 +1006,7 @@ class MarkItDown:
self._append_ext(extensions, g)
# Convert
result = self._convert(temp_path, extensions, url=response.url)
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
# Clean up
finally:
try: