Fetch YouTube Transcript
Extract transcript from a YouTube video URL using youtube-transcript-api.
Source Code
# Descriptor for this script: identifier, display name, description, the
# package it needs, and its positional command-line arguments.
# NOTE(review): presumably consumed by the tool runner hosting this script;
# kept as a plain literal in case it is parsed statically -- confirm.
metadata = {
    "id": "code:media.youtube.fetch",
    "name": "Fetch YouTube Transcript",
    "description": "Extract transcript from a YouTube video URL using youtube-transcript-api.",
    # Distribution that provides the `youtube_transcript_api` module imported
    # by this file.
    "packages": ["youtube-transcript-api"],
    "args": [
        {
            # Read by main() from sys.argv[1]; required.
            "name": "url",
            "type": "string",
            "description": "YouTube video URL (youtube.com/watch?v=, youtu.be/, etc.)",
            "position": 0,
        },
        {
            # Read by main() from sys.argv[2]; optional.
            "name": "output_path",
            "type": "string",
            "description": "Path to write transcript JSON output",
            "position": 1,
            # Keep in sync with the fallback path hard-coded in main().
            "default": "session/transcript.json",
        },
    ],
}
import sys
import re
import json
import os
from youtube_transcript_api import YouTubeTranscriptApi
def extract_video_id(url: str) -> str | None:
"""Extract video ID from various YouTube URL formats."""
patterns = [
r"(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/)([a-zA-Z0-9_-]{11})",
r"^([a-zA-Z0-9_-]{11})$", # Direct video ID
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
return match.group(1)
return None
def format_timestamp(seconds: float) -> str:
    """Format a number of seconds as ``HH:MM:SS``.

    The fractional part is truncated toward zero. Hours are zero-padded to
    at least two digits and are not capped, so 360000 seconds formats as
    ``100:00:00``. A negative value is formatted from its magnitude with a
    leading ``-`` rather than producing wrapped-around fields.

    Args:
        seconds: Offset in seconds.

    Returns:
        The formatted timestamp string.
    """
    whole = int(seconds)
    sign = "-" if whole < 0 else ""
    minutes, secs = divmod(abs(whole), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{secs:02d}"
def _describe_fetch_error(exc: Exception) -> str:
    """Return the user-facing message for a failed transcript fetch."""
    # Match on the exception's class name as well as its text: the library's
    # messages are human-readable sentences that are not guaranteed to
    # contain the class name, so text-only matching can miss the category.
    detail = str(exc)
    haystack = f"{type(exc).__name__}: {detail}"
    if "TranscriptsDisabled" in haystack or "disabled" in haystack.lower():
        return "Error: Transcripts are disabled for this video"
    if "NoTranscriptFound" in haystack:
        return "Error: No transcript found for this video"
    if "VideoUnavailable" in haystack:
        return "Error: Video is unavailable or private"
    return f"Error fetching transcript: {detail}"


def main():
    """Fetch a YouTube transcript and write it to a JSON file.

    Command-line arguments (``sys.argv``):
        1. YouTube URL or bare video ID (required).
        2. Output path (optional, defaults to ``session/transcript.json``).

    On success the transcript is written to the output path, two progress
    lines are printed, and a one-line JSON summary (``videoId``,
    ``entryCount``, ``language``, ``outputPath``) is printed last.

    Exits with status 1, after printing an ``Error...`` line, when the URL
    is missing or unrecognised, when the transcript cannot be fetched or is
    empty, or when the output file cannot be written.
    """
    if len(sys.argv) < 2:
        print("Error: YouTube URL is required")
        sys.exit(1)

    url = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else "session/transcript.json"

    # Extract video ID
    video_id = extract_video_id(url)
    if not video_id:
        print(f"Error: Could not extract video ID from URL: {url}")
        print("Supported formats: youtube.com/watch?v=, youtu.be/, youtube.com/embed/")
        sys.exit(1)

    print(f"Fetching transcript for video: {video_id}")

    # Only the network fetch is guarded here, so that later failures
    # (e.g. writing the file) are not misreported as fetch errors.
    try:
        # Use the new API style (fetch instead of get_transcript)
        api = YouTubeTranscriptApi()
        fetched = api.fetch(video_id)
        transcript = fetched.to_raw_data()
    except Exception as e:
        print(_describe_fetch_error(e))
        sys.exit(1)

    if not transcript:
        print("Error: No transcript available for this video")
        sys.exit(1)

    # Get language info if available
    language = getattr(fetched, 'language', 'unknown')
    is_generated = getattr(fetched, 'is_generated', None)

    # Format entries
    entries = [
        {
            "start": entry["start"],
            "duration": entry["duration"],
            "text": entry["text"],
            "timestamp": format_timestamp(entry["start"]),
        }
        for entry in transcript
    ]

    result = {
        "videoId": video_id,
        "url": url,
        "language": language,
        "autoGenerated": is_generated,
        "entryCount": len(entries),
        "entries": entries,
    }

    try:
        # Ensure output directory exists
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Write transcript to file
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)
    except OSError as e:
        print(f"Error writing transcript to {output_path}: {e}")
        sys.exit(1)

    print(f"✓ Fetched {len(entries)} transcript entries")
    print(f"✓ Written to: {output_path}")

    # Machine-readable summary, printed as the final line of output.
    print(json.dumps({
        "videoId": video_id,
        "entryCount": len(entries),
        "language": language,
        "outputPath": output_path
    }))
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()