Code

Fetch YouTube Transcript

Extract transcript from a YouTube video URL using youtube-transcript-api.

Source Code

# Descriptor for this script: identifier, display name, summary, package
# requirements, and the positional command-line arguments.
# NOTE(review): presumably consumed by an external tool runner -- the consumer
# is not visible in this file; confirm before changing the literal's shape.
metadata = {
    "id": "code:media.youtube.fetch",
    "name": "Fetch YouTube Transcript",
    "description": "Extract transcript from a YouTube video URL using youtube-transcript-api.",
    # Distribution name of the youtube_transcript_api module imported below.
    "packages": ["youtube-transcript-api"],
    "args": [
        {
            "name": "url",
            "type": "string",
            "description": "YouTube video URL (youtube.com/watch?v=, youtu.be/, etc.)",
            "position": 0,  # read by main() as sys.argv[1]
        },
        {
            "name": "output_path",
            "type": "string",
            "description": "Path to write transcript JSON output",
            "position": 1,  # read by main() as sys.argv[2]
            # Duplicated as the fallback literal in main(); keep the two in sync.
            "default": "session/transcript.json",
        },
    ],
}

import sys
import re
import json
import os
from youtube_transcript_api import YouTubeTranscriptApi

def extract_video_id(url: str) -> str | None:
    """Extract video ID from various YouTube URL formats."""
    patterns = [
        r"(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/)([a-zA-Z0-9_-]{11})",
        r"^([a-zA-Z0-9_-]{11})$",  # Direct video ID
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

def format_timestamp(seconds: float) -> str:
    """Render a second count as a zero-padded ``HH:MM:SS`` string.

    The fractional part is truncated (not rounded). Hours are not wrapped at
    24 and simply widen past two digits for long durations.
    """
    whole_seconds = int(seconds)
    hh, remainder = divmod(whole_seconds, 3600)
    mm, ss = divmod(remainder, 60)
    return f"{hh:02d}:{mm:02d}:{ss:02d}"

def _describe_fetch_error(exc: Exception) -> str:
    """Map an exception raised while fetching a transcript to an error line."""
    detail = str(exc)
    # The class name is not guaranteed to appear in str(exc), so classify on
    # the exception's class names (including base classes) as well as on the
    # message text that was matched historically.
    class_names = {cls.__name__ for cls in type(exc).__mro__}

    if (
        "TranscriptsDisabled" in class_names
        or "TranscriptsDisabled" in detail
        or "disabled" in detail.lower()
    ):
        return "Error: Transcripts are disabled for this video"
    if "NoTranscriptFound" in class_names or "NoTranscriptFound" in detail:
        return "Error: No transcript found for this video"
    if "VideoUnavailable" in class_names or "VideoUnavailable" in detail:
        return "Error: Video is unavailable or private"
    return f"Error fetching transcript: {detail}"


def main():
    """Fetch a YouTube transcript and write it to a JSON file.

    Command line: ``<url> [output_path]``. ``output_path`` defaults to
    ``session/transcript.json``.

    On success the transcript is written to ``output_path`` and a one-line
    JSON summary (videoId, entryCount, language, outputPath) is printed last.
    On any failure an ``Error...`` line is printed and the process exits with
    status 1.
    """
    if len(sys.argv) < 2:
        print("Error: YouTube URL is required")
        sys.exit(1)

    url = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else "session/transcript.json"

    # Extract video ID
    video_id = extract_video_id(url)
    if not video_id:
        print(f"Error: Could not extract video ID from URL: {url}")
        print("Supported formats: youtube.com/watch?v=, youtu.be/, youtube.com/embed/")
        sys.exit(1)

    print(f"Fetching transcript for video: {video_id}")

    # Only the network fetch is guarded here, so that file-system failures
    # further down are not misreported as transcript-fetch errors.
    try:
        # Use the new API style (fetch instead of get_transcript)
        api = YouTubeTranscriptApi()
        fetched = api.fetch(video_id)
        transcript = fetched.to_raw_data()
    except Exception as e:  # top-level CLI boundary: report and exit
        print(_describe_fetch_error(e))
        sys.exit(1)

    if not transcript:
        print("Error: No transcript available for this video")
        sys.exit(1)

    # Get language info if available
    language = getattr(fetched, "language", "unknown")
    is_generated = getattr(fetched, "is_generated", None)

    # Format entries
    entries = [
        {
            "start": entry["start"],
            "duration": entry["duration"],
            "text": entry["text"],
            "timestamp": format_timestamp(entry["start"]),
        }
        for entry in transcript
    ]

    result = {
        "videoId": video_id,
        "url": url,
        "language": language,
        "autoGenerated": is_generated,
        "entryCount": len(entries),
        "entries": entries,
    }

    try:
        # Ensure output directory exists (dirname is "" for a bare filename)
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Write transcript to file
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)
    except OSError as e:
        print(f"Error writing transcript to {output_path}: {e}")
        sys.exit(1)

    print(f"✓ Fetched {len(entries)} transcript entries")
    print(f"✓ Written to: {output_path}")
    print(json.dumps({
        "videoId": video_id,
        "entryCount": len(entries),
        "language": language,
        "outputPath": output_path
    }))

# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()