Transcribe YouTube Audio via ElevenLabs
Downloads the audio track of a YouTube video and transcribes it with the ElevenLabs Speech-to-Text API. Intended as a fallback for videos that have no captions available.
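Requires the @distube/ytdl-core package and Node 18+ (for the built-in fetch). A minimal invocation, assuming the script is saved as transcribe-elevenlabs.mjs (the filename and example URL are illustrative):

npm install @distube/ytdl-core
node transcribe-elevenlabs.mjs "https://www.youtube.com/watch?v=dQw4w9WgXcQ"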
Source Code
import ytdl from "@distube/ytdl-core";
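// Replace the placeholder with a real ElevenLabs API key before running (or load it from an environment variable).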
const ELEVENLABS_API_KEY = "PLACEHOLDER_TOKEN";
const MAX_DURATION_SECONDS = 3600; // 1 hour
const [url] = process.argv.slice(2);
if (!url) {
  console.error("Error: YouTube URL is required.");
  process.exit(1);
}
// Extract video ID
const videoIdMatch = url.match(
  /(?:https?:\/\/)?(?:www\.)?(?:m\.)?(?:youtube\.com|youtu\.be)\/(?:watch\?v=|embed\/|v\/|)([\w-]{11})(?:\S+)?/
);
if (!videoIdMatch || !videoIdMatch[1]) {
  console.error("Error: Invalid YouTube URL provided.");
  process.exit(1);
}
const videoId = videoIdMatch[1];
async function main() {
  console.log(`Fetching video info for: ${videoId}`);
  // Get video info to check duration
  const info = await ytdl.getInfo(videoId);
  const durationSeconds = parseInt(info.videoDetails.lengthSeconds, 10);
  const title = info.videoDetails.title;
  console.log(`Video: "${title}"`);
  console.log(`Duration: ${Math.floor(durationSeconds / 60)}m ${durationSeconds % 60}s`);
  if (durationSeconds > MAX_DURATION_SECONDS) {
    console.error(
      `Error: Video is ${Math.floor(durationSeconds / 60)} minutes. Maximum supported duration is ${MAX_DURATION_SECONDS / 60} minutes.`
    );
    process.exit(1);
  }
  console.log("Downloading audio stream...");
  // Get audio-only stream
  const audioStream = ytdl(videoId, {
    quality: "lowestaudio", // Smallest audio file for faster upload
    filter: "audioonly",
  });
  // Collect audio data into a buffer
  const chunks = [];
  for await (const chunk of audioStream) {
    chunks.push(chunk);
  }
  const audioBuffer = Buffer.concat(chunks);
  console.log(`✓ Audio downloaded: ${(audioBuffer.length / 1024 / 1024).toFixed(2)} MB`);
console.log("Sending to ElevenLabs for transcription...");
// Prepare multipart form data
const boundary = "----ElevenLabsBoundary" + Date.now();
const formData = [];
// Add audio file
formData.push(`--${boundary}\r\n`);
formData.push(`Content-Disposition: form-data; name="file"; filename="audio.webm"\r\n`);
formData.push(`Content-Type: audio/webm\r\n\r\n`);
const header = Buffer.from(formData.join(""));
const footer = Buffer.from(`\r\n--${boundary}--\r\n`);
const body = Buffer.concat([header, audioBuffer, footer]);
  // Call ElevenLabs Speech-to-Text API
  const response = await fetch("https://api.elevenlabs.io/v1/speech-to-text", {
    method: "POST",
    headers: {
      "xi-api-key": ELEVENLABS_API_KEY,
      "Content-Type": `multipart/form-data; boundary=${boundary}`,
    },
    body,
  });
  if (!response.ok) {
    const errorText = await response.text();
    console.error(`ElevenLabs API error (${response.status}): ${errorText}`);
    process.exit(1);
  }
  const result = await response.json();
  // Transform to match our standard transcript format
  const output = {
    videoId,
    title,
    source: "elevenlabs",
    entries: [],
  };
  // ElevenLabs returns { text, words: [...] }, where each word entry carries start/end timestamps
  if (result.words && result.words.length > 0) {
    // Group words into sentence-like chunks (roughly 10 words each)
    const wordsPerChunk = 10;
    for (let i = 0; i < result.words.length; i += wordsPerChunk) {
      const chunkWords = result.words.slice(i, i + wordsPerChunk);
      // Current responses expose the token as `text`; keep `word` as a fallback for older shapes
      const text = chunkWords.map((w) => w.text ?? w.word).join(" ");
      const start = chunkWords[0].start;
      const end = chunkWords[chunkWords.length - 1].end;
      output.entries.push({
        start,
        duration: end - start,
        text,
      });
    }
  } else if (result.text) {
    // Fallback if no word-level timestamps are returned
    output.entries.push({
      start: 0,
      duration: durationSeconds,
      text: result.text,
    });
  }
  console.log(`✓ Transcription complete: ${output.entries.length} segments`);
  console.log(JSON.stringify(output, null, 2));
}
main().catch((err) => {
  console.error("Error:", err.message);
  process.exit(1);
});
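The script prints the transcript to stdout as JSON in the format built in main(). A sketch of the shape, with illustrative values only:

{
  "videoId": "dQw4w9WgXcQ",
  "title": "Example Video Title",
  "source": "elevenlabs",
  "entries": [
    { "start": 0.12, "duration": 3.4, "text": "first ten words of the transcription grouped into one segment" }
  ]
}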