All files / src transcript.ts

90.27% Statements 65/72
68.18% Branches 15/22
92.85% Functions 13/14
90.14% Lines 64/71

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210                                                        14x 13x 13x 6x 5x             18x         18x   18x   2x 16x   4x 4x 2x 2x   4x     18x             13x 13x 1x   13x   13x 13x             13x   13x 7x     6x   6x       6x       6x             6x 6x 6x                   7x 1x   6x 5x   1x             6x 1x     5x 5x       5x       5x             1x 1x 1x     1x 1x 1x   1x                         5x 5x     5x   5x   5x       5x 5x   5x                       14x 10x     4x 4x 3x     1x      
import { CONSTANTS } from './constants.js';
import {
  TranscriptError,
  RateLimitError,
  VideoUnavailableError,
  TranscriptDisabledError,
  NoTranscriptError,
  LanguageNotFoundError,
} from './errors.js';
import { TranscriptConfig, TranscriptSegment, CaptionsData } from './types.js';
import fetch, { RequestInit } from 'node-fetch';
import { HttpsProxyAgent } from 'https-proxy-agent';
import { URL } from 'url';
 
/**
 * Service class for fetching YouTube video transcripts.
 *
 * All members are static. The only public entry point is
 * {@link YoutubeTranscript.fetchTranscript}; the private helpers each handle
 * one step: extracting the video ID, downloading the watch page, pulling the
 * embedded captions JSON out of it, choosing a caption track, and downloading
 * and parsing that track.
 */
export class YoutubeTranscript {
  /**
   * Fetches the transcript for a YouTube video.
   *
   * @param videoId Video URL or ID
   * @param config Configuration options (language, proxy settings)
   * @returns Array of transcript segments
   * @throws TranscriptError if no video ID can be extracted from `videoId`
   * @throws RateLimitError if the watch page contains a reCAPTCHA challenge
   * @throws VideoUnavailableError if the watch page has no playability status
   * @throws TranscriptDisabledError if the page carries no usable captions data
   * @throws NoTranscriptError if no caption track exists or its download fails
   * @throws LanguageNotFoundError if `config.lang` is not among the tracks
   */
  public static async fetchTranscript(
    videoId: string,
    config?: TranscriptConfig
  ): Promise<TranscriptSegment[]> {
    const identifier = this.retrieveVideoId(videoId);
    const pageContent = await this.fetchVideoPage(identifier, config);
    // The caller-supplied `videoId` (not the extracted identifier) is what the
    // error constructors receive, so errors echo exactly what the caller passed.
    const captionsData = this.parseCaptionsData(pageContent, videoId);
    const transcriptUrl = this.getTranscriptUrl(captionsData, videoId, config?.lang);
    // The first listed track's language is reported on segments when the
    // caller did not request a specific language.
    const defaultLang =
      captionsData.playerCaptionsTracklistRenderer.captionTracks[0]?.languageCode;
    return this.fetchAndParseTranscript(transcriptUrl, config?.lang, defaultLang, config);
  }

  /**
   * Creates fetch options with proxy configuration if provided.
   *
   * @param config Optional configuration; `proxyAgent` takes precedence over `proxy`
   * @param extraHeaders Headers merged over the default `User-Agent` header
   * @returns Request options carrying the headers and, when configured, an agent
   */
  private static getFetchOptions(config?: TranscriptConfig, extraHeaders: Record<string, string> = {}): RequestInit {
    const headers = {
      'User-Agent': CONSTANTS.USER_AGENT,
      ...extraHeaders,
    };

    const options: RequestInit & { agent?: any } = { headers };

    if (config?.proxyAgent) {
      // Use pre-configured proxy agent if provided
      options.agent = config.proxyAgent;
    } else if (config?.proxy) {
      // Otherwise, create a proxy agent from the proxy configuration
      const proxyUrl = new URL(config.proxy.host);
      if (config.proxy.auth) {
        proxyUrl.username = config.proxy.auth.username;
        proxyUrl.password = config.proxy.auth.password;
      }
      options.agent = new HttpsProxyAgent(proxyUrl.toString());
    }

    return options;
  }

  /**
   * Fetches the video page content.
   *
   * @param videoId The extracted video ID, interpolated into the watch URL
   * @param config Optional configuration; `lang` is sent as `Accept-Language`
   * @returns The response body as text (the HTTP status is not inspected here)
   */
  private static async fetchVideoPage(videoId: string, config?: TranscriptConfig): Promise<string> {
    const extraHeaders: Record<string, string> = {};
    if (config?.lang) {
      extraHeaders['Accept-Language'] = config.lang;
    }
    const options = this.getFetchOptions(config, extraHeaders);

    const response = await fetch(`https://www.youtube.com/watch?v=${videoId}`, options);
    return response.text();
  }

  /**
   * Extracts and validates captions data from the video page.
   *
   * @param pageContent Raw text of the watch page
   * @param videoId Identifier passed to any error that is thrown
   * @returns The parsed captions data, guaranteed to expose `captionTracks`
   * @throws RateLimitError | VideoUnavailableError | TranscriptDisabledError
   *   when the page has no `"captions":` section (see `handlePageErrors`)
   * @throws TranscriptDisabledError if the captions section is not parseable JSON
   * @throws NoTranscriptError if the parsed data has no caption track list
   */
  private static parseCaptionsData(pageContent: string, videoId: string): CaptionsData {
    const htmlParts = pageContent.split('"captions":');

    if (htmlParts.length <= 1) {
      // Never returns: always throws one of the page-level errors.
      this.handlePageErrors(pageContent, videoId);
    }

    const captionsData = this.extractCaptionsJson(htmlParts[1]);

    if (!captionsData) {
      throw new TranscriptDisabledError(videoId);
    }

    // Guard the renderer itself before using `in`: applying `in` to an
    // undefined value would raise a raw TypeError instead of a domain error.
    const renderer = captionsData.playerCaptionsTracklistRenderer;
    if (!renderer || !('captionTracks' in renderer)) {
      throw new NoTranscriptError(videoId);
    }

    return captionsData;
  }

  /**
   * Extracts captions JSON data from the page content.
   *
   * @param captionsSection Page text that follows the `"captions":` marker
   * @returns The parsed JSON, or `undefined` when the text cannot be parsed
   */
  private static extractCaptionsJson(captionsSection: string): CaptionsData | undefined {
    try {
      // The captions object ends where the sibling `videoDetails` key begins.
      // A global regex is required: a string pattern would strip only the
      // first newline.
      const jsonStr = captionsSection.split(',"videoDetails')[0].replace(/\n/g, '');
      return JSON.parse(jsonStr);
    } catch {
      // Parse failure (or a missing section) is reported as "no data".
      return undefined;
    }
  }

  /**
   * Handles various error cases from the video page.
   *
   * Checks are ordered: a reCAPTCHA marker wins, then a missing playability
   * status, and anything else is treated as transcripts being disabled.
   *
   * @param pageContent Raw text of the watch page
   * @param videoId Identifier passed to the thrown error
   * @throws RateLimitError | VideoUnavailableError | TranscriptDisabledError always
   */
  private static handlePageErrors(pageContent: string, videoId: string): never {
    if (pageContent.includes('class="g-recaptcha"')) {
      throw new RateLimitError();
    }
    if (!pageContent.includes('"playabilityStatus":')) {
      throw new VideoUnavailableError(videoId);
    }
    throw new TranscriptDisabledError(videoId);
  }

  /**
   * Gets the URL for the transcript in the requested language.
   *
   * @param captionsData Parsed captions data containing the track list
   * @param videoId Identifier passed to any error that is thrown
   * @param requestedLang Language code to select; the first track is used when omitted
   * @returns The `baseUrl` of the selected caption track
   * @throws LanguageNotFoundError if `requestedLang` matches no track
   * @throws NoTranscriptError if no track could be selected
   */
  private static getTranscriptUrl(captionsData: CaptionsData, videoId: string, requestedLang?: string): string {
    if (requestedLang) {
      this.validateLanguageAvailability(captionsData, requestedLang, videoId);
    }

    const tracks = captionsData.playerCaptionsTracklistRenderer.captionTracks;
    const track = requestedLang
      ? tracks.find((track) => track.languageCode === requestedLang)
      : tracks[0];

    if (!track) {
      throw new NoTranscriptError(videoId);
    }

    return track.baseUrl;
  }

  /**
   * Validates that the requested language is available.
   *
   * @param captionsData Parsed captions data containing the track list
   * @param lang Language code that must match a track's `languageCode`
   * @param videoId Identifier passed to the thrown error
   * @throws LanguageNotFoundError listing the available language codes
   */
  private static validateLanguageAvailability(captionsData: CaptionsData, lang: string, videoId: string): void {
    const tracks = captionsData.playerCaptionsTracklistRenderer.captionTracks;
    const isLanguageAvailable = tracks.some(
      (track) => track.languageCode === lang
    );

    if (!isLanguageAvailable) {
      const availableLanguages = tracks.map(
        (track) => track.languageCode
      );
      throw new LanguageNotFoundError(lang, availableLanguages, videoId);
    }
  }

  /**
   * Fetches and parses the transcript XML.
   *
   * @param transcriptUrl URL of the caption track to download
   * @param requestedLang Language requested by the caller, sent as `Accept-Language`
   * @param defaultLang Language reported on segments when none was requested
   * @param config Optional configuration used to build the fetch options
   * @returns One segment per match of `CONSTANTS.TRANSCRIPT_XML_REGEX`
   * @throws NoTranscriptError if the response status is not OK
   */
  private static async fetchAndParseTranscript(
    transcriptUrl: string,
    requestedLang?: string,
    defaultLang?: string,
    config?: TranscriptConfig
  ): Promise<TranscriptSegment[]> {
    const extraHeaders: Record<string, string> = {};
    if (requestedLang) {
      extraHeaders['Accept-Language'] = requestedLang;
    }
    const options = this.getFetchOptions(config, extraHeaders);

    const response = await fetch(transcriptUrl, options);

    if (!response.ok) {
      throw new NoTranscriptError(transcriptUrl);
    }

    const transcriptText = await response.text();
    const matches = [...transcriptText.matchAll(CONSTANTS.TRANSCRIPT_XML_REGEX)];

    // Capture groups: 1 = offset, 2 = duration, 3 = text.
    return matches.map(match => ({
      text: match[3],
      duration: parseFloat(match[2]),
      offset: parseFloat(match[1]),
      lang: requestedLang ?? defaultLang,
    }));
  }

  /**
   * Extracts the video ID from either a full URL or direct ID.
   *
   * @param videoId Video URL or ID
   * @returns The input unchanged when it is exactly 11 characters long,
   *   otherwise the first capture group of `CONSTANTS.VIDEO_ID_REGEX`
   * @throws TranscriptError if the input is not 11 characters and the regex
   *   yields no ID
   */
  private static retrieveVideoId(videoId: string): string {
    // An 11-character input is taken to already be an ID; no further
    // validation is applied to it.
    if (videoId.length === 11) {
      return videoId;
    }

    const match = videoId.match(CONSTANTS.VIDEO_ID_REGEX);
    if (match?.[1]) {
      return match[1];
    }

    throw new TranscriptError('Could not extract YouTube video ID from the provided string');
  }
}