All files / src transcript.ts

90.27% Statements 65/72
68.18% Branches 15/22
92.85% Functions 13/14
90.14% Lines 64/71
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14x
13x
13x
6x
5x
 
 
 
 
 
 
18x
 
 
 
 
18x
 
18x
 
2x
16x
 
4x
4x
2x
2x
 
4x
 
 
18x
 
 
 
 
 
 
13x
13x
1x
 
13x
 
13x
13x
 
 
 
 
 
 
13x
 
13x
7x
 
 
6x
 
6x
 
 
 
6x
 
 
 
6x
 
 
 
 
 
 
6x
6x
6x
 
 
 
 
 
 
 
 
 
7x
1x
 
6x
5x
 
1x
 
 
 
 
 
 
6x
1x
 
 
5x
5x
 
 
 
5x
 
 
 
5x
 
 
 
 
 
 
1x
1x
1x
 
 
1x
1x
1x
 
1x
 
 
 
 
 
 
 
 
 
 
 
 
5x
5x
 
 
5x
 
5x
 
5x
 
 
 
5x
5x
 
5x
 
 
 
 
 
 
 
 
 
 
 
14x
10x
 
 
4x
4x
3x
 
 
1x
 
 
  import { CONSTANTS } from './constants.js';
import {
  TranscriptError,
  RateLimitError,
  VideoUnavailableError,
  TranscriptDisabledError,
  NoTranscriptError,
  LanguageNotFoundError,
} from './errors.js';
import { TranscriptConfig, TranscriptSegment, CaptionsData } from './types.js';
import fetch, { RequestInit } from 'node-fetch';
import { HttpsProxyAgent } from 'https-proxy-agent';
import { URL } from 'url';
 
/**
 * Service class for fetching YouTube video transcripts.
 *
 * Every member is static. The single public entry point is `fetchTranscript`;
 * the private helpers each cover one step of the pipeline (resolve the video ID,
 * download the watch page, pull the captions JSON out of it, pick a caption
 * track, download and parse that track).
 */
export class YoutubeTranscript {
  /**
   * Fetches the transcript for a YouTube video
   * @param videoId Video URL or ID
   * @param config Configuration options
   * @returns Array of transcript segments
   * @throws TranscriptError when no video ID can be extracted from `videoId`
   * @throws RateLimitError when the watch page contains a reCAPTCHA challenge
   * @throws VideoUnavailableError when the watch page has no playability status
   * @throws TranscriptDisabledError when the page has no usable captions data
   * @throws NoTranscriptError when no caption track can be used or downloaded
   * @throws LanguageNotFoundError when `config.lang` matches no caption track
   */
  public static async fetchTranscript(
    videoId: string,
    config?: TranscriptConfig
  ): Promise<TranscriptSegment[]> {
    const identifier = this.retrieveVideoId(videoId);
    const pageContent = await this.fetchVideoPage(identifier, config);
    const captionsData = this.parseCaptionsData(pageContent, videoId);
    const transcriptUrl = this.getTranscriptUrl(captionsData, videoId, config?.lang);

    // Language of the first listed track; used to label segments when the
    // caller did not request a specific language.
    const defaultLang =
      captionsData.playerCaptionsTracklistRenderer.captionTracks[0]?.languageCode;

    return this.fetchAndParseTranscript(transcriptUrl, config?.lang, defaultLang, config);
  }

  /**
   * Creates fetch options with proxy configuration if provided
   * @param config Configuration options; `proxyAgent` takes precedence over `proxy`
   * @param extraHeaders Headers merged over the default `User-Agent` header
   * @returns Options object carrying the headers and, when configured, an agent
   */
  private static getFetchOptions(
    config?: TranscriptConfig,
    extraHeaders: Record<string, string> = {}
  ): RequestInit {
    const headers = {
      'User-Agent': CONSTANTS.USER_AGENT,
      ...extraHeaders,
    };

    const options: RequestInit & { agent?: any } = { headers };

    if (config?.proxyAgent) {
      // Use pre-configured proxy agent if provided
      options.agent = config.proxyAgent;
    } else if (config?.proxy) {
      // Otherwise, create a proxy agent from the proxy configuration
      const proxyUrl = new URL(config.proxy.host);
      if (config.proxy.auth) {
        proxyUrl.username = config.proxy.auth.username;
        proxyUrl.password = config.proxy.auth.password;
      }
      options.agent = new HttpsProxyAgent(proxyUrl.toString());
    }

    return options;
  }

  /**
   * Fetches the video page content
   * @param videoId Video ID placed in the `v` query parameter of the watch URL
   * @param config Configuration options; `lang` is sent as `Accept-Language`
   * @returns The response body as text
   */
  private static async fetchVideoPage(videoId: string, config?: TranscriptConfig): Promise<string> {
    const extraHeaders: Record<string, string> = {};
    if (config?.lang) {
      extraHeaders['Accept-Language'] = config.lang;
    }
    const options = this.getFetchOptions(config, extraHeaders);

    const response = await fetch(`https://www.youtube.com/watch?v=${videoId}`, options);
    return response.text();
  }

  /**
   * Extracts and validates captions data from the video page
   * @param pageContent Text of the watch page
   * @param videoId Identifier passed through to any error that is thrown
   * @returns The parsed captions object, guaranteed to have a `captionTracks` key
   * @throws RateLimitError | VideoUnavailableError | TranscriptDisabledError
   *   when the page has no `"captions":` marker (see `handlePageErrors`)
   * @throws TranscriptDisabledError when the captions section is not valid JSON
   * @throws NoTranscriptError when the parsed data has no `captionTracks` key
   */
  private static parseCaptionsData(pageContent: string, videoId: string): CaptionsData {
    const htmlParts = pageContent.split('"captions":');

    if (htmlParts.length <= 1) {
      this.handlePageErrors(pageContent, videoId);
    }

    const captionsData = this.extractCaptionsJson(htmlParts[1]);

    if (!captionsData) {
      throw new TranscriptDisabledError(videoId);
    }

    // The `in` operator throws a TypeError on a non-object right-hand side, so
    // make sure the renderer really is an object before probing it.
    const renderer: unknown = captionsData.playerCaptionsTracklistRenderer;
    if (typeof renderer !== 'object' || renderer === null || !('captionTracks' in renderer)) {
      throw new NoTranscriptError(videoId);
    }

    return captionsData;
  }

  /**
   * Extracts captions JSON data from the page content
   * @param captionsSection Page text that follows the `"captions":` marker
   * @returns The parsed JSON, or `undefined` when it cannot be parsed
   */
  private static extractCaptionsJson(captionsSection: string): CaptionsData | undefined {
    try {
      // Keep everything before `,"videoDetails` and strip every newline (a
      // string pattern would only remove the first one).
      const jsonStr = captionsSection.split(',"videoDetails')[0].replace(/\n/g, '');
      return JSON.parse(jsonStr);
    } catch {
      return undefined;
    }
  }

  /**
   * Handles various error cases from the video page. Always throws.
   * @param pageContent Text of the watch page
   * @param videoId Identifier passed through to the thrown error
   * @throws RateLimitError when the page contains `class="g-recaptcha"`
   * @throws VideoUnavailableError when the page lacks `"playabilityStatus":`
   * @throws TranscriptDisabledError in every other case
   */
  private static handlePageErrors(pageContent: string, videoId: string): never {
    if (pageContent.includes('class="g-recaptcha"')) {
      throw new RateLimitError();
    }
    if (!pageContent.includes('"playabilityStatus":')) {
      throw new VideoUnavailableError(videoId);
    }
    throw new TranscriptDisabledError(videoId);
  }

  /**
   * Gets the URL for the transcript in the requested language
   * @param captionsData Parsed captions data
   * @param videoId Identifier passed through to any error that is thrown
   * @param requestedLang Language code to look for; the first track is used when omitted
   * @returns The `baseUrl` of the selected caption track
   * @throws LanguageNotFoundError when `requestedLang` matches no track
   * @throws NoTranscriptError when no track could be selected
   */
  private static getTranscriptUrl(
    captionsData: CaptionsData,
    videoId: string,
    requestedLang?: string
  ): string {
    if (requestedLang) {
      this.validateLanguageAvailability(captionsData, requestedLang, videoId);
    }

    const tracks = captionsData.playerCaptionsTracklistRenderer.captionTracks;
    const track = requestedLang
      ? tracks.find((candidate) => candidate.languageCode === requestedLang)
      : tracks[0];

    if (!track) {
      throw new NoTranscriptError(videoId);
    }

    return track.baseUrl;
  }

  /**
   * Validates that the requested language is available
   * @param captionsData Parsed captions data
   * @param lang Language code that must match a track's `languageCode`
   * @param videoId Identifier passed through to the thrown error
   * @throws LanguageNotFoundError carrying the list of available language codes
   */
  private static validateLanguageAvailability(
    captionsData: CaptionsData,
    lang: string,
    videoId: string
  ): void {
    const tracks = captionsData.playerCaptionsTracklistRenderer.captionTracks;
    const isLanguageAvailable = tracks.some((track) => track.languageCode === lang);

    if (!isLanguageAvailable) {
      const availableLanguages = tracks.map((track) => track.languageCode);
      throw new LanguageNotFoundError(lang, availableLanguages, videoId);
    }
  }

  /**
   * Fetches and parses the transcript XML
   * @param transcriptUrl URL of the caption track to download
   * @param requestedLang Language requested by the caller; sent as `Accept-Language`
   * @param defaultLang Language used to label segments when `requestedLang` is absent
   * @param config Configuration options (proxy settings)
   * @returns One segment per match of `CONSTANTS.TRANSCRIPT_XML_REGEX`
   * @throws NoTranscriptError when the response status is not OK
   */
  private static async fetchAndParseTranscript(
    transcriptUrl: string,
    requestedLang?: string,
    defaultLang?: string,
    config?: TranscriptConfig
  ): Promise<TranscriptSegment[]> {
    const extraHeaders: Record<string, string> = {};
    if (requestedLang) {
      extraHeaders['Accept-Language'] = requestedLang;
    }
    const options = this.getFetchOptions(config, extraHeaders);

    const response = await fetch(transcriptUrl, options);

    if (!response.ok) {
      throw new NoTranscriptError(transcriptUrl);
    }

    const transcriptText = await response.text();
    const matches = [...transcriptText.matchAll(CONSTANTS.TRANSCRIPT_XML_REGEX)];

    // Capture groups of the regex: 1 = offset, 2 = duration, 3 = text.
    return matches.map((match) => ({
      text: match[3],
      duration: parseFloat(match[2]),
      offset: parseFloat(match[1]),
      lang: requestedLang ?? defaultLang,
    }));
  }

  /**
   * Extracts the video ID from either a full URL or direct ID
   * @param videoId Video URL or ID
   * @returns The input unchanged when it is exactly 11 characters long,
   *   otherwise the first capture group of `CONSTANTS.VIDEO_ID_REGEX`
   * @throws TranscriptError when neither rule yields an ID
   */
  private static retrieveVideoId(videoId: string): string {
    if (videoId.length === 11) {
      return videoId;
    }

    const match = videoId.match(CONSTANTS.VIDEO_ID_REGEX);
    if (match?.[1]) {
      return match[1];
    }

    throw new TranscriptError('Could not extract YouTube video ID from the provided string');
  }
}