1
- import requests
1
+ import warnings
2
+ from pathlib import Path
3
+ from typing import Optional , Iterable , Union
2
4
3
- try : # pragma: no cover
4
- import http .cookiejar as cookiejar
5
+ from http .cookiejar import MozillaCookieJar , LoadError
5
6
6
- CookieLoadError = (FileNotFoundError , cookiejar .LoadError )
7
- except ImportError : # pragma: no cover
8
- import cookielib as cookiejar
7
+ from requests import Session
9
8
10
- CookieLoadError = IOError
9
+ from . proxies import ProxyConfig , GenericProxyConfig
11
10
12
- from ._transcripts import TranscriptListFetcher
11
+ from ._transcripts import TranscriptListFetcher , FetchedTranscript , TranscriptList
13
12
14
- from ._errors import CookiePathInvalid , CookiesInvalid
13
+ from ._errors import CookiePathInvalid , CookieInvalid
15
14
16
15
17
- class YouTubeTranscriptApi (object ):
16
def _load_cookie_jar(cookies: Union[Path, str]) -> MozillaCookieJar:
    """
    Load a Mozilla/Netscape formatted cookie file into a cookie jar.

    :param cookies: path to the cookie text file, either as a `Path` or a string
    :return: the `MozillaCookieJar` populated from the file
    :raises CookiePathInvalid: if the file does not exist or cannot be parsed as
        a cookie file
    :raises CookieInvalid: if the file could be loaded, but the resulting jar
        contains no cookies
    """
    cookie_jar = MozillaCookieJar()
    # Only the load itself can raise the errors handled here, so keep the try
    # body limited to that single call.
    try:
        cookie_jar.load(str(cookies))
    except (FileNotFoundError, LoadError) as error:
        # Chain explicitly, so the traceback reports the load failure as the
        # cause instead of as a second error raised while handling the first.
        raise CookiePathInvalid(cookies) from error

    # A cookie jar without any cookies is falsy.
    if not cookie_jar:
        raise CookieInvalid(cookies)
    return cookie_jar
25
+
26
+
27
+ class YouTubeTranscriptApi :
28
def __init__(
    self,
    cookie_path: Optional[Union[Path, str]] = None,
    proxy_config: Optional[ProxyConfig] = None,
    http_client: Optional[Session] = None,
):
    """
    Set up the HTTP session used for all requests of this instance.

    The session always gets its "Accept-Language" header set to "en-US", also
    when it has been passed in by the caller.

    :param cookie_path: Path to a text file containing YouTube authorization
        cookies. If given, the session's cookies are replaced by the cookies
        loaded from this file.
    :param proxy_config: an optional ProxyConfig object, defining proxies used
        for all network requests. This can be used to work around your IP being
        blocked by YouTube, as described in the "Working around IP bans" section
        of the README
        (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception)
    :param http_client: an optional requests.Session object to use instead of a
        newly created one, e.g. to share cookies between different instances of
        `YouTubeTranscriptApi`, overwrite defaults or specify SSL certificates.
    """
    session = http_client
    if session is None:
        session = Session()

    session.headers.update({"Accept-Language": "en-US"})

    if cookie_path is not None:
        session.cookies = _load_cookie_jar(cookie_path)

    if proxy_config is not None:
        session.proxies = proxy_config.to_requests_dict()

    self._fetcher = TranscriptListFetcher(session)
52
+
53
def fetch(
    self,
    video_id: str,
    languages: Iterable[str] = ("en",),
    preserve_formatting: bool = False,
) -> FetchedTranscript:
    """
    Retrieves the transcript for a single video. This is a shortcut for listing
    the transcripts of the video using `self.list(video_id)`, selecting one of
    them with `find_transcript(languages)` and calling `fetch()` on the result.

    :param video_id: the ID of the video you want to retrieve the transcript for.
        Make sure that this is the actual ID, NOT the full URL to the video!
    :param languages: language codes in a descending priority. For example, if
        this is set to ["de", "en"] it will first try to fetch the german
        transcript (de) and then fetch the english transcript (en) if it fails
        to do so. Defaults to english only.
    :param preserve_formatting: whether to keep select HTML text formatting
    """
    available_transcripts = self.list(video_id)
    selected_transcript = available_transcripts.find_transcript(languages)
    return selected_transcript.fetch(preserve_formatting=preserve_formatting)
77
+
78
def list(
    self,
    video_id: str,
) -> TranscriptList:
    """
    Retrieves the transcripts which are available for a given video, by
    delegating to the fetcher of this instance.

    The returned `TranscriptList` is iterable and provides methods to filter
    the transcripts for specific languages. While iterating, each transcript is
    represented by a `Transcript` object, which provides metadata and can
    either be fetched by calling `transcript.fetch()` or translated by calling
    `transcript.translate('en')`. Example:

    ```
    ytt_api = YouTubeTranscriptApi()

    # retrieve the available transcripts
    transcript_list = ytt_api.list('video_id')

    # iterate over all available transcripts
    for transcript in transcript_list:
        # the Transcript object provides metadata properties
        print(
            transcript.video_id,
            transcript.language,
            transcript.language_code,
            # whether it has been manually created or generated by YouTube
            transcript.is_generated,
            # a list of languages the transcript can be translated to
            transcript.translation_languages,
        )

        # fetch the actual transcript data
        print(transcript.fetch())

        # translating the transcript will return another transcript object
        print(transcript.translate('en').fetch())

    # you can also directly filter for the language you are looking for
    transcript = transcript_list.find_transcript(['de', 'en'])

    # or just filter for manually created transcripts
    transcript = transcript_list.find_manually_created_transcript(['de', 'en'])

    # or automatically generated ones
    transcript = transcript_list.find_generated_transcript(['de', 'en'])
    ```

    :param video_id: the ID of the video you want to retrieve the transcript for.
        Make sure that this is the actual ID, NOT the full URL to the video!
    """
    fetcher = self._fetcher
    return fetcher.fetch(video_id)
130
+
18
131
@classmethod
19
132
def list_transcripts (cls , video_id , proxies = None , cookies = None ):
20
133
"""
134
+ DEPRECATED: use the `list` method instead!
135
+
21
136
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
22
137
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
23
138
over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
24
139
metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
25
- `transcript.translate('en')`. Example::
140
+ `transcript.translate('en')`. Example:
26
141
27
142
# retrieve the available transcripts
28
- transcript_list = YouTubeTranscriptApi.get ('video_id')
143
+ transcript_list = YouTubeTranscriptApi.list_transcripts ('video_id')
29
144
30
145
# iterate over all available transcripts
31
146
for transcript in transcript_list:
@@ -64,11 +179,26 @@ def list_transcripts(cls, video_id, proxies=None, cookies=None):
64
179
:return: the list of available transcripts
65
180
:rtype TranscriptList:
66
181
"""
67
- with requests .Session () as http_client :
68
- if cookies :
69
- http_client .cookies = cls ._load_cookies (cookies , video_id )
70
- http_client .proxies = proxies if proxies else {}
71
- return TranscriptListFetcher (http_client ).fetch (video_id )
182
+ warnings .warn (
183
+ "`list_transcripts` is deprecated and will be removed in a future version. "
184
+ "Use the `list` method instead!" ,
185
+ DeprecationWarning ,
186
+ )
187
+
188
+ proxy_config = None
189
+ if proxies :
190
+ if isinstance (proxies , ProxyConfig ):
191
+ proxy_config = proxies
192
+ else :
193
+ proxy_config = GenericProxyConfig (
194
+ http_url = proxies .get ("http" ), https_url = proxies .get ("https" )
195
+ )
196
+
197
+ ytt_api = YouTubeTranscriptApi (
198
+ proxy_config = proxy_config ,
199
+ cookie_path = Path (cookies ) if cookies else None ,
200
+ )
201
+ return ytt_api .list (video_id )
72
202
73
203
@classmethod
74
204
def get_transcripts (
@@ -81,6 +211,8 @@ def get_transcripts(
81
211
preserve_formatting = False ,
82
212
):
83
213
"""
214
+ DEPRECATED: use the `fetch` method instead!
215
+
84
216
Retrieves the transcripts for a list of videos.
85
217
86
218
:param video_ids: a list of youtube video ids
@@ -102,6 +234,12 @@ def get_transcripts(
102
234
video ids, which could not be retrieved
103
235
:rtype ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]):
104
236
"""
237
+ warnings .warn (
238
+ "`get_transcripts` is deprecated and will be removed in a future version. "
239
+ "Use the `fetch` method instead!" ,
240
+ DeprecationWarning ,
241
+ )
242
+
105
243
assert isinstance (video_ids , list ), "`video_ids` must be a list of strings"
106
244
107
245
data = {}
@@ -130,6 +268,8 @@ def get_transcript(
130
268
preserve_formatting = False ,
131
269
):
132
270
"""
271
+ DEPRECATED: use the `fetch` method instead!
272
+
133
273
Retrieves the transcript for a single video. This is just a shortcut for calling::
134
274
135
275
YouTubeTranscriptApi.list_transcripts(video_id, proxies).find_transcript(languages).fetch()
@@ -149,20 +289,16 @@ def get_transcript(
149
289
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
150
290
:rtype [{'text': str, 'start': float, 'duration': float}]:
151
291
"""
292
+ warnings .warn (
293
+ "`get_transcript` is deprecated and will be removed in a future version. "
294
+ "Use the `fetch` method instead!" ,
295
+ DeprecationWarning ,
296
+ )
297
+
152
298
assert isinstance (video_id , str ), "`video_id` must be a string"
153
299
return (
154
300
cls .list_transcripts (video_id , proxies , cookies )
155
301
.find_transcript (languages )
156
302
.fetch (preserve_formatting = preserve_formatting )
303
+ .to_raw_data ()
157
304
)
158
-
159
- @classmethod
160
- def _load_cookies (cls , cookies , video_id ):
161
- try :
162
- cookie_jar = cookiejar .MozillaCookieJar ()
163
- cookie_jar .load (cookies )
164
- if not cookie_jar :
165
- raise CookiesInvalid (video_id )
166
- return cookie_jar
167
- except CookieLoadError :
168
- raise CookiePathInvalid (video_id )
0 commit comments