Skip to content

Commit bf45008

Browse files
authored
Merge pull request #388 from jdepoix/release/v1.0.0
release/v1.0.0
2 parents 4d5668d + ce3fa1f commit bf45008

26 files changed

+2421
-8888
lines changed

README.md

Lines changed: 235 additions & 75 deletions
Large diffs are not rendered by default.

poetry.lock

Lines changed: 134 additions & 133 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
44

55
[tool.poetry]
66
name = "youtube-transcript-api"
7-
version = "0.6.3"
7+
version = "1.0.0"
88
description = "This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other Selenium-based solutions do!"
99
readme = "README.md"
1010
license = "MIT"
@@ -39,8 +39,8 @@ youtube_transcript_api = "youtube_transcript_api.__main__:main"
3939

4040
[tool.poe.tasks]
4141
test = "pytest youtube_transcript_api"
42-
ci-test.shell = "coverage run -m unittest discover && coverage xml"
43-
coverage.shell = "coverage run -m unittest discover && coverage report -m --fail-under=100"
42+
ci-test.shell = "coverage run -m pytest youtube_transcript_api && coverage xml"
43+
coverage.shell = "coverage run -m pytest youtube_transcript_api && coverage report -m --fail-under=100"
4444
format = "ruff format youtube_transcript_api"
4545
ci-format = "ruff format youtube_transcript_api --check"
4646
lint = "ruff check youtube_transcript_api"
@@ -57,7 +57,6 @@ optional = true
5757
[tool.poetry.group.test.dependencies]
5858
pytest = "^8.3.3"
5959
coverage = "^7.6.1"
60-
mock = "^5.1.0"
6160
httpretty = "^1.1.4"
6261

6362
[tool.poetry.group.dev]

youtube_transcript_api/__init__.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,27 @@
11
# ruff: noqa: F401
22
from ._api import YouTubeTranscriptApi
3-
from ._transcripts import TranscriptList, Transcript
3+
from ._transcripts import (
4+
TranscriptList,
5+
Transcript,
6+
FetchedTranscript,
7+
FetchedTranscriptSnippet,
8+
)
49
from ._errors import (
10+
YouTubeTranscriptApiException,
11+
CookieError,
12+
CookiePathInvalid,
13+
CookieInvalid,
514
TranscriptsDisabled,
615
NoTranscriptFound,
716
CouldNotRetrieveTranscript,
817
VideoUnavailable,
9-
TooManyRequests,
18+
VideoUnplayable,
19+
IpBlocked,
20+
RequestBlocked,
1021
NotTranslatable,
1122
TranslationLanguageNotAvailable,
12-
NoTranscriptAvailable,
13-
CookiePathInvalid,
14-
CookiesInvalid,
1523
FailedToCreateConsentCookie,
1624
YouTubeRequestFailed,
1725
InvalidVideoId,
26+
AgeRestricted,
1827
)

youtube_transcript_api/_api.py

Lines changed: 164 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,146 @@
1-
import requests
1+
import warnings
2+
from pathlib import Path
3+
from typing import Optional, Iterable, Union
24

3-
try: # pragma: no cover
4-
import http.cookiejar as cookiejar
5+
from http.cookiejar import MozillaCookieJar, LoadError
56

6-
CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
7-
except ImportError: # pragma: no cover
8-
import cookielib as cookiejar
7+
from requests import Session
98

10-
CookieLoadError = IOError
9+
from .proxies import ProxyConfig, GenericProxyConfig
1110

12-
from ._transcripts import TranscriptListFetcher
11+
from ._transcripts import TranscriptListFetcher, FetchedTranscript, TranscriptList
1312

14-
from ._errors import CookiePathInvalid, CookiesInvalid
13+
from ._errors import CookiePathInvalid, CookieInvalid
1514

1615

17-
class YouTubeTranscriptApi(object):
16+
def _load_cookie_jar(cookies: Union[Path, str]) -> MozillaCookieJar:
17+
try:
18+
cookie_jar = MozillaCookieJar()
19+
cookie_jar.load(str(cookies))
20+
if not cookie_jar:
21+
raise CookieInvalid(cookies)
22+
return cookie_jar
23+
except (FileNotFoundError, LoadError):
24+
raise CookiePathInvalid(cookies)
25+
26+
27+
class YouTubeTranscriptApi:
28+
def __init__(
29+
self,
30+
cookie_path: Optional[Union[Path, str]] = None,
31+
proxy_config: Optional[ProxyConfig] = None,
32+
http_client: Optional[Session] = None,
33+
):
34+
"""
35+
:param cookie_path: Path to a text file containing YouTube authorization cookies
36+
:param proxy_config: an optional ProxyConfig object, defining proxies used for
37+
all network requests. This can be used to work around your IP being blocked
38+
by YouTube, as described in the "Working around IP bans" section of the
39+
README
40+
(https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception)
41+
:param http_client: You can optionally pass in a requests.Session object, if you
42+
manually want to share cookies between different instances of
43+
`YouTubeTranscriptApi`, overwrite defaults, specify SSL certificates, etc.
44+
"""
45+
http_client = Session() if http_client is None else http_client
46+
http_client.headers.update({"Accept-Language": "en-US"})
47+
if cookie_path is not None:
48+
http_client.cookies = _load_cookie_jar(cookie_path)
49+
if proxy_config is not None:
50+
http_client.proxies = proxy_config.to_requests_dict()
51+
self._fetcher = TranscriptListFetcher(http_client)
52+
53+
def fetch(
54+
self,
55+
video_id: str,
56+
languages: Iterable[str] = ("en",),
57+
preserve_formatting: bool = False,
58+
) -> FetchedTranscript:
59+
"""
60+
Retrieves the transcript for a single video. This is just a shortcut for
61+
calling:
62+
`YouTubeTranscriptApi().list(video_id).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)`
63+
64+
:param video_id: the ID of the video you want to retrieve the transcript for.
65+
Make sure that this is the actual ID, NOT the full URL to the video!
66+
:param languages: A list of language codes in a descending priority. For
67+
example, if this is set to ["de", "en"] it will first try to fetch the
68+
German transcript (de) and then fetch the English transcript (en) if
69+
it fails to do so. This defaults to ("en",).
70+
:param preserve_formatting: whether to keep select HTML text formatting
71+
"""
72+
return (
73+
self.list(video_id)
74+
.find_transcript(languages)
75+
.fetch(preserve_formatting=preserve_formatting)
76+
)
77+
78+
def list(
79+
self,
80+
video_id: str,
81+
) -> TranscriptList:
82+
"""
83+
Retrieves the list of transcripts which are available for a given video. It
84+
returns a `TranscriptList` object which is iterable and provides methods to
85+
filter the list of transcripts for specific languages. While iterating over
86+
the `TranscriptList` the individual transcripts are represented by
87+
`Transcript` objects, which provide metadata and can either be fetched by
88+
calling `transcript.fetch()` or translated by calling `transcript.translate(
89+
'en')`. Example:
90+
91+
```
92+
ytt_api = YouTubeTranscriptApi()
93+
94+
# retrieve the available transcripts
95+
transcript_list = ytt_api.list('video_id')
96+
97+
# iterate over all available transcripts
98+
for transcript in transcript_list:
99+
# the Transcript object provides metadata properties
100+
print(
101+
transcript.video_id,
102+
transcript.language,
103+
transcript.language_code,
104+
# whether it has been manually created or generated by YouTube
105+
transcript.is_generated,
106+
# a list of languages the transcript can be translated to
107+
transcript.translation_languages,
108+
)
109+
110+
# fetch the actual transcript data
111+
print(transcript.fetch())
112+
113+
# translating the transcript will return another transcript object
114+
print(transcript.translate('en').fetch())
115+
116+
# you can also directly filter for the language you are looking for, using the transcript list
117+
transcript = transcript_list.find_transcript(['de', 'en'])
118+
119+
# or just filter for manually created transcripts
120+
transcript = transcript_list.find_manually_created_transcript(['de', 'en'])
121+
122+
# or automatically generated ones
123+
transcript = transcript_list.find_generated_transcript(['de', 'en'])
124+
```
125+
126+
:param video_id: the ID of the video you want to retrieve the transcript for.
127+
Make sure that this is the actual ID, NOT the full URL to the video!
128+
"""
129+
return self._fetcher.fetch(video_id)
130+
18131
@classmethod
19132
def list_transcripts(cls, video_id, proxies=None, cookies=None):
20133
"""
134+
DEPRECATED: use the `list` method instead!
135+
21136
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
22137
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
23138
over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
24139
metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
25-
`transcript.translate('en')`. Example::
140+
`transcript.translate('en')`. Example:
26141
27142
# retrieve the available transcripts
28-
transcript_list = YouTubeTranscriptApi.get('video_id')
143+
transcript_list = YouTubeTranscriptApi.list_transcripts('video_id')
29144
30145
# iterate over all available transcripts
31146
for transcript in transcript_list:
@@ -64,11 +179,26 @@ def list_transcripts(cls, video_id, proxies=None, cookies=None):
64179
:return: the list of available transcripts
65180
:rtype TranscriptList:
66181
"""
67-
with requests.Session() as http_client:
68-
if cookies:
69-
http_client.cookies = cls._load_cookies(cookies, video_id)
70-
http_client.proxies = proxies if proxies else {}
71-
return TranscriptListFetcher(http_client).fetch(video_id)
182+
warnings.warn(
183+
"`list_transcripts` is deprecated and will be removed in a future version. "
184+
"Use the `list` method instead!",
185+
DeprecationWarning,
186+
)
187+
188+
proxy_config = None
189+
if proxies:
190+
if isinstance(proxies, ProxyConfig):
191+
proxy_config = proxies
192+
else:
193+
proxy_config = GenericProxyConfig(
194+
http_url=proxies.get("http"), https_url=proxies.get("https")
195+
)
196+
197+
ytt_api = YouTubeTranscriptApi(
198+
proxy_config=proxy_config,
199+
cookie_path=Path(cookies) if cookies else None,
200+
)
201+
return ytt_api.list(video_id)
72202

73203
@classmethod
74204
def get_transcripts(
@@ -81,6 +211,8 @@ def get_transcripts(
81211
preserve_formatting=False,
82212
):
83213
"""
214+
DEPRECATED: use the `fetch` method instead!
215+
84216
Retrieves the transcripts for a list of videos.
85217
86218
:param video_ids: a list of youtube video ids
@@ -102,6 +234,12 @@ def get_transcripts(
102234
video ids, which could not be retrieved
103235
:rtype ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]):
104236
"""
237+
warnings.warn(
238+
"`get_transcripts` is deprecated and will be removed in a future version. "
239+
"Use the `fetch` method instead!",
240+
DeprecationWarning,
241+
)
242+
105243
assert isinstance(video_ids, list), "`video_ids` must be a list of strings"
106244

107245
data = {}
@@ -130,6 +268,8 @@ def get_transcript(
130268
preserve_formatting=False,
131269
):
132270
"""
271+
DEPRECATED: use the `fetch` method instead!
272+
133273
Retrieves the transcript for a single video. This is just a shortcut for calling::
134274
135275
YouTubeTranscriptApi.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting).to_raw_data()
@@ -149,20 +289,16 @@ def get_transcript(
149289
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
150290
:rtype [{'text': str, 'start': float, 'duration': float}]:
151291
"""
292+
warnings.warn(
293+
"`get_transcript` is deprecated and will be removed in a future version. "
294+
"Use the `fetch` method instead!",
295+
DeprecationWarning,
296+
)
297+
152298
assert isinstance(video_id, str), "`video_id` must be a string"
153299
return (
154300
cls.list_transcripts(video_id, proxies, cookies)
155301
.find_transcript(languages)
156302
.fetch(preserve_formatting=preserve_formatting)
303+
.to_raw_data()
157304
)
158-
159-
@classmethod
160-
def _load_cookies(cls, cookies, video_id):
161-
try:
162-
cookie_jar = cookiejar.MozillaCookieJar()
163-
cookie_jar.load(cookies)
164-
if not cookie_jar:
165-
raise CookiesInvalid(video_id)
166-
return cookie_jar
167-
except CookieLoadError:
168-
raise CookiePathInvalid(video_id)

0 commit comments

Comments
 (0)