jdepoix
diff --git a/‎README.md‎
Lines changed: 235 additions & 75 deletions b/‎README.md‎
Lines changed: 235 additions & 75 deletions
diff --git a/‎poetry.lock‎
Lines changed: 134 additions & 133 deletions b/‎poetry.lock‎
Lines changed: 134 additions & 133 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 3 additions & 4 deletions b/‎pyproject.toml‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎youtube_transcript_api/__init__.py‎
Lines changed: 14 additions & 5 deletions b/‎youtube_transcript_api/__init__.py‎
Lines changed: 14 additions & 5 deletions
diff --git a/‎youtube_transcript_api/_api.py‎
Lines changed: 164 additions & 28 deletions b/‎youtube_transcript_api/_api.py‎
Lines changed: 164 additions & 28 deletions
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "youtube-transcript-api"
-version = "0.6.3"
+version = "1.0.0"
 description = "This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other selenium based solutions do!"
 readme = "README.md"
 license = "MIT"
@@ -39,8 +39,8 @@ youtube_transcript_api = "youtube_transcript_api.__main__:main"
 
 [tool.poe.tasks]
 test = "pytest youtube_transcript_api"
-ci-test.shell = "coverage run -m unittest discover && coverage xml"
-coverage.shell = "coverage run -m unittest discover && coverage report -m --fail-under=100"
+ci-test.shell = "coverage run -m pytest youtube_transcript_api && coverage xml"
+coverage.shell = "coverage run -m pytest youtube_transcript_api && coverage report -m --fail-under=100"
 format = "ruff format youtube_transcript_api"
 ci-format = "ruff format youtube_transcript_api --check"
 lint = "ruff check youtube_transcript_api"
@@ -57,7 +57,6 @@ optional = true
 [tool.poetry.group.test.dependencies]
 pytest = "^8.3.3"
 coverage = "^7.6.1"
-mock = "^5.1.0"
 httpretty = "^1.1.4"
 
 [tool.poetry.group.dev]
 
@@ -1,18 +1,27 @@
 # ruff: noqa: F401
 from ._api import YouTubeTranscriptApi
-from ._transcripts import TranscriptList, Transcript
+from ._transcripts import (
+    TranscriptList,
+    Transcript,
+    FetchedTranscript,
+    FetchedTranscriptSnippet,
+)
 from ._errors import (
+    YouTubeTranscriptApiException,
+    CookieError,
+    CookiePathInvalid,
+    CookieInvalid,
     TranscriptsDisabled,
     NoTranscriptFound,
     CouldNotRetrieveTranscript,
     VideoUnavailable,
-    TooManyRequests,
+    VideoUnplayable,
+    IpBlocked,
+    RequestBlocked,
     NotTranslatable,
     TranslationLanguageNotAvailable,
-    NoTranscriptAvailable,
-    CookiePathInvalid,
-    CookiesInvalid,
     FailedToCreateConsentCookie,
     YouTubeRequestFailed,
     InvalidVideoId,
+    AgeRestricted,
 )
@@ -1,31 +1,146 @@
-import requests
+import warnings
+from pathlib import Path
+from typing import Optional, Iterable, Union
 
-try:  # pragma: no cover
-    import http.cookiejar as cookiejar
+from http.cookiejar import MozillaCookieJar, LoadError
 
-    CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
-except ImportError:  # pragma: no cover
-    import cookielib as cookiejar
+from requests import Session
 
-    CookieLoadError = IOError
+from .proxies import ProxyConfig, GenericProxyConfig
 
-from ._transcripts import TranscriptListFetcher
+from ._transcripts import TranscriptListFetcher, FetchedTranscript, TranscriptList
 
-from ._errors import CookiePathInvalid, CookiesInvalid
+from ._errors import CookiePathInvalid, CookieInvalid
 
 
-class YouTubeTranscriptApi(object):
+def _load_cookie_jar(cookies: Union[Path, str]) -> MozillaCookieJar:
+    try:
+        cookie_jar = MozillaCookieJar()
+        cookie_jar.load(str(cookies))  # Path objects are passed on as plain strings
+        if not cookie_jar:  # the file loaded, but the jar holds no cookies
+            raise CookieInvalid(cookies)
+        return cookie_jar
+    except (FileNotFoundError, LoadError):  # missing file or failed cookie-file load
+        raise CookiePathInvalid(cookies)
+
+
+class YouTubeTranscriptApi:
+    def __init__(
+        self,
+        cookie_path: Optional[Union[Path, str]] = None,
+        proxy_config: Optional[ProxyConfig] = None,
+        http_client: Optional[Session] = None,
+    ):
+        """
+        :param cookie_path: Path to a text file containing YouTube authorization cookies
+        :param proxy_config: an optional ProxyConfig object, defining proxies used for
+            all network requests. This can be used to work around your IP being blocked
+            by YouTube, as described in the "Working around IP bans" section of the
+            README
+            (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception)
+        :param http_client: You can optionally pass in a requests.Session object, if you
+            manually want to share cookies between different instances of
+            `YouTubeTranscriptApi`, overwrite defaults, specify SSL certificates, etc.
+        """
+        http_client = Session() if http_client is None else http_client
+        http_client.headers.update({"Accept-Language": "en-US"})  # also for a passed-in session
+        if cookie_path is not None:
+            http_client.cookies = _load_cookie_jar(cookie_path)  # replaces the session's jar
+        if proxy_config is not None:
+            http_client.proxies = proxy_config.to_requests_dict()  # replaces session proxies
+        self._fetcher = TranscriptListFetcher(http_client)
+
+    def fetch(
+        self,
+        video_id: str,
+        languages: Iterable[str] = ("en",),
+        preserve_formatting: bool = False,
+    ) -> FetchedTranscript:
+        """
+        Retrieves the transcript for a single video. This is just a shortcut for
+        calling:
+        `self.list(video_id).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)`
+
+        :param video_id: the ID of the video you want to retrieve the transcript for.
+            Make sure that this is the actual ID, NOT the full URL to the video!
+        :param languages: An iterable of language codes in a descending priority. For
+            example, if this is set to ["de", "en"] it will first try to fetch the
+            german transcript (de) and then fetch the english transcript (en) if
+            it fails to do so. This defaults to ("en",).
+        :param preserve_formatting: whether to keep select HTML text formatting
+        """
+        return (
+            self.list(video_id)
+            .find_transcript(languages)
+            .fetch(preserve_formatting=preserve_formatting)
+        )
+
+    def list(
+        self,
+        video_id: str,
+    ) -> TranscriptList:
+        """
+        Retrieves the list of transcripts which are available for a given video. It
+        returns a `TranscriptList` object which is iterable and provides methods to
+        filter the list of transcripts for specific languages. While iterating over
+        the `TranscriptList` the individual transcripts are represented by
+        `Transcript` objects, which provide metadata and can either be fetched by
+        calling `transcript.fetch()` or translated by calling `transcript.translate(
+        'en')`. Example:
+
+        ```
+        ytt_api = YouTubeTranscriptApi()
+
+        # retrieve the available transcripts
+        transcript_list = ytt_api.list('video_id')
+
+        # iterate over all available transcripts
+        for transcript in transcript_list:
+            # the Transcript object provides metadata properties
+            print(
+                transcript.video_id,
+                transcript.language,
+                transcript.language_code,
+                # whether it has been manually created or generated by YouTube
+                transcript.is_generated,
+                # a list of languages the transcript can be translated to
+                transcript.translation_languages,
+            )
+
+            # fetch the actual transcript data
+            print(transcript.fetch())
+
+            # translating the transcript will return another transcript object
+            print(transcript.translate('en').fetch())
+
+        # you can also directly filter for the language you are looking for, using the transcript list
+        transcript = transcript_list.find_transcript(['de', 'en'])
+
+        # or just filter for manually created transcripts
+        transcript = transcript_list.find_manually_created_transcript(['de', 'en'])
+
+        # or automatically generated ones
+        transcript = transcript_list.find_generated_transcript(['de', 'en'])
+        ```
+
+        :param video_id: the ID of the video you want to retrieve the transcript for.
+            Make sure that this is the actual ID, NOT the full URL to the video!
+        """
+        return self._fetcher.fetch(video_id)  # fetcher is set up in __init__
+
     @classmethod
     def list_transcripts(cls, video_id, proxies=None, cookies=None):
         """
+        DEPRECATED: use the `list` method instead!
+
         Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
         which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
         over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
         metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
-        `transcript.translate('en')`. Example::
+        `transcript.translate('en')`. Example:
 
             # retrieve the available transcripts
-            transcript_list = YouTubeTranscriptApi.get('video_id')
+            transcript_list = YouTubeTranscriptApi.list_transcripts('video_id')
 
             # iterate over all available transcripts
             for transcript in transcript_list:
@@ -64,11 +179,26 @@ def list_transcripts(cls, video_id, proxies=None, cookies=None):
         :return: the list of available transcripts
         :rtype TranscriptList:
         """
-        with requests.Session() as http_client:
-            if cookies:
-                http_client.cookies = cls._load_cookies(cookies, video_id)
-            http_client.proxies = proxies if proxies else {}
-            return TranscriptListFetcher(http_client).fetch(video_id)
+        warnings.warn(
+            "`list_transcripts` is deprecated and will be removed in a future version. "
+            "Use the `list` method instead!",
+            DeprecationWarning,
+        )
+
+        proxy_config = None
+        if proxies:
+            if isinstance(proxies, ProxyConfig):
+                proxy_config = proxies
+            else:
+                proxy_config = GenericProxyConfig(
+                    http_url=proxies.get("http"), https_url=proxies.get("https")
+                )
+
+        ytt_api = YouTubeTranscriptApi(
+            proxy_config=proxy_config,
+            cookie_path=Path(cookies) if cookies else None,
+        )
+        return ytt_api.list(video_id)
 
     @classmethod
     def get_transcripts(
@@ -81,6 +211,8 @@ def get_transcripts(
         preserve_formatting=False,
     ):
         """
+        DEPRECATED: use the `fetch` method instead!
+
         Retrieves the transcripts for a list of videos.
 
         :param video_ids: a list of youtube video ids
@@ -102,6 +234,12 @@ def get_transcripts(
         video ids, which could not be retrieved
         :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}):
         """
+        warnings.warn(
+            "`get_transcripts` is deprecated and will be removed in a future version. "
+            "Use the `fetch` method instead!",
+            DeprecationWarning,
+        )
+
         assert isinstance(video_ids, list), "`video_ids` must be a list of strings"
 
         data = {}
@@ -130,6 +268,8 @@ def get_transcript(
         preserve_formatting=False,
     ):
         """
+        DEPRECATED: use the `fetch` method instead!
+
         Retrieves the transcript for a single video. This is just a shortcut for calling::
 
             YouTubeTranscriptApi.list_transcripts(video_id, proxies).find_transcript(languages).fetch()
@@ -149,20 +289,16 @@ def get_transcript(
         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
         :rtype [{'text': str, 'start': float, 'end': float}]:
         """
+        warnings.warn(
+            "`get_transcript` is deprecated and will be removed in a future version. "
+            "Use the `fetch` method instead!",
+            DeprecationWarning,
+        )
+
         assert isinstance(video_id, str), "`video_id` must be a string"
         return (
             cls.list_transcripts(video_id, proxies, cookies)
             .find_transcript(languages)
             .fetch(preserve_formatting=preserve_formatting)
+            .to_raw_data()
         )
-
-    @classmethod
-    def _load_cookies(cls, cookies, video_id):
-        try:
-            cookie_jar = cookiejar.MozillaCookieJar()
-            cookie_jar.load(cookies)
-            if not cookie_jar:
-                raise CookiesInvalid(video_id)
-            return cookie_jar
-        except CookieLoadError:
-            raise CookiePathInvalid(video_id)