[nebula] Add NebulaCollectionIE and rewrite extractor (#1694)

Closes #1690 Authored by: hheimbuerger
2024-11-30 04:10:23 +00:00 · 2021-11-27 07:51:32 +01:00 · 2021-11-27 07:51:32 +01:00 · 359df0fc42
commit 359df0fc42
parent 3938a9212c
2 changed files with 217 additions and 162 deletions
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@ -889,7 +889,10 @@
    NJoyEmbedIE,
 )
 from .ndtv import NDTVIE
-from .nebula import NebulaIE
+from .nebula import (
    NebulaIE,
    NebulaCollectionIE,
 )
 from .nerdcubed import NerdCubedFeedIE
 from .netzkino import NetzkinoIE
 from .neteasemusic import (
--- a/yt_dlp/extractor/nebula.py
+++ b/yt_dlp/extractor/nebula.py
@ -1,22 +1,163 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import itertools
 import json
 import time
 import urllib
 from urllib.error import HTTPError
 from .common import InfoExtractor
 from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote
 from ..utils import (
    ExtractorError,
    parse_iso8601,
    try_get,
-    urljoin,
+)
 from .common import InfoExtractor
 class NebulaBaseIE(InfoExtractor):
    _NETRC_MACHINE = 'watchnebula'
    _nebula_api_token = None
    _nebula_bearer_token = None
    _zype_access_token = None
    def _perform_nebula_auth(self):
        username, password = self._get_login_info()
        if not (username and password):
            self.raise_login_required()
        data = json.dumps({'email': username, 'password': password}).encode('utf8')
        response = self._download_json(
            'https://api.watchnebula.com/api/v1/auth/login/',
            data=data, fatal=False, video_id=None,
            headers={
                'content-type': 'application/json',
                # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
                'cookie': ''
            },
            note='Logging in to Nebula with supplied credentials',
            errnote='Authentication failed or rejected')
        if not response or not response.get('key'):
            self.raise_login_required()
        # save nebula token as cookie
        self._set_cookie(
            'nebula.app', 'nebula-auth',
            urllib.parse.quote(
                json.dumps({
                    "apiToken": response["key"],
                    "isLoggingIn": False,
                    "isLoggingOut": False,
                }, separators=(",", ":"))),
            expire_time=int(time.time()) + 86400 * 365,
        )
        return response['key']
-class NebulaIE(InfoExtractor):
+    def _retrieve_nebula_api_token(self):
        """
        Check cookie jar for valid token. Try to authenticate using credentials if no valid token
        can be found in the cookie jar.
        """
        nebula_cookies = self._get_cookies('https://nebula.app')
        nebula_cookie = nebula_cookies.get('nebula-auth')
        if nebula_cookie:
            self.to_screen('Authenticating to Nebula with token from cookie jar')
            nebula_cookie_value = urllib.parse.unquote(nebula_cookie.value)
            nebula_api_token = self._parse_json(nebula_cookie_value, None).get('apiToken')
            if nebula_api_token:
                return nebula_api_token
        return self._perform_nebula_auth()
    def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''):
        assert method in ('GET', 'POST',)
        assert auth_type in ('api', 'bearer',)
        def inner_call():
            authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}'
            return self._download_json(
                url, video_id, note=note, headers={'Authorization': authorization},
                data=b'' if method == 'POST' else None)
        try:
            return inner_call()
        except ExtractorError as exc:
            # if 401 or 403, attempt credential re-auth and retry
            if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403):
                self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}')
                self._login()
                return inner_call()
            else:
                raise
    def _fetch_nebula_bearer_token(self):
        """
        Get a Bearer token for the Nebula API. This will be required to fetch video meta data.
        """
        response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/',
                                         method='POST',
                                         note='Authorizing to Nebula')
        return response['token']
    def _fetch_zype_access_token(self):
        """
        Get a Zype access token, which is required to access video streams -- in our case: to
        generate video URLs.
        """
        user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token')
        access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str)
        if not access_token:
            if try_get(user_object, lambda x: x['is_subscribed'], bool):
                # TODO: Reimplement the same Zype token polling the Nebula frontend implements
                # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
                raise ExtractorError(
                    'Unable to extract Zype access token from Nebula API authentication endpoint. '
                    'Open an arbitrary video in a browser with this account to generate a token',
                    expected=True)
            raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
        return access_token
    def _build_video_info(self, episode):
        zype_id = episode['zype_id']
        zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}'
        channel_slug = episode['channel_slug']
        return {
            'id': episode['zype_id'],
            'display_id': episode['slug'],
            '_type': 'url_transparent',
            'ie_key': 'Zype',
            'url': zype_video_url,
            'title': episode['title'],
            'description': episode['description'],
            'timestamp': parse_iso8601(episode['published_at']),
            'thumbnails': [{
                # 'id': tn.get('name'),  # this appears to be null
                'url': tn['original'],
                'height': key,
            } for key, tn in episode['assets']['thumbnail'].items()],
            'duration': episode['duration'],
            'channel': episode['channel_title'],
            'channel_id': channel_slug,
            'channel_url': f'https://nebula.app/{channel_slug}',
            'uploader': episode['channel_title'],
            'uploader_id': channel_slug,
            'uploader_url': f'https://nebula.app/{channel_slug}',
            'series': episode['channel_title'],
            'creator': episode['channel_title'],
        }
    def _login(self):
        self._nebula_api_token = self._retrieve_nebula_api_token()
        self._nebula_bearer_token = self._fetch_nebula_bearer_token()
        self._zype_access_token = self._fetch_zype_access_token()
    def _real_initialize(self):
        self._login()
 class NebulaIE(NebulaBaseIE):
    _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)'
    _TESTS = [
        {
@ -30,12 +171,13 @@ class NebulaIE(InfoExtractor):
                'upload_date': '20180731',
                'timestamp': 1533009600,
                'channel': 'Lindsay Ellis',
                'channel_id': 'lindsayellis',
                'uploader': 'Lindsay Ellis',
                'uploader_id': 'lindsayellis',
            },
            'params': {
                'usenetrc': True,
            },
            'skip': 'All Nebula content requires authentication',
        },
        {
            'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
@ -47,13 +189,14 @@ class NebulaIE(InfoExtractor):
                'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
                'upload_date': '20200327',
                'timestamp': 1585348140,
-                'channel': 'The Logistics of D-Day',
+                'channel': 'Real Engineering',
-                'uploader': 'The Logistics of D-Day',
+                'channel_id': 'realengineering',
                'uploader': 'Real Engineering',
                'uploader_id': 'realengineering',
            },
            'params': {
                'usenetrc': True,
            },
            'skip': 'All Nebula content requires authentication',
        },
        {
            'url': 'https://nebula.app/videos/money-episode-1-the-draw',
@ -66,173 +209,82 @@ class NebulaIE(InfoExtractor):
                'upload_date': '20200323',
                'timestamp': 1584980400,
                'channel': 'Tom Scott Presents: Money',
                'channel_id': 'tom-scott-presents-money',
                'uploader': 'Tom Scott Presents: Money',
                'uploader_id': 'tom-scott-presents-money',
            },
            'params': {
                'usenetrc': True,
            },
            'skip': 'All Nebula content requires authentication',
        },
        {
            'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
            'only_matching': True,
        },
    ]
    _NETRC_MACHINE = 'watchnebula'
-    _nebula_token = None
+    def _fetch_video_metadata(self, slug):
-
+        return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/',
-    def _retrieve_nebula_auth(self):
+                                     video_id=slug,
-        """
+                                     auth_type='bearer',
-        Log in to Nebula, and returns a Nebula API token
+                                     note='Fetching video meta data')
        """
        username, password = self._get_login_info()
        if not (username and password):
            self.raise_login_required()
        self.report_login()
        data = json.dumps({'email': username, 'password': password}).encode('utf8')
        response = self._download_json(
            'https://api.watchnebula.com/api/v1/auth/login/',
            data=data, fatal=False, video_id=None,
            headers={
                'content-type': 'application/json',
                # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
                'cookie': ''
            },
            note='Authenticating to Nebula with supplied credentials',
            errnote='Authentication failed or rejected')
        if not response or not response.get('key'):
            self.raise_login_required()
        # save nebula token as cookie
        self._set_cookie(
            'nebula.app', 'nebula-auth',
            compat_urllib_parse_quote(
                json.dumps({
                    "apiToken": response["key"],
                    "isLoggingIn": False,
                    "isLoggingOut": False,
                }, separators=(",", ":"))),
            expire_time=int(time.time()) + 86400 * 365,
        )
        return response['key']
    def _retrieve_zype_api_key(self, page_url, display_id):
        """
        Retrieves the Zype API key
        """
        # Find the js that has the API key from the webpage and download it
        webpage = self._download_webpage(page_url, video_id=display_id)
        main_script_relpath = self._search_regex(
            r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage,
            group='script_relpath', name='script relative path', fatal=True)
        main_script_abspath = urljoin(page_url, main_script_relpath)
        main_script = self._download_webpage(main_script_abspath, video_id=display_id,
                                             note='Retrieving Zype API key')
        api_key = self._search_regex(
            r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script,
            group='api_key', name='API key', fatal=True)
        return api_key
    def _call_zype_api(self, path, params, video_id, api_key, note):
        """
        A helper for making calls to the Zype API.
        """
        query = {'api_key': api_key, 'per_page': 1}
        query.update(params)
        return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note)
    def _call_nebula_api(self, path, video_id, access_token, note):
        """
        A helper for making calls to the Nebula API.
        """
        return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={
            'Authorization': 'Token {access_token}'.format(access_token=access_token)
        }, note=note)
    def _fetch_zype_access_token(self, video_id):
        try:
            user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')
        except ExtractorError as exc:
            # if 401, attempt credential auth and retry
            if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401:
                self._nebula_token = self._retrieve_nebula_auth()
                user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token')
            else:
                raise
        access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str)
        if not access_token:
            if try_get(user_object, lambda x: x['is_subscribed'], bool):
                # TODO: Reimplement the same Zype token polling the Nebula frontend implements
                # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
                raise ExtractorError(
                    'Unable to extract Zype access token from Nebula API authentication endpoint. '
                    'Open an arbitrary video in a browser with this account to generate a token',
                    expected=True)
            raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
        return access_token
    def _extract_channel_title(self, video_meta):
        # TODO: Implement the API calls giving us the channel list,
        # so that we can do the title lookup and then figure out the channel URL
        categories = video_meta.get('categories', []) if video_meta else []
        # the channel name is the value of the first category
        for category in categories:
            if category.get('value'):
                return category['value'][0]
    def _real_initialize(self):
        # check cookie jar for valid token
        nebula_cookies = self._get_cookies('https://nebula.app')
        nebula_cookie = nebula_cookies.get('nebula-auth')
        if nebula_cookie:
            self.to_screen('Authenticating to Nebula with token from cookie jar')
            nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value)
            self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken')
        # try to authenticate using credentials if no valid token has been found
        if not self._nebula_token:
            self._nebula_token = self._retrieve_nebula_auth()
    def _real_extract(self, url):
-        display_id = self._match_id(url)
+        slug = self._match_id(url)
-        api_key = self._retrieve_zype_api_key(url, display_id)
+        video = self._fetch_video_metadata(slug)
        return self._build_video_info(video)
        response = self._call_zype_api('/videos', {'friendly_title': display_id},
                                       display_id, api_key, note='Retrieving metadata from Zype')
        if len(response.get('response') or []) != 1:
            raise ExtractorError('Unable to find video on Zype API')
        video_meta = response['response'][0]
-        video_id = video_meta['_id']
+class NebulaCollectionIE(NebulaBaseIE):
-        zype_access_token = self._fetch_zype_access_token(display_id)
+    IE_NAME = 'nebula:collection'
    _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/(?!videos/)(?P<id>[-\w]+)'
    _TESTS = [
        {
            'url': 'https://nebula.app/tom-scott-presents-money',
            'info_dict': {
                'id': 'tom-scott-presents-money',
                'title': 'Tom Scott Presents: Money',
                'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
            },
            'playlist_count': 5,
            'params': {
                'usenetrc': True,
            },
        }, {
            'url': 'https://nebula.app/lindsayellis',
            'info_dict': {
                'id': 'lindsayellis',
                'title': 'Lindsay Ellis',
                'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
            },
            'playlist_mincount': 100,
            'params': {
                'usenetrc': True,
            },
        },
    ]
-        channel_title = self._extract_channel_title(video_meta)
+    def _generate_playlist_entries(self, collection_id, channel):
        episodes = channel['episodes']['results']
        for page_num in itertools.count(2):
            for episode in episodes:
                yield self._build_video_info(episode)
            next_url = channel['episodes']['next']
            if not next_url:
                break
            channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer',
                                            note=f'Retrieving channel page {page_num}')
            episodes = channel['episodes']['results']
-        return {
+    def _real_extract(self, url):
-            'id': video_id,
+        collection_id = self._match_id(url)
-            'display_id': display_id,
+        channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/'
-            '_type': 'url_transparent',
+        channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel')
-            'ie_key': 'Zype',
+        channel_details = channel['details']
-            'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token),
+
-            'title': video_meta.get('title'),
+        return self.playlist_result(
-            'description': video_meta.get('description'),
+            entries=self._generate_playlist_entries(collection_id, channel),
-            'timestamp': parse_iso8601(video_meta.get('published_at')),
+            playlist_id=collection_id,
-            'thumbnails': [{
+            playlist_title=channel_details['title'],
-                'id': tn.get('name'),  # this appears to be null
+            playlist_description=channel_details['description']
-                'url': tn['url'],
+        )
                'width': tn.get('width'),
                'height': tn.get('height'),
            } for tn in video_meta.get('thumbnails', [])],
            'duration': video_meta.get('duration'),
            'channel': channel_title,
            'uploader': channel_title,  # we chose uploader = channel name
            # TODO: uploader_url, channel_id, channel_url
        }