From 359df0fc423b4a5d5af8113d42648fdea22e81ea Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Sat, 27 Nov 2021 07:51:32 +0100 Subject: [PATCH] [nebula] Add NebulaCollectionIE and rewrite extractor (#1694) Closes #1690 Authored by: hheimbuerger --- yt_dlp/extractor/extractors.py | 5 +- yt_dlp/extractor/nebula.py | 374 +++++++++++++++++++-------------- 2 files changed, 217 insertions(+), 162 deletions(-) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index a277bf722..2fb9515c0 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -889,7 +889,10 @@ NJoyEmbedIE, ) from .ndtv import NDTVIE -from .nebula import NebulaIE +from .nebula import ( + NebulaIE, + NebulaCollectionIE, +) from .nerdcubed import NerdCubedFeedIE from .netzkino import NetzkinoIE from .neteasemusic import ( diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 9698a358e..d235805c3 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -1,22 +1,163 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import json import time +import urllib -from urllib.error import HTTPError -from .common import InfoExtractor -from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote from ..utils import ( ExtractorError, parse_iso8601, try_get, - urljoin, ) +from .common import InfoExtractor -class NebulaIE(InfoExtractor): +class NebulaBaseIE(InfoExtractor): + _NETRC_MACHINE = 'watchnebula' + _nebula_api_token = None + _nebula_bearer_token = None + _zype_access_token = None + + def _perform_nebula_auth(self): + username, password = self._get_login_info() + if not (username and password): + self.raise_login_required() + + data = json.dumps({'email': username, 'password': password}).encode('utf8') + response = self._download_json( + 'https://api.watchnebula.com/api/v1/auth/login/', + data=data, fatal=False, video_id=None, + headers={ + 'content-type': 
'application/json', + # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint + 'cookie': '' + }, + note='Logging in to Nebula with supplied credentials', + errnote='Authentication failed or rejected') + if not response or not response.get('key'): + self.raise_login_required() + + # save nebula token as cookie + self._set_cookie( + 'nebula.app', 'nebula-auth', + urllib.parse.quote( + json.dumps({ + "apiToken": response["key"], + "isLoggingIn": False, + "isLoggingOut": False, + }, separators=(",", ":"))), + expire_time=int(time.time()) + 86400 * 365, + ) + + return response['key'] + + def _retrieve_nebula_api_token(self): + """ + Check cookie jar for valid token. Try to authenticate using credentials if no valid token + can be found in the cookie jar. + """ + nebula_cookies = self._get_cookies('https://nebula.app') + nebula_cookie = nebula_cookies.get('nebula-auth') + if nebula_cookie: + self.to_screen('Authenticating to Nebula with token from cookie jar') + nebula_cookie_value = urllib.parse.unquote(nebula_cookie.value) + nebula_api_token = self._parse_json(nebula_cookie_value, None).get('apiToken') + if nebula_api_token: + return nebula_api_token + + return self._perform_nebula_auth() + + def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''): + assert method in ('GET', 'POST',) + assert auth_type in ('api', 'bearer',) + + def inner_call(): + authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}' + return self._download_json( + url, video_id, note=note, headers={'Authorization': authorization}, + data=b'' if method == 'POST' else None) + + try: + return inner_call() + except ExtractorError as exc: + # if 401 or 403, attempt credential re-auth and retry + if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403): + self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error 
{exc.cause.code}') + self._login() + return inner_call() + else: + raise + + def _fetch_nebula_bearer_token(self): + """ + Get a Bearer token for the Nebula API. This will be required to fetch video meta data. + """ + response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/', + method='POST', + note='Authorizing to Nebula') + return response['token'] + + def _fetch_zype_access_token(self): + """ + Get a Zype access token, which is required to access video streams -- in our case: to + generate video URLs. + """ + user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token') + + access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str) + if not access_token: + if try_get(user_object, lambda x: x['is_subscribed'], bool): + # TODO: Reimplement the same Zype token polling the Nebula frontend implements + # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 + raise ExtractorError( + 'Unable to extract Zype access token from Nebula API authentication endpoint. 
' + 'Open an arbitrary video in a browser with this account to generate a token', + expected=True) + raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') + return access_token + + def _build_video_info(self, episode): + zype_id = episode['zype_id'] + zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}' + channel_slug = episode['channel_slug'] + return { + 'id': episode['zype_id'], + 'display_id': episode['slug'], + '_type': 'url_transparent', + 'ie_key': 'Zype', + 'url': zype_video_url, + 'title': episode['title'], + 'description': episode['description'], + 'timestamp': parse_iso8601(episode['published_at']), + 'thumbnails': [{ + # 'id': tn.get('name'), # this appears to be null + 'url': tn['original'], + 'height': key, + } for key, tn in episode['assets']['thumbnail'].items()], + 'duration': episode['duration'], + 'channel': episode['channel_title'], + 'channel_id': channel_slug, + 'channel_url': f'https://nebula.app/{channel_slug}', + 'uploader': episode['channel_title'], + 'uploader_id': channel_slug, + 'uploader_url': f'https://nebula.app/{channel_slug}', + 'series': episode['channel_title'], + 'creator': episode['channel_title'], + } + + def _login(self): + self._nebula_api_token = self._retrieve_nebula_api_token() + self._nebula_bearer_token = self._fetch_nebula_bearer_token() + self._zype_access_token = self._fetch_zype_access_token() + + def _real_initialize(self): + self._login() + + +class NebulaIE(NebulaBaseIE): _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)' _TESTS = [ { @@ -30,12 +171,13 @@ class NebulaIE(InfoExtractor): 'upload_date': '20180731', 'timestamp': 1533009600, 'channel': 'Lindsay Ellis', + 'channel_id': 'lindsayellis', 'uploader': 'Lindsay Ellis', + 'uploader_id': 'lindsayellis', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 
'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', @@ -47,13 +189,14 @@ class NebulaIE(InfoExtractor): 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', 'upload_date': '20200327', 'timestamp': 1585348140, - 'channel': 'The Logistics of D-Day', - 'uploader': 'The Logistics of D-Day', + 'channel': 'Real Engineering', + 'channel_id': 'realengineering', + 'uploader': 'Real Engineering', + 'uploader_id': 'realengineering', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 'https://nebula.app/videos/money-episode-1-the-draw', @@ -66,173 +209,82 @@ class NebulaIE(InfoExtractor): 'upload_date': '20200323', 'timestamp': 1584980400, 'channel': 'Tom Scott Presents: Money', + 'channel_id': 'tom-scott-presents-money', 'uploader': 'Tom Scott Presents: Money', + 'uploader_id': 'tom-scott-presents-money', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', 'only_matching': True, }, ] - _NETRC_MACHINE = 'watchnebula' - _nebula_token = None - - def _retrieve_nebula_auth(self): - """ - Log in to Nebula, and returns a Nebula API token - """ - - username, password = self._get_login_info() - if not (username and password): - self.raise_login_required() - - self.report_login() - data = json.dumps({'email': username, 'password': password}).encode('utf8') - response = self._download_json( - 'https://api.watchnebula.com/api/v1/auth/login/', - data=data, fatal=False, video_id=None, - headers={ - 'content-type': 'application/json', - # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint - 'cookie': '' - }, - note='Authenticating to Nebula with supplied credentials', - errnote='Authentication failed or rejected') - if not response or not response.get('key'): - self.raise_login_required() - - # save nebula token as cookie - 
self._set_cookie( - 'nebula.app', 'nebula-auth', - compat_urllib_parse_quote( - json.dumps({ - "apiToken": response["key"], - "isLoggingIn": False, - "isLoggingOut": False, - }, separators=(",", ":"))), - expire_time=int(time.time()) + 86400 * 365, - ) - - return response['key'] - - def _retrieve_zype_api_key(self, page_url, display_id): - """ - Retrieves the Zype API key - """ - - # Find the js that has the API key from the webpage and download it - webpage = self._download_webpage(page_url, video_id=display_id) - main_script_relpath = self._search_regex( - r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage, - group='script_relpath', name='script relative path', fatal=True) - main_script_abspath = urljoin(page_url, main_script_relpath) - main_script = self._download_webpage(main_script_abspath, video_id=display_id, - note='Retrieving Zype API key') - - api_key = self._search_regex( - r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script, - group='api_key', name='API key', fatal=True) - - return api_key - - def _call_zype_api(self, path, params, video_id, api_key, note): - """ - A helper for making calls to the Zype API. - """ - query = {'api_key': api_key, 'per_page': 1} - query.update(params) - return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note) - - def _call_nebula_api(self, path, video_id, access_token, note): - """ - A helper for making calls to the Nebula API. 
- """ - return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={ - 'Authorization': 'Token {access_token}'.format(access_token=access_token) - }, note=note) - - def _fetch_zype_access_token(self, video_id): - try: - user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') - except ExtractorError as exc: - # if 401, attempt credential auth and retry - if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401: - self._nebula_token = self._retrieve_nebula_auth() - user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') - else: - raise - - access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) - if not access_token: - if try_get(user_object, lambda x: x['is_subscribed'], bool): - # TODO: Reimplement the same Zype token polling the Nebula frontend implements - # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 - raise ExtractorError( - 'Unable to extract Zype access token from Nebula API authentication endpoint. 
' - 'Open an arbitrary video in a browser with this account to generate a token', - expected=True) - raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') - return access_token - - def _extract_channel_title(self, video_meta): - # TODO: Implement the API calls giving us the channel list, - # so that we can do the title lookup and then figure out the channel URL - categories = video_meta.get('categories', []) if video_meta else [] - # the channel name is the value of the first category - for category in categories: - if category.get('value'): - return category['value'][0] - - def _real_initialize(self): - # check cookie jar for valid token - nebula_cookies = self._get_cookies('https://nebula.app') - nebula_cookie = nebula_cookies.get('nebula-auth') - if nebula_cookie: - self.to_screen('Authenticating to Nebula with token from cookie jar') - nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) - self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken') - - # try to authenticate using credentials if no valid token has been found - if not self._nebula_token: - self._nebula_token = self._retrieve_nebula_auth() + def _fetch_video_metadata(self, slug): + return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/', + video_id=slug, + auth_type='bearer', + note='Fetching video meta data') def _real_extract(self, url): - display_id = self._match_id(url) - api_key = self._retrieve_zype_api_key(url, display_id) + slug = self._match_id(url) + video = self._fetch_video_metadata(slug) + return self._build_video_info(video) - response = self._call_zype_api('/videos', {'friendly_title': display_id}, - display_id, api_key, note='Retrieving metadata from Zype') - if len(response.get('response') or []) != 1: - raise ExtractorError('Unable to find video on Zype API') - video_meta = response['response'][0] - video_id = video_meta['_id'] - zype_access_token = 
self._fetch_zype_access_token(display_id) +class NebulaCollectionIE(NebulaBaseIE): + IE_NAME = 'nebula:collection' + _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/(?!videos/)(?P<id>[-\w]+)' + _TESTS = [ + { + 'url': 'https://nebula.app/tom-scott-presents-money', + 'info_dict': { + 'id': 'tom-scott-presents-money', + 'title': 'Tom Scott Presents: Money', + 'description': 'Tom Scott hosts a series all about trust, negotiation and money.', + }, + 'playlist_count': 5, + 'params': { + 'usenetrc': True, + }, + }, { + 'url': 'https://nebula.app/lindsayellis', + 'info_dict': { + 'id': 'lindsayellis', + 'title': 'Lindsay Ellis', + 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', + }, + 'playlist_mincount': 100, + 'params': { + 'usenetrc': True, + }, + }, + ] - channel_title = self._extract_channel_title(video_meta) + def _generate_playlist_entries(self, collection_id, channel): + episodes = channel['episodes']['results'] + for page_num in itertools.count(2): + for episode in episodes: + yield self._build_video_info(episode) + next_url = channel['episodes']['next'] + if not next_url: + break + channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer', + note=f'Retrieving channel page {page_num}') + episodes = channel['episodes']['results'] - return { - 'id': video_id, - 'display_id': display_id, - '_type': 'url_transparent', - 'ie_key': 'Zype', - 'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token), - 'title': video_meta.get('title'), - 'description': video_meta.get('description'), - 'timestamp': parse_iso8601(video_meta.get('published_at')), - 'thumbnails': [{ - 'id': tn.get('name'), # this appears to be null - 'url': tn['url'], - 'width': tn.get('width'), - 'height': tn.get('height'), - } for tn in video_meta.get('thumbnails', [])], - 'duration': video_meta.get('duration'), - 'channel': channel_title, - 'uploader': channel_title, # we chose uploader = channel 
name - # TODO: uploader_url, channel_id, channel_url - } + def _real_extract(self, url): + collection_id = self._match_id(url) + channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/' + channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel') + channel_details = channel['details'] + + return self.playlist_result( + entries=self._generate_playlist_entries(collection_id, channel), + playlist_id=collection_id, + playlist_title=channel_details['title'], + playlist_description=channel_details['description'] + )