diff --git a/README.md b/README.md index 56e4458dc1..dca3fe8d0c 100644 --- a/README.md +++ b/README.md @@ -1781,7 +1781,7 @@ #### youtube * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning * `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage` * `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID) -* `po_token`: Proof of Origin (PO) Token(s) to use for requesting video playback. Comma seperated list of PO Tokens in the format `CLIENT+PO_TOKEN`, e.g. `youtube:po_token=web+XXX,android+YYY` +* `po_token`: Proof of Origin (PO) Token(s) to use. Comma separated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player+XXX,web_safari.gvs+YYY`. Context can be either `gvs` (Google Video Server URLs) or `player` (Innertube player request) #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. 
This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 51ff64f1ac..69db2b1ac7 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -1,4 +1,5 @@ import base64 +import binascii import calendar import collections import copy @@ -69,7 +70,14 @@ ) STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' -STREAMING_DATA_PO_TOKEN = '__yt_dlp_po_token' +STREAMING_DATA_INITIAL_PO_TOKEN = '__yt_dlp_po_token' +PO_TOKEN_GUIDE_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide' + + +class _PoTokenContext(enum.Enum): + PLAYER = 'player' + GVS = 'gvs' + # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { @@ -81,7 +89,7 @@ }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, - 'REQUIRE_PO_TOKEN': True, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'SUPPORTS_COOKIES': True, }, # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats @@ -94,7 +102,7 @@ }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, - 'REQUIRE_PO_TOKEN': True, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'SUPPORTS_COOKIES': True, }, 'web_embedded': { @@ -116,7 +124,7 @@ }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, - 'REQUIRE_PO_TOKEN': True, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video @@ -128,7 +136,7 @@ }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, - 'REQUIRE_PO_TOKEN': True, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'REQUIRE_AUTH': True, 'SUPPORTS_COOKIES': True, }, @@ -145,7 +153,7 @@ }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False, - 'REQUIRE_PO_TOKEN': True, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], }, # This client now requires sign-in for every video 'android_music': { @@ -161,7 +169,7 @@ }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'REQUIRE_JS_PLAYER': False, - 
'REQUIRE_PO_TOKEN': True, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'REQUIRE_AUTH': True, }, # This client now requires sign-in for every video @@ -178,7 +186,7 @@ }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'REQUIRE_JS_PLAYER': False, - 'REQUIRE_PO_TOKEN': True, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'REQUIRE_AUTH': True, }, # YouTube Kids videos aren't returned on this client for some reason @@ -213,8 +221,8 @@ }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'REQUIRE_JS_PLAYER': False, - 'REQUIRE_PO_TOKEN': True, }, # This client now requires sign-in for every video 'ios_music': { @@ -231,7 +239,7 @@ }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, 'REQUIRE_JS_PLAYER': False, - 'REQUIRE_PO_TOKEN': True, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'REQUIRE_AUTH': True, }, # This client now requires sign-in for every video @@ -249,7 +257,7 @@ }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, 'REQUIRE_JS_PLAYER': False, - 'REQUIRE_PO_TOKEN': True, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'REQUIRE_AUTH': True, }, # mweb has 'ultralow' formats @@ -264,7 +272,7 @@ }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, - 'REQUIRE_PO_TOKEN': True, + 'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS], 'SUPPORTS_COOKIES': True, }, 'tv': { @@ -318,7 +326,7 @@ def build_innertube_clients(): for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) - ytcfg.setdefault('REQUIRE_PO_TOKEN', False) + ytcfg.setdefault('PO_TOKEN_REQUIRED_CONTEXTS', []) ytcfg.setdefault('REQUIRE_AUTH', False) ytcfg.setdefault('SUPPORTS_COOKIES', False) ytcfg.setdefault('PLAYER_PARAMS', None) @@ -3842,53 +3850,105 @@ def _generate_player_context(cls, sts=None): **cls._get_checkok_params(), } - def _get_config_po_token(self, client): + def _get_config_po_token(self, client: str, context: _PoTokenContext): po_token_strs = 
self._configuration_arg('po_token', [], ie_key=YoutubeIE, casesense=True) for token_str in po_token_strs: - po_token_client, sep, po_token = token_str.partition('+') + po_token_meta, sep, po_token = token_str.partition('+') if not sep: self.report_warning( - f'Invalid po_token configuration format. Expected "client+po_token", got "{token_str}"', only_once=True) + f'Invalid po_token configuration format. ' + f'Expected "CLIENT.CONTEXT+PO_TOKEN", got "{token_str}"', only_once=True) continue - if po_token_client == client: - return po_token - def fetch_po_token(self, client='web', visitor_data=None, data_sync_id=None, player_url=None, **kwargs): - # PO Token is bound to visitor_data / Visitor ID when logged out. Must have visitor_data for it to function. - if not visitor_data and not self.is_authenticated and player_url: + po_token_client, sep, po_token_context = po_token_meta.partition('.') + if po_token_client.lower() != client: + continue + + if not sep: + # TODO(future): deprecate the old format? + self.write_debug( + f'po_token configuration for {client} client is missing a context; assuming GVS. ' + 'You can provide a context with the format "CLIENT.CONTEXT+PO_TOKEN"', + only_once=True) + po_token_context = _PoTokenContext.GVS.value + + if po_token_context.lower() != context.value: + continue + + # Clean and validate the PO Token. This will strip invalid characters off + # (e.g. 
additional url params the user may accidentally include) + try: + return base64.urlsafe_b64encode(base64.urlsafe_b64decode(urllib.parse.unquote(po_token))).decode() + except (binascii.Error, ValueError): + self.report_warning( + f'Invalid po_token configuration for {client} client: ' + f'{po_token_context} PO Token should be a base64url-encoded string.', + only_once=True) + continue + + def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, visitor_data=None, + data_sync_id=None, session_index=None, player_url=None, video_id=None, **kwargs): + """ + Fetch a PO Token for a given client and context. This function will validate required parameters for a given context and client. + + EXPERIMENTAL: This method is unstable and may change or be removed without notice. + + @param client: The client to fetch the PO Token for. + @param context: The context in which the PO Token is used. + @param ytcfg: The ytcfg for the client. + @param visitor_data: visitor data. + @param data_sync_id: data sync ID. + @param session_index: session index. + @param player_url: player URL. + @param video_id: video ID. + @param kwargs: Additional arguments to pass down. May be more added in the future. + @return: The fetched PO Token. None if it could not be fetched. + """ + + # GVS WebPO Token is bound to visitor_data / Visitor ID when logged out. + # Must have visitor_data for it to function. + if player_url and context == _PoTokenContext.GVS and not visitor_data and not self.is_authenticated: self.report_warning( - f'Unable to fetch PO Token for {client} client: Missing required Visitor Data. ' + f'Unable to fetch GVS PO Token for {client} client: Missing required Visitor Data. 
' f'You may need to pass Visitor Data with --extractor-args "youtube:visitor_data=XXX"') return - config_po_token = self._get_config_po_token(client) + if context == _PoTokenContext.PLAYER and not video_id: + self.report_warning( + f'Unable to fetch Player PO Token for {client} client: Missing required Video ID') + return + + config_po_token = self._get_config_po_token(client, context) if config_po_token: - # PO token is bound to data_sync_id / account Session ID when logged in. However, for the config po_token, - # if using first channel in an account then we don't need the data_sync_id anymore... - if not data_sync_id and self.is_authenticated and player_url: + # GVS WebPO token is bound to data_sync_id / account Session ID when logged in. + if player_url and context == _PoTokenContext.GVS and not data_sync_id and self.is_authenticated: self.report_warning( - f'Got a PO Token for {client} client, but missing Data Sync ID for account. Formats may not work.' + f'Got a GVS PO Token for {client} client, but missing Data Sync ID for account. Formats may not work. ' f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') return config_po_token - # Require PO Token if logged in for external fetching - if not data_sync_id and self.is_authenticated and player_url: + # Require GVS WebPO Token if logged in for external fetching + if player_url and context == _PoTokenContext.GVS and not data_sync_id and self.is_authenticated: self.report_warning( - f'Unable to fetch PO Token for {client} client: Missing required Data Sync ID for account. + f'Unable to fetch GVS PO Token for {client} client: Missing required Data Sync ID for account. 
' f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') return return self._fetch_po_token( client=client, + context=context.value, + ytcfg=ytcfg, visitor_data=visitor_data, data_sync_id=data_sync_id, + session_index=session_index, player_url=player_url, + video_id=video_id, **kwargs, ) - def _fetch_po_token(self, client, visitor_data=None, data_sync_id=None, player_url=None, **kwargs): - """External PO Token fetch stub""" + def _fetch_po_token(self, client, **kwargs): + """(Unstable) External PO Token fetch stub""" @staticmethod def _is_agegated(player_response): @@ -4036,17 +4096,47 @@ def append_client(*client_names): visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) - po_token = self.fetch_po_token( - client=client, visitor_data=visitor_data, - data_sync_id=data_sync_id if self.is_authenticated else None, - player_url=player_url if require_js_player else None, - ) - require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN') - if not po_token and require_po_token and 'missing_pot' in self._configuration_arg('formats'): + fetch_po_token_args = { + 'client': client, + 'visitor_data': visitor_data, + 'video_id': video_id, + 'data_sync_id': data_sync_id if self.is_authenticated else None, + 'player_url': player_url if require_js_player else None, + 'session_index': self._extract_session_index(master_ytcfg, player_ytcfg), + 'ytcfg': player_ytcfg, + } + + player_po_token = self.fetch_po_token( + context=_PoTokenContext.PLAYER, **fetch_po_token_args) + + gvs_po_token = self.fetch_po_token( + context=_PoTokenContext.GVS, **fetch_po_token_args) + + required_pot_contexts = self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] + + if ( + not player_po_token + and _PoTokenContext.PLAYER in required_pot_contexts + ): + # TODO: may need to skip player response request. 
Unsure yet.. self.report_warning( - f'No PO Token provided for {client} client, ' - f'which may be required for working {client} formats. This client will be deprioritized', only_once=True) + f'No Player PO Token provided for {client} client, ' + f'which may be required for working {client} formats. This client will be deprioritized. ' + f'You can manually pass a Player PO Token for this client with --extractor-args "youtube:po_token={client}.player+XXX". ' + f'For more information, refer to {PO_TOKEN_GUIDE_URL} .', only_once=True) + deprioritize_pr = True + + if ( + not gvs_po_token + and _PoTokenContext.GVS in required_pot_contexts + and 'missing_pot' in self._configuration_arg('formats') + ): + # note: warning with help message is provided later during format processing + self.report_warning( + f'No GVS PO Token provided for {client} client, ' + f'which may be required for working {client} formats. This client will be deprioritized', + only_once=True) deprioritize_pr = True pr = initial_pr if client == 'web' else None @@ -4059,7 +4149,7 @@ def append_client(*client_names): initial_pr=initial_pr, visitor_data=visitor_data, data_sync_id=data_sync_id, - po_token=po_token) + po_token=player_po_token) except ExtractorError as e: self.report_warning(e) continue @@ -4070,10 +4160,10 @@ def append_client(*client_names): # Save client name for introspection later sd = traverse_obj(pr, ('streamingData', {dict})) or {} sd[STREAMING_DATA_CLIENT_NAME] = client - sd[STREAMING_DATA_PO_TOKEN] = po_token + sd[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): f[STREAMING_DATA_CLIENT_NAME] = client - f[STREAMING_DATA_PO_TOKEN] = po_token + f[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token if deprioritize_pr: deprioritized_prs.append(pr) else: @@ -4099,10 +4189,10 @@ def _needs_live_processing(self, live_status, duration): def _report_pot_format_skipped(self, video_id, client_name, proto): msg = ( - f'{video_id}: 
{client_name} client {proto} formats require a PO Token which was not provided. ' + f'{video_id}: {client_name} client {proto} formats require a GVS PO Token which was not provided. ' 'They will be skipped as they may yield HTTP Error 403. ' - f'You can manually pass a PO Token for this client with --extractor-args "youtube:po_token={client_name}+XXX". ' - 'For more information, refer to https://github.com/yt-dlp/yt-dlp/wiki/Extractors#po-token-guide . ' + f'You can manually pass a GVS PO Token for this client with --extractor-args "youtube:po_token={client_name}.gvs+XXX". ' + f'For more information, refer to {PO_TOKEN_GUIDE_URL} . ' 'To enable these broken formats anyway, pass --extractor-args "youtube:formats=missing_pot"') # Only raise a warning for non-default clients, to not confuse users. @@ -4232,13 +4322,17 @@ def build_fragments(f): f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) client_name = fmt[STREAMING_DATA_CLIENT_NAME] - po_token = fmt.get(STREAMING_DATA_PO_TOKEN) + po_token = fmt.get(STREAMING_DATA_INITIAL_PO_TOKEN) if po_token: fmt_url = update_url_query(fmt_url, {'pot': po_token}) # Clients that require PO Token return videoplayback URLs that may return 403 - require_po_token = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) + require_po_token = ( + not po_token + and _PoTokenContext.GVS in self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS'] + and itag not in ['18']) # these formats do not require PO Token + if require_po_token and 'missing_pot' not in self._configuration_arg('formats'): self._report_pot_format_skipped(video_id, client_name, 'https') continue @@ -4327,7 +4421,11 @@ def process_manifest_format(f, proto, client_name, itag, po_token): # Clients that require PO Token return videoplayback URLs that may return 403 # hls does not currently require PO Token - if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and 
proto != 'hls': + if ( + not po_token + and _PoTokenContext.GVS in self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS'] + and proto != 'hls' + ): if 'missing_pot' not in self._configuration_arg('formats'): self._report_pot_format_skipped(video_id, client_name, proto) return False @@ -4368,7 +4466,7 @@ def process_manifest_format(f, proto, client_name, itag, po_token): subtitles = {} for sd in streaming_data: client_name = sd[STREAMING_DATA_CLIENT_NAME] - po_token = sd.get(STREAMING_DATA_PO_TOKEN) + po_token = sd.get(STREAMING_DATA_INITIAL_PO_TOKEN) hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') if hls_manifest_url: if po_token: