From f3aa3c3f98e50f4f25d8744a97f642f5eb589ac9 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Mon, 20 Dec 2021 17:47:53 +1300 Subject: [PATCH] [youtube:tab] Extract more metadata from feeds/channels/playlists (#1018) Parse relative time text, extract live, upcoming status, availability and channel id from feeds/channels/playlists (where applicable). Closes #1883 Authored-by: coletdjnz --- yt_dlp/extractor/youtube.py | 97 ++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 20452bb70..5a3b98bb5 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -55,6 +55,7 @@ smuggle_url, str_or_none, str_to_int, + strftime_or_none, traverse_obj, try_get, unescapeHTML, @@ -358,7 +359,20 @@ def _initialize_consent(self): consent_id = random.randint(100, 999) self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _initialize_pref(self): + cookies = self._get_cookies('https://www.youtube.com/') + pref_cookie = cookies.get('PREF') + pref = {} + if pref_cookie: + try: + pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) + except ValueError: + self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) + pref.update({'hl': 'en'}) + self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) + def _real_initialize(self): + self._initialize_pref() self._initialize_consent() self._login() @@ -391,23 +405,10 @@ def _extract_api_key(self, ytcfg=None, default_client='web'): return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) def _extract_context(self, ytcfg=None, default_client='web'): - _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict) - context = _get_context(ytcfg) - if context: - return context - - context = _get_context(self._get_default_ytcfg(default_client)) - if not ytcfg: - return context - - # Recreate the client context (required) - context['client'].update({ - 'clientVersion': self._extract_client_version(ytcfg, default_client), - 'clientName': self._extract_client_name(ytcfg, default_client), - }) - visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str) - if visitor_data: - context['client']['visitorData'] = visitor_data + context = get_first( + (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) + # Enforce language for extraction + traverse_obj(context, 'client', expected_type=dict, default={})['hl'] = 'en' return context _SAPISID = None @@ -664,6 +665,29 @@ def _get_text(data, *path_list, max_runs=None): if text: return text + @staticmethod + def extract_relative_time(relative_time_text): + """ + Extracts a relative time from string and converts to dt object + e.g. 'streamed 6 days ago', '5 seconds ago (edited)' + """ + mobj = re.search(r'(?P