From bb69f5dab79fb32c4ec0d50e05f7fa26d05d54ba Mon Sep 17 00:00:00 2001 From: InvalidUsernameException Date: Sun, 26 Jan 2025 01:29:57 +0100 Subject: [PATCH] [ie/zdf] Fix extractors (#11041) Closes #4782, Closes #10672 Authored by: InvalidUsernameException --- yt_dlp/extractor/zdf.py | 76 +++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index a862e25d07..703766cd7b 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -5,7 +5,6 @@ NO_DEFAULT, ExtractorError, determine_ext, - extract_attributes, float_or_none, int_or_none, join_nonempty, @@ -25,6 +24,11 @@ class ZDFBaseIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd') + def _download_v2_doc(self, document_id): + return self._download_json( + f'https://zdf-prod-futura.zdf.de/mediathekV2/document/{document_id}', + document_id) + def _call_api(self, url, video_id, item, api_token=None, referrer=None): headers = {} if api_token: @@ -320,9 +324,7 @@ def _extract_regular(self, url, player, video_id): return self._extract_entry(player['content'], player, content, video_id) def _extract_mobile(self, video_id): - video = self._download_json( - f'https://zdf-cdn.live.cellular.de/mediathekV2/document/{video_id}', - video_id) + video = self._download_v2_doc(video_id) formats = [] formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) @@ -374,7 +376,7 @@ def _real_extract(self, url): class ZDFChannelIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/?#]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', 'info_dict': { @@ -387,18 +389,19 @@ class ZDFChannelIE(ZDFBaseIE): 'info_dict': { 'id': 'planet-e', 'title': 'planet e.', + 'description': 'md5:87e3b9c66a63cf1407ee443d2c4eb88e', }, 'playlist_mincount': 50, }, { 'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest', 'info_dict': { 'id': 'aktenzeichen-xy-ungeloest', - 'title': 'Aktenzeichen XY... ungelöst', - 'entries': "lambda x: not any('xy580-fall1-kindermoerder-gesucht-100' in e['url'] for e in x)", + 'title': 'Aktenzeichen XY... Ungelöst', + 'description': 'md5:623ede5819c400c6d04943fa8100e6e7', }, 'playlist_mincount': 2, }, { - 'url': 'https://www.zdf.de/filme/taunuskrimi/', + 'url': 'https://www.zdf.de/serien/taunuskrimi/', 'only_matching': True, }] @@ -406,36 +409,41 @@ class ZDFChannelIE(ZDFBaseIE): def suitable(cls, url): return False if ZDFIE.suitable(url) else super().suitable(url) - def _og_search_title(self, webpage, fatal=False): - title = super()._og_search_title(webpage, fatal=fatal) - return re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', title or '')[0] or None + def _extract_entry(self, entry): + return self.url_result( + entry['sharingUrl'], ZDFIE, **traverse_obj(entry, { + 'id': ('basename', {str}), + 'title': ('titel', {str}), + 'description': ('beschreibung', {str}), + 'duration': ('length', {float_or_none}), + # TODO: seasonNumber and episodeNumber can be extracted but need to also be in ZDFIE + })) + + def _entries(self, data, document_id): + for entry in traverse_obj(data, ( + 'cluster', lambda _, v: v['type'] == 'teaser', + # If 'brandId' differs, it is a 'You might also like' video. Filter these out + 'teaser', lambda _, v: v['type'] == 'video' and v['brandId'] == document_id and v['sharingUrl'], + )): + yield self._extract_entry(entry) def _real_extract(self, url): channel_id = self._match_id(url) - webpage = self._download_webpage(url, channel_id) + document_id = self._search_regex( + r'docId\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'document id', group='doc_id') + data = self._download_v2_doc(document_id) - matches = re.finditer( - rf''']*?\sdata-plusbar-id\s*=\s*(["'])(?P[\w-]+)\1[^>]*?\sdata-plusbar-url=\1(?P{ZDFIE._VALID_URL})\1''', - webpage) + main_video = traverse_obj(data, ( + 'cluster', lambda _, v: v['type'] == 'teaserContent', + 'teaser', lambda _, v: v['type'] == 'video' and v['basename'] and v['sharingUrl'], any)) or {} - if self._downloader.params.get('noplaylist', False): - entry = next( - (self.url_result(m.group('url'), ie=ZDFIE.ie_key()) for m in matches), - None) - self.to_screen('Downloading just the main video because of --no-playlist') - if entry: - return entry - else: - self.to_screen(f'Downloading playlist {channel_id} - add --no-playlist to download just the main video') + if not self._yes_playlist(channel_id, main_video.get('basename')): + return self._extract_entry(main_video) - def check_video(m): - v_ref = self._search_regex( - r'''(]*?\shref\s*=[^>]+?\sdata-target-id\s*=\s*(["']){}\2[^>]*>)'''.format(m.group('p_id')), - webpage, 'check id', default='') - v_ref = extract_attributes(v_ref) - return v_ref.get('data-target-video-type') != 'novideo' - - return self.playlist_from_matches( - (m.group('url') for m in matches if check_video(m)), - channel_id, self._og_search_title(webpage, fatal=False)) + return self.playlist_result( + self._entries(data, document_id), channel_id, + re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', self._og_search_title(webpage) or '')[0] or None, + join_nonempty( + 'headline', 'text', delim='\n\n', + from_dict=traverse_obj(data, ('shortText', {dict}), default={})) or None)