From a0fe51d5623a18eb7c2c460a3d35f916e1752504 Mon Sep 17 00:00:00 2001 From: Teemu Ikonen Date: Sat, 7 May 2022 14:24:41 +0300 Subject: [PATCH] [ruutu] Support hs.fi embeds (#3547) Authored by: tpikonen, pukkandan --- yt_dlp/extractor/generic.py | 29 +++++++++++++++++++++--- yt_dlp/extractor/ruutu.py | 45 ++++++++++++++++++++++++++++--------- 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 8192fbb86..340161a42 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2517,6 +2517,29 @@ class GenericIE(InfoExtractor): 'upload_date': '20220308', }, }, + { + # Multiple Ruutu embeds + 'url': 'https://www.hs.fi/kotimaa/art-2000008762560.html', + 'info_dict': { + 'title': 'Koronavirus | Epidemiahuippu voi olla Suomessa ohi, mutta koronaviruksen poistamista yleisvaarallisten tautien joukosta harkitaan vasta syksyllä', + 'id': 'art-2000008762560' + }, + 'playlist_count': 3 + }, + { + # Ruutu embed in hs.fi with a single video + 'url': 'https://www.hs.fi/kotimaa/art-2000008793421.html', + 'md5': 'f8964e65d8fada6e8a562389bf366bb4', + 'info_dict': { + 'id': '4081841', + 'ext': 'mp4', + 'title': 'Puolustusvoimat siirsi panssariajoneuvoja harjoituksiin Niinisaloon 2.5.2022', + 'thumbnail': r're:^https?://.+\.jpg$', + 'duration': 138, + 'age_limit': 0, + 'upload_date': '20220504', + }, + }, ] def report_following_redirect(self, new_url): @@ -3749,9 +3772,9 @@ def _real_extract(self, url): return self.playlist_from_matches(panopto_urls, video_id, video_title) # Look for Ruutu embeds - ruutu_url = RuutuIE._extract_url(webpage) - if ruutu_url: - return self.url_result(ruutu_url, RuutuIE) + ruutu_urls = RuutuIE._extract_urls(webpage) + if ruutu_urls: + return self.playlist_from_matches(ruutu_urls, video_id, video_title) # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') diff --git a/yt_dlp/extractor/ruutu.py b/yt_dlp/extractor/ruutu.py index f5dadf278..c6d94c100 100644 --- a/yt_dlp/extractor/ruutu.py +++ b/yt_dlp/extractor/ruutu.py @@ -38,6 +38,7 @@ class RuutuIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 114, 'age_limit': 0, + 'upload_date': '20150508', }, }, { @@ -51,6 +52,9 @@ class RuutuIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 40, 'age_limit': 0, + 'upload_date': '20150507', + 'series': 'Superpesis', + 'categories': ['Urheilu'], }, }, { @@ -63,6 +67,8 @@ class RuutuIE(InfoExtractor): 'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe', 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 0, + 'upload_date': '20151012', + 'series': 'Läpivalaisu', }, }, # Episode where is "NOT-USED", but has other @@ -82,6 +88,9 @@ class RuutuIE(InfoExtractor): 'description': 'md5:bbb6963df17dfd0ecd9eb9a61bf14b52', 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 0, + 'upload_date': '20190320', + 'series': 'Mysteeritarinat', + 'duration': 1324, }, 'expected_warnings': [ 'HTTP Error 502: Bad Gateway', @@ -126,14 +135,30 @@ class RuutuIE(InfoExtractor): _API_BASE = 'https://gatling.nelonenmedia.fi' @classmethod - def _extract_url(cls, webpage): + def _extract_urls(cls, webpage): + # nelonen.fi settings = try_call( lambda: json.loads(re.search( r'jQuery\.extend\(Drupal\.settings, ({.+?})\);', webpage).group(1), strict=False)) - video_id = traverse_obj(settings, ( - 'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value')) - if video_id: - return f'http://www.ruutu.fi/video/{video_id}' + if settings: + video_id = traverse_obj(settings, ( + 'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value')) + if video_id: + return [f'http://www.ruutu.fi/video/{video_id}'] + # hs.fi and is.fi + settings = try_call( + lambda: json.loads(re.search( + '(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', + webpage).group(1), strict=False)) + if settings: + video_ids = set(traverse_obj(settings, ( + 'props', 'pageProps', 'page', 'assetData', 'splitBody', ..., 'video', 'sourceId')) or []) + if video_ids: + return [f'http://www.ruutu.fi/video/{v}' for v in video_ids] + video_id = traverse_obj(settings, ( + 'props', 'pageProps', 'page', 'assetData', 'mainVideo', 'sourceId')) + if video_id: + return [f'http://www.ruutu.fi/video/{video_id}'] def _real_extract(self, url): video_id = self._match_id(url) @@ -206,10 +231,10 @@ def extract_formats(node): extract_formats(video_xml.find('./Clip')) def pv(name): - node = find_xpath_attr( - video_xml, './Clip/PassthroughVariables/variable', 'name', name) - if node is not None: - return node.get('value') + value = try_call(lambda: find_xpath_attr( + video_xml, './Clip/PassthroughVariables/variable', 'name', name).get('value')) + if value != 'NA': + return value or None if not formats: if (not self.get_param('allow_unplayable_formats') @@ -234,6 +259,6 @@ def pv(name): 'series': pv('series_name'), 'season_number': int_or_none(pv('season_number')), 'episode_number': int_or_none(pv('episode_number')), - 'categories': themes.split(',') if themes else [], + 'categories': themes.split(',') if themes else None, 'formats': formats, }