From 9e96dc8b3561c1e6e62ce6a34efba485e5e49054 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 01:36:59 -0500 Subject: [PATCH 01/14] Support BBC News (bbc.com/news) --- docs/supportedsites.md | 1 + youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bbcnews.py | 162 +++++++++++++++++++++++++++++++ 3 files changed, 164 insertions(+) create mode 100644 youtube_dl/extractor/bbcnews.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 220e52b988..d4ccbbd3a6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -50,6 +50,7 @@ # Supported sites - **Bandcamp** - **Bandcamp:album** - **bbc.co.uk**: BBC iPlayer + - **bbc.com**: BBC news videos - **BeatportPro** - **Beeg** - **BehindKink** diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6fdaf90b2d..51d2d20e90 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -36,6 +36,7 @@ from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE +from .bbcnews import BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py new file mode 100644 index 0000000000..b10e30a818 --- /dev/null +++ b/youtube_dl/extractor/bbcnews.py @@ -0,0 +1,162 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) +from ..compat import compat_HTTPError +import re +from .bbccouk import BBCCoUkIE + +class BBCNewsIE(BBCCoUkIE): + IE_NAME = 'bbc.com' + IE_DESC = 'BBC news' + _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + + _TESTS = [{ + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + }, + 
'playlist_count': 2, + },{ + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + }, + 'playlist_count': 9, + },{ + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'note': 'Video', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + }] + + def _duration_str2int(self, str): + if not str: + return None + ret = re.match(r'^\d+$', str) + if ret: + return int(ret.group(0)) + ret = re.match(r'PT((?P\d+)H)?((?P\d+)M)?(?P\d+)S$', str) + if ret: + total=int(ret.group('s')) + if ret.group('m'): + total+=(int(ret.group('m'))*60) + if ret.group('h'): + total+=(int(ret.group('h'))*3600) + return total + return None + + def _download_media_selector(self, programme_id): + # bbc news uses http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ not + # http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ + # Could add third urlspec arg to BBCCoUkIE._download_media_selector instead of duplicating it + + try: + media_selection = self._download_xml( + 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' % programme_id, + programme_id, 'Downloading media selection XML') + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) + else: + raise + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif 
kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + + return formats, subtitles + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) + + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + + pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) + if pubdate: + pubdate = pubdate.replace('-','') + + ret = [] + # works with bbc.com/news/something-something-123456 articles + matches = re.findall(r"data-media-meta='({[^']+})'", webpage) + if not matches: + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + if not matches: + raise ExtractorError('No video found', expected=True) + + for ent in matches: + jent = self._parse_json(ent,list_id) + + programme_id = jent.get('externalId',None) + xml_url = jent.get('href', None) + + title = jent['caption'] + duration = self._duration_str2int(jent.get('duration',None)) + description = list_title + ' - ' + jent.get('caption','') + thumbnail = None + if jent.has_key('image'): + thumbnail=jent['image'].get('href',None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + elif xml_url: + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = 
self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) + else: + raise ExtractorError('data-media-meta entry has no externalId or href value.') + + self._sort_formats(formats) + + ret.append( { + 'id': programme_id, + 'uploader': 'BBC News', + 'upload_date': pubdate, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } ) + + if len(ret) > 0: + return self.playlist_result(ret, list_id, list_title) + raise ExtractorError('No video found', expected=True) From a8b081a0523c412fd4e01d5cddec7ae382c4793e Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 01:52:25 -0500 Subject: [PATCH 02/14] BBCNewsIE: eliminate redundant function. BBCCoUkIE._download_media_selector: use class variable instead of hardcoded string for mediaselector_url template. --- youtube_dl/extractor/bbccouk.py | 4 +++- youtube_dl/extractor/bbcnews.py | 42 ++------------------------------- 2 files changed, 5 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 0305f88b53..dcc5fc2fad 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -15,6 +15,8 @@ class BBCCoUkIE(InfoExtractor): IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' + _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -277,7 +279,7 @@ def _extract_text(p): def _download_media_selector(self, programme_id): try: media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id, + self.mediaselector_url % programme_id, programme_id, 
'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py index b10e30a818..9bb8d42e6e 100644 --- a/youtube_dl/extractor/bbcnews.py +++ b/youtube_dl/extractor/bbcnews.py @@ -14,6 +14,8 @@ class BBCNewsIE(BBCCoUkIE): IE_DESC = 'BBC news' _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + _TESTS = [{ 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { @@ -59,46 +61,6 @@ def _duration_str2int(self, str): return total return None - def _download_media_selector(self, programme_id): - # bbc news uses http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ not - # http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ - # Could add third urlspec arg to BBCCoUkIE._download_media_selector instead of duplicating it - - try: - media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' % programme_id, - programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) - else: - raise - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - 
formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - return formats, subtitles - def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) From d5552a3477a0970f4aaaa746ce07c816267bb9cf Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 06:25:50 -0500 Subject: [PATCH 03/14] bbcnews: Switch to parse_duration, revert change to docs/supportedsites.md --- docs/supportedsites.md | 1 - youtube_dl/extractor/bbcnews.py | 19 ++----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d4ccbbd3a6..220e52b988 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -50,7 +50,6 @@ # Supported sites - **Bandcamp** - **Bandcamp:album** - **bbc.co.uk**: BBC iPlayer - - **bbc.com**: BBC news videos - **BeatportPro** - **Beeg** - **BehindKink** diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py index 9bb8d42e6e..fd4a5e38fb 100644 --- a/youtube_dl/extractor/bbcnews.py +++ b/youtube_dl/extractor/bbcnews.py @@ -3,6 +3,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, + parse_duration, int_or_none, ) from ..compat import compat_HTTPError @@ -45,22 +46,6 @@ class BBCNewsIE(BBCCoUkIE): } }] - def _duration_str2int(self, str): - if not str: - return None - ret = re.match(r'^\d+$', str) - if ret: - return int(ret.group(0)) - ret = re.match(r'PT((?P\d+)H)?((?P\d+)M)?(?P\d+)S$', str) - if ret: - total=int(ret.group('s')) - if ret.group('m'): - total+=(int(ret.group('m'))*60) - if ret.group('h'): - total+=(int(ret.group('h'))*3600) - return total - return None - def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) @@ -88,7 +73,7 @@ def _real_extract(self, url): xml_url = jent.get('href', None) title = jent['caption'] - duration = 
self._duration_str2int(jent.get('duration',None)) + duration = parse_duration(jent.get('duration',None)) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): From 10273d6e0846cd8f3762e3777712d5cd2a0cafcd Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:22:13 -0500 Subject: [PATCH 04/14] toss new stuff into old file --- youtube_dl/extractor/__init__.py | 3 +- youtube_dl/extractor/bbccouk.py | 101 ++++++++++++++++++++++++++++ youtube_dl/extractor/bbcnews.py | 109 ------------------------------- 3 files changed, 102 insertions(+), 111 deletions(-) delete mode 100644 youtube_dl/extractor/bbcnews.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 51d2d20e90..f9f7bdfafc 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -35,8 +35,7 @@ from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE -from .bbcnews import BBCNewsIE +from .bbccouk import BBCCoUkIE, BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index dcc5fc2fad..ea682fb6f2 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -5,9 +5,11 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, + parse_duration, int_or_none, ) from ..compat import compat_HTTPError +import re class BBCCoUkIE(InfoExtractor): @@ -394,3 +396,102 @@ def _real_extract(self, url): 'formats': formats, 'subtitles': subtitles, } + + +class BBCNewsIE(BBCCoUkIE): + IE_NAME = 'bbc.com' + IE_DESC = 'BBC news' + _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + + _TESTS = [{ + 'url': 
'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + }, + 'playlist_count': 2, + },{ + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + }, + 'playlist_count': 9, + },{ + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'note': 'Video', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) + + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + + pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) + if pubdate: + pubdate = pubdate.replace('-','') + + ret = [] + # works with bbc.com/news/something-something-123456 articles + matches = re.findall(r"data-media-meta='({[^']+})'", webpage) + if not matches: + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + if not matches: + raise ExtractorError('No video found', expected=True) + + for ent in matches: + jent = self._parse_json(ent,list_id) + + programme_id = jent.get('externalId',None) + xml_url = jent.get('href', None) + + title = jent['caption'] + duration = parse_duration(jent.get('duration',None)) + description = list_title + ' - ' + jent.get('caption','') + thumbnail = None + if jent.has_key('image'): + 
thumbnail=jent['image'].get('href',None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + elif xml_url: + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) + else: + raise ExtractorError('data-media-meta entry has no externalId or href value.') + + self._sort_formats(formats) + + ret.append( { + 'id': programme_id, + 'uploader': 'BBC News', + 'upload_date': pubdate, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } ) + + if len(ret) > 0: + return self.playlist_result(ret, list_id, list_title) + raise ExtractorError('No video found', expected=True) diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py deleted file mode 100644 index fd4a5e38fb..0000000000 --- a/youtube_dl/extractor/bbcnews.py +++ /dev/null @@ -1,109 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_duration, - int_or_none, -) -from ..compat import compat_HTTPError -import re -from .bbccouk import BBCCoUkIE - -class BBCNewsIE(BBCCoUkIE): - IE_NAME = 'bbc.com' - IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' - - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' - - _TESTS = [{ - 'url': 'http://www.bbc.com/news/world-europe-32668511', - 'info_dict': { - 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade despite Western boycott', - }, - 'playlist_count': 2, - },{ - 'url': 'http://www.bbc.com/news/business-28299555', - 'info_dict': { - 'id': 
'business-28299555', - 'title': 'Farnborough Airshow: Video highlights', - }, - 'playlist_count': 9, - },{ - 'url': 'http://www.bbc.com/news/world-europe-32041533', - 'note': 'Video', - 'info_dict': { - 'id': 'p02mprgb', - 'ext': 'mp4', - 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'duration': 47, - }, - 'params': { - 'skip_download': True, - } - }] - - def _real_extract(self, url): - list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) - - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') - - pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) - if pubdate: - pubdate = pubdate.replace('-','') - - ret = [] - # works with bbc.com/news/something-something-123456 articles - matches = re.findall(r"data-media-meta='({[^']+})'", webpage) - if not matches: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - if not matches: - raise ExtractorError('No video found', expected=True) - - for ent in matches: - jent = self._parse_json(ent,list_id) - - programme_id = jent.get('externalId',None) - xml_url = jent.get('href', None) - - title = jent['caption'] - duration = parse_duration(jent.get('duration',None)) - description = list_title + ' - ' + jent.get('caption','') - thumbnail = None - if jent.has_key('image'): - thumbnail=jent['image'].get('href',None) - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for 
externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) - else: - raise ExtractorError('data-media-meta entry has no externalId or href value.') - - self._sort_formats(formats) - - ret.append( { - 'id': programme_id, - 'uploader': 'BBC News', - 'upload_date': pubdate, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } ) - - if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) - raise ExtractorError('No video found', expected=True) From 75ab0ebcf593ec91a46d83e69854ffa313d33309 Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:24:02 -0500 Subject: [PATCH 05/14] no .get('..',None) --- youtube_dl/extractor/bbccouk.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index ea682fb6f2..de4d7f9c00 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -457,15 +457,15 @@ def _real_extract(self, url): for ent in matches: jent = self._parse_json(ent,list_id) - programme_id = jent.get('externalId',None) - xml_url = jent.get('href', None) + programme_id = jent.get('externalId') + xml_url = jent.get('href') title = jent['caption'] - duration = parse_duration(jent.get('duration',None)) + duration = parse_duration(jent.get('duration') description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): - thumbnail=jent['image'].get('href',None) + thumbnail=jent['image'].get('href') if programme_id: formats, subtitles = self._download_media_selector(programme_id) From 77c975f536befbe89bf718e86282958d391d9ffe Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:28:14 -0500 Subject: [PATCH 06/14] typofix --- youtube_dl/extractor/bbccouk.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index de4d7f9c00..f9404f3fa8 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -461,7 +461,7 @@ def _real_extract(self, url): xml_url = jent.get('href') title = jent['caption'] - duration = parse_duration(jent.get('duration') + duration = parse_duration(jent.get('duration')) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): From de939d89eb83c851c6db66933e5fc0c401a1a679 Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:04:46 -0500 Subject: [PATCH 07/14] Support BBC news in other languages, non-mediaselector videos --- youtube_dl/extractor/bbccouk.py | 87 +++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index f9404f3fa8..72e20857bf 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -401,7 +401,7 @@ def _real_extract(self, url): class BBCNewsIE(BBCCoUkIE): IE_NAME = 'bbc.com' IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P[^/]+)$' mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' @@ -432,56 +432,115 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } + },{ + 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', + 'note': 'Video', + 'info_dict': { + 'id': 'NA', + 'ext': 'mp4', + 'title': 'YPG - Tel Abyad..n tamam. 
kontrol.m.zde', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + },{ + 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', + 'note': 'Video', + 'info_dict': { + 'id': '39275083', + 'ext': 'mp4', + 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', + 'duration': 87, + }, + 'params': { + 'skip_download': True, + } }] def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'list title') pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) if pubdate: pubdate = pubdate.replace('-','') ret = [] + jsent = [] + # works with bbc.com/news/something-something-123456 articles - matches = re.findall(r"data-media-meta='({[^']+})'", webpage) - if not matches: + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"data-media-meta='({[^']+})'", webpage) + ) + + if len(jsent) == 0: + # http://www.bbc.com/news/video_and_audio/international + # and single-video articles + masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) + if masset: + jmasset = self._parse_json(masset,list_id) + for key, val in jmasset.get('videos',{}).items(): + for skey, sval in val.items(): + sval['id'] = skey + jsent.append(sval) + + if len(jsent) == 0: # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} # in http://www.bbc.com/news/video_and_audio/international - matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - if not matches: + # prone to breaking if entries have sourceFiles list + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + ) + + if len(jsent) == 
0: raise ExtractorError('No video found', expected=True) - for ent in matches: - jent = self._parse_json(ent,list_id) - + for jent in jsent: programme_id = jent.get('externalId') - xml_url = jent.get('href') + xml_url = jent.get('hxref') + + title = jent.get('caption',list_title) - title = jent['caption'] duration = parse_duration(jent.get('duration')) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): thumbnail=jent['image'].get('href') + formats = [] + subtitles = [] + if programme_id: formats, subtitles = self._download_media_selector(programme_id) + elif jent.has_key('sourceFiles'): + # mediaselector not used at + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu + for key, val in jent['sourceFiles'].items(): + formats.append( { + 'ext': val.get('encoding'), + 'url': val.get('url'), + 'filesize': int(val.get('filesize')), + 'format_id': key + } ) elif xml_url: # Cheap fallback # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') formats, subtitles = self._download_media_selector(programme_id) - else: - raise ExtractorError('data-media-meta entry has no externalId or href value.') + + if len(formats) == 0: + raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') self._sort_formats(formats) ret.append( { - 'id': programme_id, + 'id': jent.get('programme_id',jent.get('id')), 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From 7bb23aeca4e9076528e3d31d501a9a288dcd444c Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:08:13 -0500 Subject: [PATCH 08/14] rename bbccouk.py -> bbc.py --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/{bbccouk.py => bbc.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename 
youtube_dl/extractor/{bbccouk.py => bbc.py} (100%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f9f7bdfafc..a48346e60e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -35,7 +35,7 @@ from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE, BBCNewsIE +from .bbc import BBCCoUkIE, BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbc.py similarity index 100% rename from youtube_dl/extractor/bbccouk.py rename to youtube_dl/extractor/bbc.py From 2a282a3b5f366ba0569bae477d5060329ba254fb Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:11:41 -0500 Subject: [PATCH 09/14] Unbreak breakage that was broken to test breakage --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 72e20857bf..310db9d1db 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -502,7 +502,7 @@ def _real_extract(self, url): for jent in jsent: programme_id = jent.get('externalId') - xml_url = jent.get('hxref') + xml_url = jent.get('href') title = jent.get('caption',list_title) From a9dcf4a860214e37971ab05f27f74bbae65ff8ae Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 23 Jun 2015 01:08:07 -0500 Subject: [PATCH 10/14] Prefer externalId over non-mediaserver-specific hashkey for video id. 
--- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 310db9d1db..fed344ea0b 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -540,7 +540,7 @@ def _real_extract(self, url): self._sort_formats(formats) ret.append( { - 'id': jent.get('programme_id',jent.get('id')), + 'id': jent.get('id') if programme_id == None else programme_id, 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From da92eeae42f556926cb676b3c14e270603b7e38e Mon Sep 17 00:00:00 2001 From: fnord Date: Thu, 25 Jun 2015 00:31:32 -0500 Subject: [PATCH 11/14] Fix tests, description formatting --- youtube_dl/extractor/bbc.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index fed344ea0b..bb671d4731 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -428,6 +428,8 @@ class BBCNewsIE(BBCCoUkIE): 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'duration': 47, + 'upload_date': '20150324', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -438,8 +440,11 @@ class BBCNewsIE(BBCCoUkIE): 'info_dict': { 'id': 'NA', 'ext': 'mp4', - 'title': 'YPG - Tel Abyad..n tamam. 
kontrol.m.zde', + 'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', + 'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', 'duration': 47, + 'upload_date': '20150615', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -450,8 +455,11 @@ class BBCNewsIE(BBCCoUkIE): 'info_dict': { 'id': '39275083', 'ext': 'mp4', - 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', + 'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', + 'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', 'duration': 87, + 'upload_date': '20150619', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -507,7 +515,9 @@ def _real_extract(self, url): title = jent.get('caption',list_title) duration = parse_duration(jent.get('duration')) - description = list_title + ' - ' + jent.get('caption','') + description = list_title + if jent.get('caption'): + description += ' - ' + jent.get('caption') thumbnail = None if jent.has_key('image'): thumbnail=jent['image'].get('href') @@ -539,8 +549,12 @@ def _real_extract(self, url): self._sort_formats(formats) + id = jent.get('id') if programme_id == None else programme_id + if id == None: + id = 'NA' + ret.append( { - 'id': jent.get('id') if programme_id == None else programme_id, + 'id': id, 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From 36da48798a28b8261d2f39f73f2522651d58a364 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:27:50 -0500 Subject: [PATCH 12/14] handle titles and captions set to '' --- youtube_dl/extractor/bbc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 471d865d26..c910eb55af 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -497,11 +497,13 @@ def _real_extract(self, url): programme_id = jent.get('externalId') xml_url = 
jent.get('href') - title = jent.get('caption',list_title) + title = jent.get('caption','') + if title == '': + title = list_title duration = parse_duration(jent.get('duration')) description = list_title - if jent.get('caption'): + if jent.get('caption', '') != '': description += ' - ' + jent.get('caption') thumbnail = None if jent.has_key('image'): From a3bfddfa5ee33cf085b959536f1025c0aa53cc77 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:47:02 -0500 Subject: [PATCH 13/14] bbc.py: correct syntax --- youtube_dl/extractor/bbc.py | 106 ++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index c910eb55af..c8f285165f 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -397,14 +397,14 @@ class BBCNewsIE(BBCCoUkIE): 'title': 'Russia stages massive WW2 parade despite Western boycott', }, 'playlist_count': 2, - },{ + }, { 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', 'title': 'Farnborough Airshow: Video highlights', }, 'playlist_count': 9, - },{ + }, { 'url': 'http://www.bbc.com/news/world-europe-32041533', 'note': 'Video', 'info_dict': { @@ -419,7 +419,7 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } - },{ + }, { 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', 'note': 'Video', 'info_dict': { @@ -434,7 +434,7 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } - },{ + }, { 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'note': 'Video', 'info_dict': { @@ -459,88 +459,88 @@ def _real_extract(self, url): pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) if pubdate: - pubdate = pubdate.replace('-','') + pubdate = pubdate.replace('-', '') ret = [] jsent = [] # works with 
bbc.com/news/something-something-123456 articles jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"data-media-meta='({[^']+})'", webpage) + lambda m: self._parse_json(m, list_id), + re.findall(r"data-media-meta='({[^']+})'", webpage) ) if len(jsent) == 0: - # http://www.bbc.com/news/video_and_audio/international - # and single-video articles - masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) - if masset: - jmasset = self._parse_json(masset,list_id) - for key, val in jmasset.get('videos',{}).items(): - for skey, sval in val.items(): - sval['id'] = skey - jsent.append(sval) + # http://www.bbc.com/news/video_and_audio/international + # and single-video articles + masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) + if masset: + jmasset = self._parse_json(masset, list_id) + for key, val in jmasset.get('videos', {}).items(): + for skey, sval in val.items(): + sval['id'] = skey + jsent.append(sval) if len(jsent) == 0: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - # prone to breaking if entries have sourceFiles list - jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - ) + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + # prone to breaking if entries have sourceFiles list + jsent = map( + lambda m: self._parse_json(m, list_id), + re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + ) if len(jsent) == 0: - raise ExtractorError('No video found', expected=True) + raise ExtractorError('No video found', expected=True) for jent in jsent: programme_id = jent.get('externalId') xml_url = jent.get('href') - title = jent.get('caption','') + title = jent.get('caption', '') if title == '': - 
title = list_title + title = list_title duration = parse_duration(jent.get('duration')) description = list_title if jent.get('caption', '') != '': - description += ' - ' + jent.get('caption') + description += ' - ' + jent.get('caption') thumbnail = None - if jent.has_key('image'): - thumbnail=jent['image'].get('href') + if jent.get('image') is not None: + thumbnail = jent['image'].get('href') formats = [] subtitles = [] if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif jent.has_key('sourceFiles'): - # mediaselector not used at - # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu - for key, val in jent['sourceFiles'].items(): - formats.append( { - 'ext': val.get('encoding'), - 'url': val.get('url'), - 'filesize': int(val.get('filesize')), - 'format_id': key - } ) + formats, subtitles = self._download_media_selector(programme_id) + elif jent.get('sourceFiles') is not None: + # mediaselector not used at + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu + for key, val in jent['sourceFiles'].items(): + formats.append({ + 'ext': val.get('encoding'), + 'url': val.get('url'), + 'filesize': int(val.get('filesize')), + 'format_id': key + }) elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) if 
len(formats) == 0: - raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') - + raise ExtractorError('unsupported json media entry.\n ' + str(jent) + '\n') + self._sort_formats(formats) - id = jent.get('id') if programme_id == None else programme_id - if id == None: - id = 'NA' + id = jent.get('id') if programme_id is None else programme_id + if id is None: + id = 'NA' - ret.append( { + ret.append({ 'id': id, 'uploader': 'BBC News', 'upload_date': pubdate, @@ -550,8 +550,8 @@ def _real_extract(self, url): 'duration': duration, 'formats': formats, 'subtitles': subtitles, - } ) + }) if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) + return self.playlist_result(ret, list_id, list_title) raise ExtractorError('No video found', expected=True) From 9afa1770d1a6835bc8fee48dc86cd1a702d1f67a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 20:21:42 +0600 Subject: [PATCH 14/14] [bbc] Improve playlist extraction, refactor, expand support and document --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/bbc.py | 375 +++++++++++++++++++++---------- 2 files changed, 259 insertions(+), 121 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bc61cbdc56..d77ed3ba25 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -43,7 +43,10 @@ from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbc import BBCCoUkIE, BBCNewsIE +from .bbc import ( + BBCCoUkIE, + BBCIE, +) from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 86327d8ed6..2a0901ee45 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,15 +1,18 @@ +# coding: utf-8 from __future__ import unicode_literals +import re import 
xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( ExtractorError, - parse_duration, + float_or_none, int_or_none, + parse_duration, + parse_iso8601, ) from ..compat import compat_HTTPError -import re class BBCCoUkIE(InfoExtractor): @@ -17,7 +20,7 @@ class BBCCoUkIE(InfoExtractor): IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' + _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' _TESTS = [ { @@ -264,16 +267,21 @@ def _get_subtitles(self, media, programme_id): return subtitles def _download_media_selector(self, programme_id): + return self._download_media_selector_url( + self._MEDIASELECTOR_URL % programme_id, programme_id) + + def _download_media_selector_url(self, url, programme_id=None): try: media_selection = self._download_xml( - self.mediaselector_url % programme_id, - programme_id, 'Downloading media selection XML') + url, programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) else: raise + return self._process_media_selector(media_selection, programme_id) + def _process_media_selector(self, media_selection, programme_id): formats = [] subtitles = None @@ -312,10 +320,21 @@ def _download_playlist(self, playlist_id): raise # fallback to legacy playlist - playlist = self._download_xml( - 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, - playlist_id, 'Downloading legacy playlist XML') + return self._process_legacy_playlist(playlist_id) + def _process_legacy_playlist_url(self, url, display_id): + playlist = self._download_legacy_playlist_url(url, display_id) + return 
self._extract_from_legacy_playlist(playlist, display_id) + + def _process_legacy_playlist(self, playlist_id): + return self._process_legacy_playlist_url( + 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id) + + def _download_legacy_playlist_url(self, url, playlist_id=None): + return self._download_xml( + url, playlist_id, 'Downloading legacy playlist XML') + + def _extract_from_legacy_playlist(self, playlist, playlist_id): no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') if no_items is not None: reason = no_items.get('reason') @@ -335,8 +354,23 @@ def _download_playlist(self, playlist_id): continue title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text - programme_id = item.get('identifier') + + def get_programme_id(item): + def get_from_attributes(item): + for p in('identifier', 'group'): + value = item.get(p) + if value and re.match(r'^[pb][\da-z]{7}$', value): + return value + get_from_attributes(item) + mediator = item.find('./{http://bbc.co.uk/2008/emp/playlist}mediator') + if mediator is not None: + return get_from_attributes(mediator) + + programme_id = get_programme_id(item) duration = int_or_none(item.get('duration')) + # TODO: programme_id can be None and media items can be incorporated right inside + # playlist's item (e.g. 
http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + # as f4m and m3u8 formats, subtitles = self._download_media_selector(programme_id) return programme_id, title, description, duration, formats, subtitles @@ -383,175 +417,276 @@ def _real_extract(self, url): } -class BBCNewsIE(BBCCoUkIE): - IE_NAME = 'bbc.com' - IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P[^/]+)$' +class BBCIE(BBCCoUkIE): + IE_NAME = 'bbc' + IE_DESC = 'BBC' + _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + # fails with notukerror for some videos + #_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s' _TESTS = [{ + # article with multiple videos embedded with data-media-meta containing + # playlist.sxml, externalId and no direct video links 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', 'title': 'Russia stages massive WW2 parade despite Western boycott', + 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', }, 'playlist_count': 2, }, { + # article with multiple videos embedded with data-media-meta (more videos) 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', 'title': 'Farnborough Airshow: Video highlights', + 'description': 'BBC reports and video highlights at the Farnborough Airshow.', }, 'playlist_count': 9, + 'skip': 'Save time', }, { + # single video embedded with mediaAssetPage.init() 'url': 'http://www.bbc.com/news/world-europe-32041533', - 'note': 'Video', 'info_dict': { 'id': 'p02mprgb', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'Germanwings plane crash site in aerial video - Aerial footage 
showed the site of the crash in the Alps - courtesy BFM TV', 'duration': 47, + 'timestamp': 1427219242, 'upload_date': '20150324', - 'uploader': 'BBC News', }, 'params': { + # rtmp download 'skip_download': True, } }, { + # article with single video embedded with data-media-meta containing + # direct video links (for now these are extracted) and playlist.xml (with + # media items as f4m and m3u8 - currently unsupported) 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', - 'note': 'Video', 'info_dict': { - 'id': 'NA', + 'id': '150615_telabyad_kentin_cogu', 'ext': 'mp4', - 'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', - 'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', + 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", 'duration': 47, + 'timestamp': 1434397334, 'upload_date': '20150615', - 'uploader': 'BBC News', }, 'params': { 'skip_download': True, } }, { + # single video embedded with mediaAssetPage.init() (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', - 'note': 'Video', 'info_dict': { - 'id': '39275083', + 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', - 'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', - 'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', + 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', 'duration': 87, + 'timestamp': 1434713142, 'upload_date': '20150619', - 'uploader': 'BBC News', }, 'params': { 'skip_download': True, } + }, { + # single video story with digitalData + 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', + 'info_dict': { + 'id': 'p02q6gc4', + 'ext': 'flv', + 'title': 'Sri Lanka’s spicy secret', + 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly 
distinct slice of Tamil culture.', + 'timestamp': 1437674293, + 'upload_date': '20150723', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video story without digitalData + 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', + 'info_dict': { + 'id': 'p018zqqg', + 'ext': 'flv', + 'title': 'Hyundai Santa Fe Sport: Rock star', + 'description': 'md5:b042a26142c4154a6e472933cf20793d', + 'timestamp': 1368473503, + 'upload_date': '20130513', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video with playlist.sxml URL + 'url': 'http://www.bbc.com/sport/0/football/33653409', + 'info_dict': { + 'id': 'p02xycnp', + 'ext': 'flv', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:398fca0e2e701c609d726e034fa1fc89', + 'duration': 140, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video with playlist URL from weather section + 'url': 'http://www.bbc.com/weather/features/33601775', + 'only_matching': True, + }, { + # custom redirection to www.bbc.com + 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if BBCCoUkIE.suitable(url) else super(BBCIE, cls).suitable(url) + + def _extract_from_media_meta(self, media_meta, video_id): + # Direct links to media in media metadata (e.g. 
+ # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml + source_files = media_meta.get('sourceFiles') + if source_files: + return [{ + 'url': f['url'], + 'format_id': format_id, + 'ext': f.get('encoding'), + 'tbr': float_or_none(f.get('bitrate'), 1000), + 'filesize': int_or_none(f.get('filesize')), + } for format_id, f in source_files.items() if f.get('url')], [] + + programme_id = media_meta.get('externalId') + if programme_id: + return self._download_media_selector(programme_id) + + # Process playlist.sxml as legacy playlist + href = media_meta.get('href') + if href: + playlist = self._download_legacy_playlist_url(href) + _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id) + return formats, subtitles + + return [], [] + def _real_extract(self, url): - list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) + playlist_id = self._match_id(url) - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'list title') + webpage = self._download_webpage(url, playlist_id) - pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) - if pubdate: - pubdate = pubdate.replace('-', '') - - ret = [] - jsent = [] - - # works with bbc.com/news/something-something-123456 articles - jsent = map( - lambda m: self._parse_json(m, list_id), - re.findall(r"data-media-meta='({[^']+})'", webpage) - ) - - if len(jsent) == 0: - # http://www.bbc.com/news/video_and_audio/international - # and single-video articles - masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) - if masset: - jmasset = self._parse_json(masset, list_id) - for key, val in jmasset.get('videos', {}).items(): - for skey, sval in val.items(): - sval['id'] = skey - jsent.append(sval) - - if len(jsent) == 0: - # stubbornly generic extractor for {json 
with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - # prone to breaking if entries have sourceFiles list - jsent = map( - lambda m: self._parse_json(m, list_id), - re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - ) - - if len(jsent) == 0: - raise ExtractorError('No video found', expected=True) - - for jent in jsent: - programme_id = jent.get('externalId') - xml_url = jent.get('href') - - title = jent.get('caption', '') - if title == '': - title = list_title - - duration = parse_duration(jent.get('duration')) - description = list_title - if jent.get('caption', '') != '': - description += ' - ' + jent.get('caption') - thumbnail = None - if jent.get('image') is not None: - thumbnail = jent['image'].get('href') - - formats = [] - subtitles = [] - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif jent.get('sourceFiles') is not None: - # mediaselector not used at - # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu - for key, val in jent['sourceFiles'].items(): - formats.append({ - 'ext': val.get('encoding'), - 'url': val.get('url'), - 'filesize': int(val.get('filesize')), - 'format_id': key - }) - elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) - - if len(formats) == 0: - raise ExtractorError('unsupported json media entry.\n ' + str(jent) + '\n') + timestamp = parse_iso8601(self._search_regex( + [r'"datePublished":\s*"([^"]+)', + r']+property="article:published_time"[^>]+content="([^"]+)"', + r'itemprop="datePublished"[^>]+datetime="([^"]+)"'], + webpage, 'date', default=None)) + # single video with playlist.sxml URL (e.g. 
http://www.bbc.com/sport/0/football/33653409) + playlist = self._search_regex( + r']+name="playlist"[^>]+value="([^"]+)"', + webpage, 'playlist', default=None) + if playlist: + programme_id, title, description, duration, formats, subtitles = \ + self._process_legacy_playlist_url(playlist, playlist_id) self._sort_formats(formats) - - id = jent.get('id') if programme_id is None else programme_id - if id is None: - id = 'NA' - - ret.append({ - 'id': id, - 'uploader': 'BBC News', - 'upload_date': pubdate, + return { + 'id': programme_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + programme_id = self._search_regex( + [r'data-video-player-vpid="([\da-z]{8})"', + r']+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], + webpage, 'vpid', default=None) + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + # digitalData may be missing (e.g. 
http://www.bbc.com/autos/story/20130513-hyundais-rock-star) + digital_data = self._parse_json( + self._search_regex( + r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'), + programme_id, fatal=False) + page_info = digital_data.get('page', {}).get('pageInfo', {}) + title = page_info.get('pageName') or self._og_search_title(webpage) + description = page_info.get('description') or self._og_search_description(webpage) + timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + playlist_title = self._html_search_regex( + r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title') + playlist_description = self._og_search_description(webpage) + + # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) + medias = list(filter(None, map( + lambda s: self._parse_json(s, playlist_id, fatal=False), + re.findall(r"data-media-meta='({[^']+})'", webpage)))) + + if not medias: + # Single video article (e.g. 
http://www.bbc.com/news/video_and_audio/international) + media_asset_page = self._parse_json( + self._search_regex( + r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset'), + playlist_id) + medias = [] + for video in media_asset_page.get('videos', {}).values(): + medias.extend(video.values()) + + entries = [] + for num, media_meta in enumerate(medias, start=1): + formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id) + if not formats: + continue + self._sort_formats(formats) + + video_id = media_meta.get('externalId') + if not video_id: + video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num) + + title = media_meta.get('caption') + if not title: + title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num) + + duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration')) + + images = [] + for image in media_meta.get('images', {}).values(): + images.extend(image.values()) + if 'image' in media_meta: + images.append(media_meta['image']) + + thumbnails = [{ + 'url': image.get('href'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in images] + + entries.append({ + 'id': video_id, + 'title': title, + 'thumbnails': thumbnails, + 'duration': duration, + 'timestamp': timestamp, 'formats': formats, 'subtitles': subtitles, }) - if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) - raise ExtractorError('No video found', expected=True) + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)