From 588b82bbf8c90981c54f180eca40e6c743f8f89f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 May 2015 03:32:53 +0600 Subject: [PATCH] [tv2:article] Add extractor (Closes #5724) --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/tv2.py | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fb4f63ca3a..6f8c261d5b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -572,7 +572,10 @@ from .tunein import TuneInIE from .turbo import TurboIE from .tutv import TutvIE -from .tv2 import TV2IE +from .tv2 import ( + TV2IE, + TV2ArticleIE, +) from .tv4 import TV4IE from .tvigle import TvigleIE from .tvp import TvpIE, TvpSeriesIE diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 2dcc0e971e..fa338b936d 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -1,12 +1,15 @@ # encoding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, float_or_none, parse_iso8601, + remove_end, ) @@ -91,3 +94,33 @@ def _real_extract(self, url): 'categories': categories, 'formats': formats, } + + +class TV2ArticleIE(InfoExtractor): + _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', + 'info_dict': { + 'id': '6930542', + 'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret', + 'description': 'md5:339573779d3eea3542ffe12006190954', + }, + 'playlist_count': 2, + }, { + 'url': 'http://www.tv2.no/a/6930542', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2') + for video_id in re.findall(r'data-assetid="(\d+)"', webpage)] + + title = remove_end(self._og_search_title(webpage), ' - TV2.no') + description = remove_end(self._og_search_description(webpage), ' - TV2.no') + + return self.playlist_result(entries, playlist_id, title, description)