From 63da13e8291e2debce073aea63bcfb710c0f5f1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 10 Oct 2013 19:37:17 +0200 Subject: [PATCH] Add an extractor for faz.net (closes #1582) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/faz.py | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 youtube_dl/extractor/faz.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f44468d358..a4d0c71ec3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -34,6 +34,7 @@ from .escapist import EscapistIE from .exfm import ExfmIE from .facebook import FacebookIE +from .faz import FazIE from .fktv import ( FKTVIE, FKTVPosteckeIE, diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py new file mode 100644 index 0000000000..deaa4ed2d9 --- /dev/null +++ b/youtube_dl/extractor/faz.py @@ -0,0 +1,60 @@ +# encoding: utf-8 +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + clean_html, + get_element_by_attribute, +) + + +class FazIE(InfoExtractor): + IE_NAME = u'faz.net' + _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P\d+).html' + + _TEST = { + u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html', + u'file': u'12610585.mp4', + u'info_dict': { + u'title': u'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher', + u'description': u'md5:1453fbf9a0d041d985a47306192ea253', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + self.to_screen(video_id) + webpage = self._download_webpage(url, video_id) + config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage, + u'config xml url') + config_xml = self._download_webpage(config_xml_url, video_id, + u'Downloading config xml') + config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) + + encodings = config.find('ENCODINGS') + formats = [] + for code in ['LOW', 'HIGH', 'HQ']: + encoding = encodings.find(code) + if encoding is None: + continue + encoding_url = encoding.find('FILENAME').text + formats.append({ + 'url': encoding_url, + 'ext': determine_ext(encoding_url), + 'format_id': code.lower(), + }) + + descr_html = get_element_by_attribute('class', 'Content Copy', webpage) + info = { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'description': clean_html(descr_html), + 'thumbnail': config.find('STILL/STILL_BIG').text, + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info