From 67dfbc0cb92a19eda2981528b1456bdc0e3cb805 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Wed, 7 Aug 2013 18:42:40 +0200 Subject: [PATCH 01/17] Added exceptions for the subtitle and video types in .gitignore --- .gitignore | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ca4e8f3532..fca34b8baa 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ build/ dist/ MANIFEST README.txt +README.md youtube-dl.1 youtube-dl.bash-completion youtube-dl @@ -17,4 +18,10 @@ youtube-dl.tar.gz .coverage cover/ updates_key.pem -*.egg-info \ No newline at end of file +*.egg-info +*.srt +*.sbv +*.vtt +*.flv +*.mp4 +*.part From 5898e282726bc2f54fc52fe425c389226e31a797 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Wed, 7 Aug 2013 18:48:24 +0200 Subject: [PATCH 02/17] Fixed small type issue --- youtube_dl/YoutubeDL.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e69d844b8a..beed79fd04 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -492,7 +492,8 @@ def process_info(self, info_dict): # that way it will silently go on when used with unsupporting IE subtitles = info_dict['subtitles'] sub_format = self.params.get('subtitlesformat') - for sub_lang in subtitles.keys(): + + for sub_lang in subtitles: sub = subtitles[sub_lang] if sub is None: continue From 953e32b2c1be077e65bba844010a5a2707af2e2b Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Wed, 7 Aug 2013 18:59:11 +0200 Subject: [PATCH 03/17] [dailymotion] Added support for subtitles + new InfoExtractor for generic subtitle download. The idea is that all subtitle downloaders must descend from SubtitlesIE and implement only three basic methods to achieve the complete subtitle download functionality. This will allow to reduce the code in YoutubeIE once it is rewritten. 
--- test/test_dailymotion_subtitles.py | 96 +++++++++++++++++++++++++++++ youtube_dl/__init__.py | 10 +-- youtube_dl/extractor/dailymotion.py | 67 ++++++++++++++++++-- youtube_dl/extractor/subtitles.py | 80 ++++++++++++++++++++++++ 4 files changed, 242 insertions(+), 11 deletions(-) create mode 100644 test/test_dailymotion_subtitles.py create mode 100644 youtube_dl/extractor/subtitles.py diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py new file mode 100644 index 0000000000..f63426a185 --- /dev/null +++ b/test/test_dailymotion_subtitles.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +import sys +import unittest +import json +import io +import hashlib + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import DailymotionIE +from youtube_dl.utils import * +from helper import FakeYDL + +md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() +TEST_URL = 'http://www.dailymotion.com/video/xczg00' + +class TestDailymotionSubtitles(unittest.TestCase): + def setUp(self): + DL = FakeYDL() + DL.params['allsubtitles'] = False + DL.params['writesubtitles'] = False + DL.params['subtitlesformat'] = 'srt' + DL.params['listsubtitles'] = False + def test_no_subtitles(self): + DL = FakeYDL() + DL.params['writesubtitles'] = False + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + subtitles = info_dict[0]['subtitles'] + self.assertEqual(subtitles, None) + def test_subtitles(self): + DL = FakeYDL() + DL.params['writesubtitles'] = True + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + sub = info_dict[0]['subtitles']['en'] + self.assertEqual(md5(sub), '976553874490cba125086bbfea3ff76f') + def test_subtitles_fr(self): + DL = FakeYDL() + DL.params['writesubtitles'] = True + DL.params['subtitleslang'] = 'fr' + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + sub = info_dict[0]['subtitles']['fr'] + self.assertEqual(md5(sub), 
'594564ec7d588942e384e920e5341792') + def test_onlysubtitles(self): + DL = FakeYDL() + DL.params['writesubtitles'] = True + DL.params['onlysubtitles'] = True + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + sub = info_dict[0]['subtitles']['en'] + self.assertEqual(md5(sub), '976553874490cba125086bbfea3ff76f') + def test_allsubtitles(self): + DL = FakeYDL() + DL.params['allsubtitles'] = True + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + subtitles = info_dict[0]['subtitles'] + self.assertEqual(len(subtitles.keys()), 5) + # def test_subtitles_sbv_format(self): + # DL = FakeYDL() + # DL.params['writesubtitles'] = True + # DL.params['subtitlesformat'] = 'sbv' + # IE = DailymotionIE(DL) + # info_dict = IE.extract(TEST_URL) + # sub = info_dict[0]['subtitles'][0] + # self.assertEqual(md5(sub), '13aeaa0c245a8bed9a451cb643e3ad8b') + # def test_subtitles_vtt_format(self): + # DL = FakeYDL() + # DL.params['writesubtitles'] = True + # DL.params['subtitlesformat'] = 'vtt' + # IE = DailymotionIE(DL) + # info_dict = IE.extract(TEST_URL) + # sub = info_dict[0]['subtitles'][0] + # self.assertEqual(md5(sub), '356cdc577fde0c6783b9b822e7206ff7') + def test_list_subtitles(self): + DL = FakeYDL() + DL.params['listsubtitles'] = True + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + self.assertEqual(info_dict, None) + def test_automatic_captions(self): + DL = FakeYDL() + DL.params['writeautomaticsub'] = True + DL.params['subtitleslang'] = 'en' + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + sub = info_dict[0]['subtitles'] + self.assertTrue(len(sub) == 0) + +if __name__ == '__main__': + unittest.main() diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index eb23c53a57..c4d595e1c1 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -187,22 +187,22 @@ def _find_term_columns(): action='store_true', dest='listformats', help='list all available formats (currently youtube only)') 
video_format.add_option('--write-sub', '--write-srt', action='store_true', dest='writesubtitles', - help='write subtitle file (currently youtube only)', default=False) + help='write subtitle file', default=False) video_format.add_option('--write-auto-sub', '--write-automatic-sub', action='store_true', dest='writeautomaticsub', - help='write automatic subtitle file (currently youtube only)', default=False) + help='write automatic subtitle file (youtube only)', default=False) video_format.add_option('--only-sub', action='store_true', dest='skip_download', help='[deprecated] alias of --skip-download', default=False) video_format.add_option('--all-subs', action='store_true', dest='allsubtitles', - help='downloads all the available subtitles of the video (currently youtube only)', default=False) + help='downloads all the available subtitles of the video', default=False) video_format.add_option('--list-subs', action='store_true', dest='listsubtitles', - help='lists all available subtitles for the video (currently youtube only)', default=False) + help='lists all available subtitles for the video', default=False) video_format.add_option('--sub-format', action='store', dest='subtitlesformat', metavar='FORMAT', - help='subtitle format [srt/sbv/vtt] (default=srt) (currently youtube only)', default='srt') + help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt') video_format.add_option('--sub-lang', '--srt-lang', action='store', dest='subtitleslang', metavar='LANG', help='language of the subtitles to download (optional) use IETF language tags like \'en\'') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 9bf7a28ca8..eb2322d547 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,14 +1,49 @@ import re import json +import itertools +import socket from .common import InfoExtractor +from .subtitles import SubtitlesIE + from ..utils import ( + compat_http_client, + 
compat_urllib_error, compat_urllib_request, + compat_str, + get_element_by_attribute, + get_element_by_id, ExtractorError, ) -class DailymotionIE(InfoExtractor): + +class DailyMotionSubtitlesIE(SubtitlesIE): + + def _get_available_subtitles(self, video_id): + request = compat_urllib_request.Request('https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id) + try: + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) + return {} + info = json.loads(sub_list) + if (info['total'] > 0): + sub_lang_list = dict((l['language'], l['url']) for l in info['list']) + return sub_lang_list + self._downloader.report_warning(u'video doesn\'t have subtitles') + return {} + + def _get_subtitle_url(self, sub_lang, sub_name, video_id, format): + sub_lang_list = self._get_available_subtitles(video_id) + return sub_lang_list[sub_lang] + + def _request_automatic_caption(self, video_id, webpage): + self._downloader.report_warning(u'Automatic Captions not supported by dailymotion') + return {} + + +class DailymotionIE(DailyMotionSubtitlesIE): #,InfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' @@ -18,7 +53,7 @@ class DailymotionIE(InfoExtractor): u'file': u'x33vw9.mp4', u'md5': u'392c4b85a60a90dc4792da41ce3144eb', u'info_dict': { - u"uploader": u"Alex and Van .", + u"uploader": u"Alex and Van .", u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" } } @@ -57,17 +92,36 @@ def _real_extract(self, url): # TODO: support choosing qualities - for key in ['stream_h264_hd1080_url','stream_h264_hd_url', - 'stream_h264_hq_url','stream_h264_url', + for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url', + 'stream_h264_hq_url', 'stream_h264_url', 
'stream_h264_ld_url']: - if info.get(key):#key in info and info[key]: + if info.get(key): # key in info and info[key]: max_quality = key - self.to_screen(u'Using %s' % key) + self.to_screen(u'%s: Using %s' % (video_id, key)) break else: raise ExtractorError(u'Unable to extract video URL') video_url = info[max_quality] + # subtitles + video_subtitles = None + video_webpage = None + + if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): + video_subtitles = self._extract_subtitles(video_id) + elif self._downloader.params.get('writeautomaticsub', False): + video_subtitles = self._request_automatic_caption(video_id, video_webpage) + + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id) + return + + if 'length_seconds' not in info: + self._downloader.report_warning(u'unable to extract video duration') + video_duration = '' + else: + video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) + return [{ 'id': video_id, 'url': video_url, @@ -75,5 +129,6 @@ def _real_extract(self, url): 'upload_date': video_upload_date, 'title': self._og_search_title(webpage), 'ext': video_extension, + 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'] }] diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py new file mode 100644 index 0000000000..89864e5d78 --- /dev/null +++ b/youtube_dl/extractor/subtitles.py @@ -0,0 +1,80 @@ +import socket + +from .common import InfoExtractor + +from ..utils import ( + compat_http_client, + compat_urllib_error, + compat_urllib_request, + compat_str, +) + + +class SubtitlesIE(InfoExtractor): + + def report_video_subtitles_available(self, video_id, sub_lang_list): + """Report available subtitles.""" + sub_lang = ",".join(list(sub_lang_list.keys())) + self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) + + def _list_available_subtitles(self, video_id): + 
sub_lang_list = self._get_available_subtitles(video_id) + self.report_video_subtitles_available(video_id, sub_lang_list) + + def _extract_subtitles(self, video_id): + """ + Return a dictionary: {language: subtitles} or {} if the subtitles + couldn't be found + """ + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + if not sub_lang_list: #There was some error, it didn't get the available subtitles + return {} + if self._downloader.params.get('writesubtitles', False): + if self._downloader.params.get('subtitleslang', False): + sub_lang = self._downloader.params.get('subtitleslang') + elif 'en' in sub_lang_list: + sub_lang = 'en' + else: + sub_lang = list(sub_lang_list.keys())[0] + if not sub_lang in sub_lang_list: + self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) + return {} + sub_lang_list = {sub_lang: sub_lang_list[sub_lang]} + subtitles = {} + for sub_lang in sub_lang_list: + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + if subtitle: + subtitles[sub_lang] = subtitle + return subtitles + + def _request_subtitle(self, sub_lang, sub_name, video_id, format): + """ Return the subtitle as a string or None if they are not found """ + # return (u'Did not fetch video subtitles for %s' % sub_lang, None, None) + self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) + url = self._get_subtitle_url(sub_lang, sub_name, video_id, format) + try: + sub = compat_urllib_request.urlopen(url).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) + return + if not sub: + self._downloader.report_warning(u'Did not fetch video subtitles') + return + return sub + + def 
_get_available_subtitles(self, video_id): + """Get available subtitles. Redefine in subclasses.""" + """returns {(lang, url)} """ + # return {} + pass + + def _get_subtitle_url(self, sub_lang, sub_name, video_id, format): + """returns the url for the given subtitle. Redefine in subclasses.""" + pass + + def _request_automatic_caption(self, video_id, webpage): + """Request automatic caption. Redefine in subclasses.""" + """returns a tuple of ... """ + # return [(err_msg, None, None)] + pass From 372297e713c92489c113bf8649ec4aa1d23511f9 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Wed, 7 Aug 2013 21:24:42 +0200 Subject: [PATCH 04/17] Undo the previous commit (it was a mistake) --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index beed79fd04..ed5492826f 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -493,7 +493,7 @@ def process_info(self, info_dict): subtitles = info_dict['subtitles'] sub_format = self.params.get('subtitlesformat') - for sub_lang in subtitles: + for sub_lang in subtitles.keys(): sub = subtitles[sub_lang] if sub is None: continue From 8377574c9cb8740e24d45e9b3d30921fd6ec846c Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 08:54:10 +0200 Subject: [PATCH 05/17] [internal] Improved subtitle architecture + (update in youtube/dailymotion) The structure of subtitles was refined, you only need to implement one method that returns a dictionnary of the available subtitles (lang, url) to support all the subtitle options in a website. I updated the subtitle downloaders for youtube/dailymotion to show how it works. 
--- youtube_dl/extractor/dailymotion.py | 15 +-- youtube_dl/extractor/subtitles.py | 27 ++--- youtube_dl/extractor/youtube.py | 175 ++++++++++------------------ 3 files changed, 73 insertions(+), 144 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index eb2322d547..97003ee35d 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,6 +1,5 @@ import re import json -import itertools import socket from .common import InfoExtractor @@ -34,16 +33,12 @@ def _get_available_subtitles(self, video_id): self._downloader.report_warning(u'video doesn\'t have subtitles') return {} - def _get_subtitle_url(self, sub_lang, sub_name, video_id, format): - sub_lang_list = self._get_available_subtitles(video_id) - return sub_lang_list[sub_lang] - def _request_automatic_caption(self, video_id, webpage): - self._downloader.report_warning(u'Automatic Captions not supported by dailymotion') + self._downloader.report_warning(u'Automatic Captions not supported by this server') return {} -class DailymotionIE(DailyMotionSubtitlesIE): #,InfoExtractor): +class DailymotionIE(DailyMotionSubtitlesIE): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' @@ -116,12 +111,6 @@ def _real_extract(self, url): self._list_available_subtitles(video_id) return - if 'length_seconds' not in info: - self._downloader.report_warning(u'unable to extract video duration') - video_duration = '' - else: - video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) - return [{ 'id': video_id, 'url': video_url, diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 89864e5d78..8843e02209 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -15,7 +15,8 @@ class SubtitlesIE(InfoExtractor): def report_video_subtitles_available(self, video_id, sub_lang_list): """Report 
available subtitles.""" sub_lang = ",".join(list(sub_lang_list.keys())) - self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) + self.to_screen(u'%s: Available subtitles for video: %s' % + (video_id, sub_lang)) def _list_available_subtitles(self, video_id): sub_lang_list = self._get_available_subtitles(video_id) @@ -27,9 +28,9 @@ def _extract_subtitles(self, video_id): couldn't be found """ sub_lang_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if not sub_lang_list: #There was some error, it didn't get the available subtitles + if not sub_lang_list: # error, it didn't get the available subtitles return {} + if self._downloader.params.get('writesubtitles', False): if self._downloader.params.get('subtitleslang', False): sub_lang = self._downloader.params.get('subtitleslang') @@ -41,18 +42,15 @@ def _extract_subtitles(self, video_id): self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) return {} sub_lang_list = {sub_lang: sub_lang_list[sub_lang]} + subtitles = {} - for sub_lang in sub_lang_list: - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + for sub_lang, url in sub_lang_list.iteritems(): + subtitle = self._request_subtitle_url(sub_lang, url) if subtitle: subtitles[sub_lang] = subtitle return subtitles - def _request_subtitle(self, sub_lang, sub_name, video_id, format): - """ Return the subtitle as a string or None if they are not found """ - # return (u'Did not fetch video subtitles for %s' % sub_lang, None, None) - self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) - url = self._get_subtitle_url(sub_lang, sub_name, video_id, format) + def _request_subtitle_url(self, sub_lang, url): try: sub = compat_urllib_request.urlopen(url).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, 
socket.error) as err: @@ -64,13 +62,8 @@ def _request_subtitle(self, sub_lang, sub_name, video_id, format): return sub def _get_available_subtitles(self, video_id): - """Get available subtitles. Redefine in subclasses.""" - """returns {(lang, url)} """ - # return {} - pass - - def _get_subtitle_url(self, sub_lang, sub_name, video_id, format): - """returns the url for the given subtitle. Redefine in subclasses.""" + """returns the list of available subtitles like this {lang: url} """ + """or {} if not available. Must be redefined by the subclasses.""" pass def _request_automatic_caption(self, video_id, webpage): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2b03226f60..414e33b498 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -7,6 +7,7 @@ import itertools from .common import InfoExtractor, SearchInfoExtractor +from .subtitles import SubtitlesIE from ..utils import ( compat_http_client, compat_parse_qs, @@ -24,7 +25,66 @@ ) -class YoutubeIE(InfoExtractor): +class YoutubeSubtitlesIE(SubtitlesIE): + + def _get_available_subtitles(self, video_id): + request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) + return {} + lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + + sub_lang_list = {} + for l in lang_list: + lang = l[1] + params = compat_urllib_parse.urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': self._downloader.params.get('subtitlesformat'), + }) + url = u'http://www.youtube.com/api/timedtext?' 
+ params + sub_lang_list[lang] = url + if not sub_lang_list: + self._downloader.report_warning(u'video doesn\'t have subtitles') + return {} + return sub_lang_list + + def _request_automatic_caption(self, video_id, webpage): + """We need the webpage for getting the captions url, pass it as an + argument to speed up the process.""" + sub_lang = self._downloader.params.get('subtitleslang') or 'en' + sub_format = self._downloader.params.get('subtitlesformat') + self.to_screen(u'%s: Looking for automatic captions' % video_id) + mobj = re.search(r';ytplayer.config = ({.*?});', webpage) + err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang + if mobj is None: + self._downloader.report_warning(err_msg) + return {} + player_config = json.loads(mobj.group(1)) + try: + args = player_config[u'args'] + caption_url = args[u'ttsurl'] + timestamp = args[u'timestamp'] + params = compat_urllib_parse.urlencode({ + 'lang': 'en', + 'tlang': sub_lang, + 'fmt': sub_format, + 'ts': timestamp, + 'kind': 'asr', + }) + subtitles_url = caption_url + '&' + params + sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') + return {sub_lang: sub} + # An extractor error can be raise by the download process if there are + # no automatic captions but there are subtitles + except (KeyError, ExtractorError): + self._downloader.report_warning(err_msg) + return {} + + +class YoutubeIE(YoutubeSubtitlesIE): IE_DESC = u'YouTube.com' _VALID_URL = r"""^ ( @@ -151,19 +211,6 @@ def report_video_info_webpage_download(self, video_id): """Report attempt to download video info webpage.""" self.to_screen(u'%s: Downloading video info webpage' % video_id) - def report_video_subtitles_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Checking available subtitles' % video_id) - - def report_video_subtitles_request(self, video_id, sub_lang, format): - """Report attempt to download video info webpage.""" - 
self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) - - def report_video_subtitles_available(self, video_id, sub_lang_list): - """Report available subtitles.""" - sub_lang = ",".join(list(sub_lang_list.keys())) - self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) - def report_information_extraction(self, video_id): """Report attempt to extract video information.""" self.to_screen(u'%s: Extracting video information' % video_id) @@ -203,106 +250,6 @@ def _decrypt_signature(self, s): else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) - def _get_available_subtitles(self, video_id): - self.report_video_subtitles_download(video_id) - request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) - try: - sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) - return {} - sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) - sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) - if not sub_lang_list: - self._downloader.report_warning(u'video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _list_available_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - self.report_video_subtitles_available(video_id, sub_lang_list) - - def _request_subtitle(self, sub_lang, sub_name, video_id, format): - """ - Return the subtitle as a string or None if they are not found - """ - self.report_video_subtitles_request(video_id, sub_lang, format) - params = compat_urllib_parse.urlencode({ - 'lang': sub_lang, - 'name': sub_name, - 'v': video_id, - 'fmt': format, - }) - url = 'http://www.youtube.com/api/timedtext?' 
+ params - try: - sub = compat_urllib_request.urlopen(url).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) - return - if not sub: - self._downloader.report_warning(u'Did not fetch video subtitles') - return - return sub - - def _request_automatic_caption(self, video_id, webpage): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - sub_lang = self._downloader.params.get('subtitleslang') or 'en' - sub_format = self._downloader.params.get('subtitlesformat') - self.to_screen(u'%s: Looking for automatic captions' % video_id) - mobj = re.search(r';ytplayer.config = ({.*?});', webpage) - err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang - if mobj is None: - self._downloader.report_warning(err_msg) - return {} - player_config = json.loads(mobj.group(1)) - try: - args = player_config[u'args'] - caption_url = args[u'ttsurl'] - timestamp = args[u'timestamp'] - params = compat_urllib_parse.urlencode({ - 'lang': 'en', - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': 'asr', - }) - subtitles_url = caption_url + '&' + params - sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') - return {sub_lang: sub} - # An extractor error can be raise by the download process if there are - # no automatic captions but there are subtitles - except (KeyError, ExtractorError): - self._downloader.report_warning(err_msg) - return {} - - def _extract_subtitles(self, video_id): - """ - Return a dictionary: {language: subtitles} or {} if the subtitles - couldn't be found - """ - sub_lang_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if not sub_lang_list: #There was some error, it didn't get the available subtitles - return {} 
- if self._downloader.params.get('writesubtitles', False): - if self._downloader.params.get('subtitleslang', False): - sub_lang = self._downloader.params.get('subtitleslang') - elif 'en' in sub_lang_list: - sub_lang = 'en' - else: - sub_lang = list(sub_lang_list.keys())[0] - if not sub_lang in sub_lang_list: - self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) - return {} - sub_lang_list = {sub_lang: sub_lang_list[sub_lang]} - subtitles = {} - for sub_lang in sub_lang_list: - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) - if subtitle: - subtitles[sub_lang] = subtitle - return subtitles - def _print_formats(self, formats): print('Available formats:') for x in formats: From 505c28aac90fbee46f0d54945b27e115f90785f2 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 09:53:25 +0200 Subject: [PATCH 06/17] Separated subtitle options in their own group --- youtube_dl/__init__.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c4d595e1c1..8c6abddd93 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -119,6 +119,7 @@ def _find_term_columns(): selection = optparse.OptionGroup(parser, 'Video Selection') authentication = optparse.OptionGroup(parser, 'Authentication Options') video_format = optparse.OptionGroup(parser, 'Video Format Options') + subtitles = optparse.OptionGroup(parser, 'Subtitle Options') downloader = optparse.OptionGroup(parser, 'Download Options') postproc = optparse.OptionGroup(parser, 'Post-processing Options') filesystem = optparse.OptionGroup(parser, 'Filesystem Options') @@ -185,25 +186,26 @@ def _find_term_columns(): action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') video_format.add_option('-F', '--list-formats', action='store_true', dest='listformats', help='list all 
available formats (currently youtube only)') - video_format.add_option('--write-sub', '--write-srt', + + subtitles.add_option('--write-sub', '--write-srt', action='store_true', dest='writesubtitles', help='write subtitle file', default=False) - video_format.add_option('--write-auto-sub', '--write-automatic-sub', + subtitles.add_option('--write-auto-sub', '--write-automatic-sub', action='store_true', dest='writeautomaticsub', help='write automatic subtitle file (youtube only)', default=False) - video_format.add_option('--only-sub', + subtitles.add_option('--only-sub', action='store_true', dest='skip_download', help='[deprecated] alias of --skip-download', default=False) - video_format.add_option('--all-subs', + subtitles.add_option('--all-subs', action='store_true', dest='allsubtitles', help='downloads all the available subtitles of the video', default=False) - video_format.add_option('--list-subs', + subtitles.add_option('--list-subs', action='store_true', dest='listsubtitles', help='lists all available subtitles for the video', default=False) - video_format.add_option('--sub-format', + subtitles.add_option('--sub-format', action='store', dest='subtitlesformat', metavar='FORMAT', help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt') - video_format.add_option('--sub-lang', '--srt-lang', + subtitles.add_option('--sub-lang', '--srt-lang', action='store', dest='subtitleslang', metavar='LANG', help='language of the subtitles to download (optional) use IETF language tags like \'en\'') @@ -328,6 +330,7 @@ def _find_term_columns(): parser.add_option_group(filesystem) parser.add_option_group(verbosity) parser.add_option_group(video_format) + parser.add_option_group(subtitles) parser.add_option_group(authentication) parser.add_option_group(postproc) From 33eb0ce4c4c515b30e5809f63f892b895601b442 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 10:06:24 +0200 Subject: [PATCH 07/17] [subtitles] removed only-sub option (--skip-download 
achieves the same functionality) --- test/parameters.json | 1 - test/test_dailymotion_subtitles.py | 8 -------- test/test_youtube_subtitles.py | 8 -------- youtube_dl/__init__.py | 3 --- 4 files changed, 20 deletions(-) diff --git a/test/parameters.json b/test/parameters.json index 96998b5c39..f042880edb 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -38,7 +38,6 @@ "writedescription": false, "writeinfojson": true, "writesubtitles": false, - "onlysubtitles": false, "allsubtitles": false, "listssubtitles": false } diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index f63426a185..32e3f6abe5 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -46,14 +46,6 @@ def test_subtitles_fr(self): info_dict = IE.extract(TEST_URL) sub = info_dict[0]['subtitles']['fr'] self.assertEqual(md5(sub), '594564ec7d588942e384e920e5341792') - def test_onlysubtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['onlysubtitles'] = True - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '976553874490cba125086bbfea3ff76f') def test_allsubtitles(self): DL = FakeYDL() DL.params['allsubtitles'] = True diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index fe0eac6804..fe5d097ce5 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -45,14 +45,6 @@ def test_youtube_subtitles_it(self): info_dict = IE.extract('QRS8MkLhQmM') sub = info_dict[0]['subtitles']['it'] self.assertEqual(md5(sub), '164a51f16f260476a05b50fe4c2f161d') - def test_youtube_onlysubtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['onlysubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '4cd9278a35ba2305f47354ee13472260') def test_youtube_allsubtitles(self): 
DL = FakeYDL() DL.params['allsubtitles'] = True diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 8c6abddd93..34f3dad0f2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -193,9 +193,6 @@ def _find_term_columns(): subtitles.add_option('--write-auto-sub', '--write-automatic-sub', action='store_true', dest='writeautomaticsub', help='write automatic subtitle file (youtube only)', default=False) - subtitles.add_option('--only-sub', - action='store_true', dest='skip_download', - help='[deprecated] alias of --skip-download', default=False) subtitles.add_option('--all-subs', action='store_true', dest='allsubtitles', help='downloads all the available subtitles of the video', default=False) From 447591e1aea39f3100b66a7b94337bf67546663f Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 11:03:52 +0200 Subject: [PATCH 08/17] [test] Cleaned subtitles tests --- test/test_dailymotion_subtitles.py | 83 +++++++++------------------- test/test_youtube_subtitles.py | 88 ++++++++++++------------------ 2 files changed, 61 insertions(+), 110 deletions(-) diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index 32e3f6abe5..26c40493f8 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -15,74 +15,43 @@ from helper import FakeYDL md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() -TEST_URL = 'http://www.dailymotion.com/video/xczg00' class TestDailymotionSubtitles(unittest.TestCase): def setUp(self): - DL = FakeYDL() - DL.params['allsubtitles'] = False - DL.params['writesubtitles'] = False - DL.params['subtitlesformat'] = 'srt' - DL.params['listsubtitles'] = False + self.DL = FakeYDL() + self.url = 'http://www.dailymotion.com/video/xczg00' + def getInfoDict(self): + IE = DailymotionIE(self.DL) + info_dict = IE.extract(self.url) + return info_dict + def getSubtitles(self): + info_dict = self.getInfoDict() + return info_dict[0]['subtitles'] def 
test_no_subtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = False - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - subtitles = info_dict[0]['subtitles'] + subtitles = self.getSubtitles() self.assertEqual(subtitles, None) def test_subtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '976553874490cba125086bbfea3ff76f') + self.DL.params['writesubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') def test_subtitles_fr(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['subtitleslang'] = 'fr' - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - sub = info_dict[0]['subtitles']['fr'] - self.assertEqual(md5(sub), '594564ec7d588942e384e920e5341792') + self.DL.params['writesubtitles'] = True + self.DL.params['subtitleslang'] = 'fr' + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') def test_allsubtitles(self): - DL = FakeYDL() - DL.params['allsubtitles'] = True - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - subtitles = info_dict[0]['subtitles'] + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 5) - # def test_subtitles_sbv_format(self): - # DL = FakeYDL() - # DL.params['writesubtitles'] = True - # DL.params['subtitlesformat'] = 'sbv' - # IE = DailymotionIE(DL) - # info_dict = IE.extract(TEST_URL) - # sub = info_dict[0]['subtitles'][0] - # self.assertEqual(md5(sub), '13aeaa0c245a8bed9a451cb643e3ad8b') - # def test_subtitles_vtt_format(self): - # DL = FakeYDL() - # DL.params['writesubtitles'] = True - # DL.params['subtitlesformat'] = 'vtt' - # IE = DailymotionIE(DL) - # info_dict = IE.extract(TEST_URL) - # sub = info_dict[0]['subtitles'][0] - # 
self.assertEqual(md5(sub), '356cdc577fde0c6783b9b822e7206ff7') - def test_list_subtitles(self): - DL = FakeYDL() - DL.params['listsubtitles'] = True - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) + def test_list_subtitles(self): #ojo + self.DL.params['listsubtitles'] = True + info_dict = self.getInfoDict() self.assertEqual(info_dict, None) def test_automatic_captions(self): - DL = FakeYDL() - DL.params['writeautomaticsub'] = True - DL.params['subtitleslang'] = 'en' - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - sub = info_dict[0]['subtitles'] - self.assertTrue(len(sub) == 0) + self.DL.params['writeautomaticsub'] = True + self.DL.params['subtitleslang'] = 'en' + subtitles = self.getSubtitles() + self.assertTrue(len(subtitles.keys()) == 0) if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index fe5d097ce5..aa6a1a4342 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -18,70 +18,52 @@ class TestYoutubeSubtitles(unittest.TestCase): def setUp(self): - DL = FakeYDL() - DL.params['allsubtitles'] = False - DL.params['writesubtitles'] = False - DL.params['subtitlesformat'] = 'srt' - DL.params['listsubtitles'] = False + self.DL = FakeYDL() + self.url = 'QRS8MkLhQmM' + def getInfoDict(self): + IE = YoutubeIE(self.DL) + info_dict = IE.extract(self.url) + return info_dict + def getSubtitles(self): + info_dict = self.getInfoDict() + return info_dict[0]['subtitles'] def test_youtube_no_subtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = False - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - subtitles = info_dict[0]['subtitles'] + self.DL.params['writesubtitles'] = False + subtitles = self.getSubtitles() self.assertEqual(subtitles, None) def test_youtube_subtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - 
self.assertEqual(md5(sub), '4cd9278a35ba2305f47354ee13472260') + self.DL.params['writesubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') def test_youtube_subtitles_it(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['subtitleslang'] = 'it' - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['it'] - self.assertEqual(md5(sub), '164a51f16f260476a05b50fe4c2f161d') + self.DL.params['writesubtitles'] = True + self.DL.params['subtitleslang'] = 'it' + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') def test_youtube_allsubtitles(self): - DL = FakeYDL() - DL.params['allsubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - subtitles = info_dict[0]['subtitles'] + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) def test_youtube_subtitles_sbv_format(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['subtitlesformat'] = 'sbv' - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '13aeaa0c245a8bed9a451cb643e3ad8b') + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'sbv' + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b') def test_youtube_subtitles_vtt_format(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['subtitlesformat'] = 'vtt' - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '356cdc577fde0c6783b9b822e7206ff7') + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'vtt' + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), 
'356cdc577fde0c6783b9b822e7206ff7') def test_youtube_list_subtitles(self): - DL = FakeYDL() - DL.params['listsubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') + self.DL.params['listsubtitles'] = True + info_dict = self.getInfoDict() self.assertEqual(info_dict, None) def test_youtube_automatic_captions(self): - DL = FakeYDL() - DL.params['writeautomaticsub'] = True - DL.params['subtitleslang'] = 'it' - IE = YoutubeIE(DL) - info_dict = IE.extract('8YoUxe5ncPo') - sub = info_dict[0]['subtitles']['it'] - self.assertTrue(sub is not None) + self.url = '8YoUxe5ncPo' + self.DL.params['writeautomaticsub'] = True + self.DL.params['subtitleslang'] = 'it' + subtitles = self.getSubtitles() + self.assertTrue(subtitles['it'] is not None) if __name__ == '__main__': unittest.main() From 69df680b973841b61594c246a9cf4a708f09cb17 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 11:20:56 +0200 Subject: [PATCH 09/17] [subtitles] Improved docs + new class for servers who don't support auto-caption --- youtube_dl/extractor/dailymotion.py | 9 ++------ youtube_dl/extractor/subtitles.py | 32 +++++++++++++++-------------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 97003ee35d..8fab160057 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -3,7 +3,7 @@ import socket from .common import InfoExtractor -from .subtitles import SubtitlesIE +from .subtitles import NoAutoSubtitlesIE from ..utils import ( compat_http_client, @@ -17,7 +17,7 @@ ) -class DailyMotionSubtitlesIE(SubtitlesIE): +class DailyMotionSubtitlesIE(NoAutoSubtitlesIE): def _get_available_subtitles(self, video_id): request = compat_urllib_request.Request('https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id) @@ -33,11 +33,6 @@ def _get_available_subtitles(self, video_id): self._downloader.report_warning(u'video doesn\'t 
have subtitles') return {} - def _request_automatic_caption(self, video_id, webpage): - self._downloader.report_warning(u'Automatic Captions not supported by this server') - return {} - - class DailymotionIE(DailyMotionSubtitlesIE): """Information Extractor for Dailymotion""" diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 8843e02209..caacea5fe5 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -12,21 +12,15 @@ class SubtitlesIE(InfoExtractor): - def report_video_subtitles_available(self, video_id, sub_lang_list): - """Report available subtitles.""" + def _list_available_subtitles(self, video_id): + """ outputs the available subtitles for the video """ + sub_lang_list = self._get_available_subtitles(video_id) sub_lang = ",".join(list(sub_lang_list.keys())) self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) - def _list_available_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - self.report_video_subtitles_available(video_id, sub_lang_list) - def _extract_subtitles(self, video_id): - """ - Return a dictionary: {language: subtitles} or {} if the subtitles - couldn't be found - """ + """ returns {sub_lang: sub} or {} if subtitles not found """ sub_lang_list = self._get_available_subtitles(video_id) if not sub_lang_list: # error, it didn't get the available subtitles return {} @@ -51,6 +45,7 @@ def _extract_subtitles(self, video_id): return subtitles def _request_subtitle_url(self, sub_lang, url): + """ makes the http request for the subtitle """ try: sub = compat_urllib_request.urlopen(url).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -62,12 +57,19 @@ def _request_subtitle_url(self, sub_lang, url): return sub def _get_available_subtitles(self, video_id): - """returns the list of available subtitles like this {lang: url} """ - """or {} if not 
available. Must be redefined by the subclasses.""" + """ returns {sub_lang: url} or {} if not available """ + """ Must be redefined by the subclasses """ pass def _request_automatic_caption(self, video_id, webpage): - """Request automatic caption. Redefine in subclasses.""" - """returns a tuple of ... """ - # return [(err_msg, None, None)] + """ returns {sub_lang: sub} or {} if not available """ + """ Must be redefined by the subclasses """ pass + + +class NoAutoSubtitlesIE(SubtitlesIE): + """ A subtitle class for the servers that don't support auto-captions""" + + def _request_automatic_caption(self, video_id, webpage): + self._downloader.report_warning(u'Automatic Captions not supported by this server') + return {} From d55de6eec2adf7d1aaca87e75dad06ef15d9be26 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 18:30:04 +0200 Subject: [PATCH 10/17] [subtitles] Skips now the subtitles that has already been downloaded. Just a validation for file exists, I also removed a method that wasn't been used because it was a copy paste from FileDownloader. 
--- youtube_dl/YoutubeDL.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ed5492826f..e11d6f9941 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -221,19 +221,16 @@ def report_writedescription(self, descfn): def report_writesubtitles(self, sub_filename): """ Report that the subtitles file is being written """ - self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename) + self.to_screen(u'[info] Writing subtitle: ' + sub_filename) + + def report_existingsubtitles(self, sub_filename): + """ Report that the subtitles file has been already written """ + self.to_screen(u'[info] Skipping existing subtitle: ' + sub_filename) def report_writeinfojson(self, infofn): """ Report that the metadata file has been written """ self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) - def report_file_already_downloaded(self, file_name): - """Report file has already been fully downloaded.""" - try: - self.to_screen(u'[download] %s has already been downloaded' % file_name) - except (UnicodeEncodeError) as err: - self.to_screen(u'[download] The file has already been downloaded') - def increment_downloads(self): """Increment the ordinal that assigns a number to each file.""" self._num_downloads += 1 @@ -492,13 +489,16 @@ def process_info(self, info_dict): # that way it will silently go on when used with unsupporting IE subtitles = info_dict['subtitles'] sub_format = self.params.get('subtitlesformat') - + for sub_lang in subtitles.keys(): sub = subtitles[sub_lang] if sub is None: continue try: sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' 
+ sub_format + if os.path.isfile(encodeFilename(sub_filename)): + self.report_existingsubtitles(sub_filename) + continue self.report_writesubtitles(sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: subfile.write(sub) From d80a064eff4fe2416f9db36b07f1e2ca641f1334 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 22:22:33 +0200 Subject: [PATCH 11/17] [subtitles] Added tests to check correct behavior when no subtitles are available --- test/test_dailymotion_subtitles.py | 9 +++++++-- test/test_youtube_subtitles.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index 26c40493f8..efc4e574ff 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -27,14 +27,14 @@ def getInfoDict(self): def getSubtitles(self): info_dict = self.getInfoDict() return info_dict[0]['subtitles'] - def test_no_subtitles(self): + def test_no_writesubtitles(self): subtitles = self.getSubtitles() self.assertEqual(subtitles, None) def test_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') - def test_subtitles_fr(self): + def test_subtitles_lang(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitleslang'] = 'fr' subtitles = self.getSubtitles() @@ -52,6 +52,11 @@ def test_automatic_captions(self): self.DL.params['subtitleslang'] = 'en' subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) == 0) + def test_nosubtitles(self): + self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(len(subtitles), 0) if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 
aa6a1a4342..e40243077f 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -27,7 +27,7 @@ def getInfoDict(self): def getSubtitles(self): info_dict = self.getInfoDict() return info_dict[0]['subtitles'] - def test_youtube_no_subtitles(self): + def test_youtube_no_writesubtitles(self): self.DL.params['writesubtitles'] = False subtitles = self.getSubtitles() self.assertEqual(subtitles, None) @@ -35,7 +35,7 @@ def test_youtube_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') - def test_youtube_subtitles_it(self): + def test_youtube_subtitles_lang(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitleslang'] = 'it' subtitles = self.getSubtitles() @@ -64,6 +64,12 @@ def test_youtube_automatic_captions(self): self.DL.params['subtitleslang'] = 'it' subtitles = self.getSubtitles() self.assertTrue(subtitles['it'] is not None) + def test_youtube_nosubtitles(self): + self.url = 'sAjKT8FhjI8' + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(len(subtitles), 0) + if __name__ == '__main__': unittest.main() From d6e203b3dcef8f291b57021903e629d3e30e1f0b Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Fri, 6 Sep 2013 16:26:22 +0200 Subject: [PATCH 12/17] [subtitles] fixed multiple subtitles language separated by comma after merge As mentioned in the pull request, I forgot to include this changes. 
https://github.com/rg3/youtube-dl/commit/aa6a10c44a8e2e86f709c5301f9ea6ac3f01f002 --- test/test_dailymotion_subtitles.py | 13 +++++++++--- test/test_youtube_subtitles.py | 13 +++++++++--- youtube_dl/YoutubeDL.py | 2 +- youtube_dl/__init__.py | 12 +++++++---- youtube_dl/extractor/subtitles.py | 33 +++++++++++++++++------------- youtube_dl/extractor/youtube.py | 2 +- 6 files changed, 49 insertions(+), 26 deletions(-) diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index efc4e574ff..bcd9f79f65 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -36,20 +36,20 @@ def test_subtitles(self): self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') def test_subtitles_lang(self): self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslang'] = 'fr' + self.DL.params['subtitleslangs'] = ['fr'] subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') def test_allsubtitles(self): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 5) - def test_list_subtitles(self): #ojo + def test_list_subtitles(self): self.DL.params['listsubtitles'] = True info_dict = self.getInfoDict() self.assertEqual(info_dict, None) def test_automatic_captions(self): self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = 'en' + self.DL.params['subtitleslang'] = ['en'] subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) == 0) def test_nosubtitles(self): @@ -57,6 +57,13 @@ def test_nosubtitles(self): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles), 0) + def test_multiple_langs(self): + self.DL.params['writesubtitles'] = True + langs = ['es', 'fr', 'de'] + self.DL.params['subtitleslangs'] = langs + subtitles = self.getSubtitles() + for lang in langs: + self.assertTrue(subtitles.get(lang) is not None, 
u'Subtitles for \'%s\' not extracted' % lang) if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index e40243077f..5632871aca 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -37,7 +37,7 @@ def test_youtube_subtitles(self): self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') def test_youtube_subtitles_lang(self): self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslang'] = 'it' + self.DL.params['subtitleslangs'] = ['it'] subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') def test_youtube_allsubtitles(self): @@ -61,7 +61,7 @@ def test_youtube_list_subtitles(self): def test_youtube_automatic_captions(self): self.url = '8YoUxe5ncPo' self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = 'it' + self.DL.params['subtitleslangs'] = ['it'] subtitles = self.getSubtitles() self.assertTrue(subtitles['it'] is not None) def test_youtube_nosubtitles(self): @@ -69,7 +69,14 @@ def test_youtube_nosubtitles(self): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles), 0) - + def test_youtube_multiple_langs(self): + self.url = 'QRS8MkLhQmM' + self.DL.params['writesubtitles'] = True + langs = ['it', 'fr', 'de'] + self.DL.params['subtitleslangs'] = langs + subtitles = self.getSubtitles() + for lang in langs: + self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 1fd610a6e7..e9f29e6808 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -76,7 +76,7 @@ class YoutubeDL(object): allsubtitles: Downloads all the subtitles of the video listsubtitles: Lists all available subtitles for the video subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) - 
subtitleslang: Language of the subtitles to download + subtitleslangs: Language of the subtitles to download keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 5d686a928e..2c2fd441cf 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -83,6 +83,9 @@ def _format_option_string(option): return "".join(opts) + def _comma_separated_values_options_callback(option, opt_str, value, parser): + setattr(parser.values, option.dest, value.split(',')) + def _find_term_columns(): columns = os.environ.get('COLUMNS', None) if columns: @@ -203,9 +206,10 @@ def _find_term_columns(): subtitles.add_option('--sub-format', action='store', dest='subtitlesformat', metavar='FORMAT', help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt') - subtitles.add_option('--sub-lang', '--srt-lang', - action='store', dest='subtitleslang', metavar='LANG', - help='language of the subtitles to download (optional) use IETF language tags like \'en\'') + subtitles.add_option('--sub-lang', '--sub-langs', '--srt-lang', + action='callback', dest='subtitleslangs', metavar='LANGS', type='str', + default=[], callback=_comma_separated_values_options_callback, + help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'') downloader.add_option('-r', '--rate-limit', dest='ratelimit', metavar='LIMIT', help='maximum download rate (e.g. 
50k or 44.6m)') @@ -570,7 +574,7 @@ def _real_main(argv=None): 'allsubtitles': opts.allsubtitles, 'listsubtitles': opts.listsubtitles, 'subtitlesformat': opts.subtitlesformat, - 'subtitleslang': opts.subtitleslang, + 'subtitleslangs': opts.subtitleslangs, 'matchtitle': decodeOption(opts.matchtitle), 'rejecttitle': decodeOption(opts.rejecttitle), 'max_downloads': opts.max_downloads, diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index caacea5fe5..c10cdf2667 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -21,24 +21,29 @@ def _list_available_subtitles(self, video_id): def _extract_subtitles(self, video_id): """ returns {sub_lang: sub} or {} if subtitles not found """ - sub_lang_list = self._get_available_subtitles(video_id) - if not sub_lang_list: # error, it didn't get the available subtitles + available_subs_list = self._get_available_subtitles(video_id) + if not available_subs_list: # error, it didn't get the available subtitles return {} + if self._downloader.params.get('allsubtitles', False): + sub_lang_list = available_subs_list + else: + if self._downloader.params.get('writesubtitles', False): + if self._downloader.params.get('subtitleslangs', False): + requested_langs = self._downloader.params.get('subtitleslangs') + elif 'en' in available_subs_list: + requested_langs = ['en'] + else: + requested_langs = [list(available_subs_list.keys())[0]] - if self._downloader.params.get('writesubtitles', False): - if self._downloader.params.get('subtitleslang', False): - sub_lang = self._downloader.params.get('subtitleslang') - elif 'en' in sub_lang_list: - sub_lang = 'en' - else: - sub_lang = list(sub_lang_list.keys())[0] - if not sub_lang in sub_lang_list: - self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) - return {} - sub_lang_list = {sub_lang: sub_lang_list[sub_lang]} + sub_lang_list = {} + for sub_lang in requested_langs: + if not 
sub_lang in available_subs_list: + self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) + continue + sub_lang_list[sub_lang] = available_subs_list[sub_lang] subtitles = {} - for sub_lang, url in sub_lang_list.iteritems(): + for sub_lang, url in sub_lang_list.items(): subtitle = self._request_subtitle_url(sub_lang, url) if subtitle: subtitles[sub_lang] = subtitle diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 370cc64cc9..b3400df0ab 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -160,7 +160,7 @@ def _get_available_subtitles(self, video_id): def _request_automatic_caption(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_lang = self._downloader.params.get('subtitleslang') or 'en' + sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0] sub_format = self._downloader.params.get('subtitlesformat') self.to_screen(u'%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) From f8e52269c1a27c28aef606f010e2c64ff9a946d3 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Wed, 11 Sep 2013 15:21:09 +0200 Subject: [PATCH 13/17] [subtitles] made inheritance hierarchy flat as requested --- youtube_dl/extractor/dailymotion.py | 32 ++++++++++++++--------------- youtube_dl/extractor/youtube.py | 6 ++---- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index f7dffd4cce..c7bcf6e8e6 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -18,23 +18,7 @@ ) -class DailyMotionSubtitlesIE(NoAutoSubtitlesIE): - - def _get_available_subtitles(self, video_id): - request = compat_urllib_request.Request('https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id) - try: - 
sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) - return {} - info = json.loads(sub_list) - if (info['total'] > 0): - sub_lang_list = dict((l['language'], l['url']) for l in info['list']) - return sub_lang_list - self._downloader.report_warning(u'video doesn\'t have subtitles') - return {} - -class DailymotionIE(DailyMotionSubtitlesIE, InfoExtractor): +class DailymotionIE(NoAutoSubtitlesIE): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' @@ -120,6 +104,20 @@ def _real_extract(self, url): 'thumbnail': info['thumbnail_url'] }] + def _get_available_subtitles(self, video_id): + request = compat_urllib_request.Request('https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id) + try: + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) + return {} + info = json.loads(sub_list) + if (info['total'] > 0): + sub_lang_list = dict((l['language'], l['url']) for l in info['list']) + return sub_lang_list + self._downloader.report_warning(u'video doesn\'t have subtitles') + return {} + class DailymotionPlaylistIE(InfoExtractor): _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P.+?)/' diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e71cd62ec3..5945eab70f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -24,7 +24,7 @@ orderedSet, ) -class YoutubeBaseInfoExtractor(InfoExtractor): +class YoutubeBaseInfoExtractor(SubtitlesIE): """Provide base functions 
for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' @@ -131,8 +131,6 @@ def _real_initialize(self): return self._confirm_age() -class YoutubeSubtitlesIE(SubtitlesIE): - def _get_available_subtitles(self, video_id): request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: @@ -189,7 +187,7 @@ def _request_automatic_caption(self, video_id, webpage): self._downloader.report_warning(err_msg) return {} -class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): +class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = u'YouTube.com' _VALID_URL = r"""^ ( From de7f3446e0bf99a2fe7a93eb28175b16cb2cf6c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 11 Sep 2013 15:48:23 +0200 Subject: [PATCH 14/17] [youtube] move subtitles methods from the base extractor to YoutubeIE --- youtube_dl/extractor/youtube.py | 115 ++++++++++++++++---------------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5945eab70f..8102f6d243 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -24,7 +24,7 @@ orderedSet, ) -class YoutubeBaseInfoExtractor(SubtitlesIE): +class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' @@ -131,63 +131,8 @@ def _real_initialize(self): return self._confirm_age() - def _get_available_subtitles(self, video_id): - request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) - try: - sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, 
compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) - return {} - lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) - sub_lang_list = {} - for l in lang_list: - lang = l[1] - params = compat_urllib_parse.urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': self._downloader.params.get('subtitlesformat'), - }) - url = u'http://www.youtube.com/api/timedtext?' + params - sub_lang_list[lang] = url - if not sub_lang_list: - self._downloader.report_warning(u'video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _request_automatic_caption(self, video_id, webpage): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0] - sub_format = self._downloader.params.get('subtitlesformat') - self.to_screen(u'%s: Looking for automatic captions' % video_id) - mobj = re.search(r';ytplayer.config = ({.*?});', webpage) - err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang - if mobj is None: - self._downloader.report_warning(err_msg) - return {} - player_config = json.loads(mobj.group(1)) - try: - args = player_config[u'args'] - caption_url = args[u'ttsurl'] - timestamp = args[u'timestamp'] - params = compat_urllib_parse.urlencode({ - 'lang': 'en', - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': 'asr', - }) - subtitles_url = caption_url + '&' + params - sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') - return {sub_lang: sub} - # An extractor error can be raise by the download process if there are - # no automatic captions but there are subtitles - except (KeyError, ExtractorError): - self._downloader.report_warning(err_msg) - return {} - -class YoutubeIE(YoutubeBaseInfoExtractor): +class YoutubeIE(YoutubeBaseInfoExtractor, 
SubtitlesInfoExtractor): IE_DESC = u'YouTube.com' _VALID_URL = r"""^ ( @@ -508,6 +453,62 @@ def _decrypt_signature_age_gate(self, s): # Fallback to the other algortihms return self._decrypt_signature(s) + def _get_available_subtitles(self, video_id): + request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) + return {} + lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + + sub_lang_list = {} + for l in lang_list: + lang = l[1] + params = compat_urllib_parse.urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': self._downloader.params.get('subtitlesformat'), + }) + url = u'http://www.youtube.com/api/timedtext?' + params + sub_lang_list[lang] = url + if not sub_lang_list: + self._downloader.report_warning(u'video doesn\'t have subtitles') + return {} + return sub_lang_list + + def _request_automatic_caption(self, video_id, webpage): + """We need the webpage for getting the captions url, pass it as an + argument to speed up the process.""" + sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0] + sub_format = self._downloader.params.get('subtitlesformat') + self.to_screen(u'%s: Looking for automatic captions' % video_id) + mobj = re.search(r';ytplayer.config = ({.*?});', webpage) + err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang + if mobj is None: + self._downloader.report_warning(err_msg) + return {} + player_config = json.loads(mobj.group(1)) + try: + args = player_config[u'args'] + caption_url = args[u'ttsurl'] + timestamp = args[u'timestamp'] + params = compat_urllib_parse.urlencode({ + 'lang': 'en', + 'tlang': sub_lang, + 'fmt': sub_format, + 'ts': timestamp, + 'kind': 'asr', + 
}) + subtitles_url = caption_url + '&' + params + sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') + return {sub_lang: sub} + # An extractor error can be raise by the download process if there are + # no automatic captions but there are subtitles + except (KeyError, ExtractorError): + self._downloader.report_warning(err_msg) + return {} + def _print_formats(self, formats): print('Available formats:') for x in formats: From 54d39d8b2f7a9fe148a24dd2785108b7d3823d9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 11 Sep 2013 15:51:04 +0200 Subject: [PATCH 15/17] [subtitles] rename SubtitlesIE to SubtitlesInfoExtractor Otherwise it can be automatically detected as an IE ready for use. --- youtube_dl/extractor/dailymotion.py | 4 ++-- youtube_dl/extractor/subtitles.py | 4 ++-- youtube_dl/extractor/youtube.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index c7bcf6e8e6..d73023b9ed 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -4,7 +4,7 @@ import socket from .common import InfoExtractor -from .subtitles import NoAutoSubtitlesIE +from .subtitles import NoAutoSubtitlesInfoExtractor from ..utils import ( compat_http_client, @@ -18,7 +18,7 @@ ) -class DailymotionIE(NoAutoSubtitlesIE): +class DailymotionIE(NoAutoSubtitlesInfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index c10cdf2667..8953d6789c 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -10,7 +10,7 @@ ) -class SubtitlesIE(InfoExtractor): +class SubtitlesInfoExtractor(InfoExtractor): def _list_available_subtitles(self, video_id): """ outputs the available subtitles
for the video """ @@ -72,7 +72,7 @@ def _request_automatic_caption(self, video_id, webpage): pass -class NoAutoSubtitlesIE(SubtitlesIE): +class NoAutoSubtitlesInfoExtractor(SubtitlesInfoExtractor): """ A subtitle class for the servers that don't support auto-captions""" def _request_automatic_caption(self, video_id, webpage): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8102f6d243..0476f113e6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -7,7 +7,7 @@ import itertools from .common import InfoExtractor, SearchInfoExtractor -from .subtitles import SubtitlesIE +from .subtitles import SubtitlesInfoExtractor from ..utils import ( compat_http_client, compat_parse_qs, From d82134c3395c0912157c7ccae9f21d4b3375910b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 11 Sep 2013 16:05:49 +0200 Subject: [PATCH 16/17] [subtitles] Simplify the extraction of subtitles in subclasses and remove NoAutoSubtitlesInfoExtractor Subclasses just need to call the method extract_subtitles, which will call _extract_subtitles and _request_automatic_caption Now the default implementation of _request_automatic_caption returns {}. 
--- youtube_dl/extractor/dailymotion.py | 13 +++-------- youtube_dl/extractor/subtitles.py | 34 +++++++++++++++++++---------- youtube_dl/extractor/youtube.py | 7 +----- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index d73023b9ed..abd6a36ee0 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -4,7 +4,7 @@ import socket from .common import InfoExtractor -from .subtitles import NoAutoSubtitlesInfoExtractor +from .subtitles import SubtitlesInfoExtractor from ..utils import ( compat_http_client, @@ -18,7 +18,7 @@ ) -class DailymotionIE(NoAutoSubtitlesInfoExtractor): +class DailymotionIE(SubtitlesInfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' @@ -81,14 +81,7 @@ def _real_extract(self, url): video_url = info[max_quality] # subtitles - video_subtitles = None - video_webpage = None - - if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): - video_subtitles = self._extract_subtitles(video_id) - elif self._downloader.params.get('writeautomaticsub', False): - video_subtitles = self._request_automatic_caption(video_id, video_webpage) - + video_subtitles = self.extract_subtitles(video_id) if self._downloader.params.get('listsubtitles', False): self._list_available_subtitles(video_id) return diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 8953d6789c..5ae8b3b167 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -62,19 +62,31 @@ def _request_subtitle_url(self, sub_lang, url): return sub def _get_available_subtitles(self, video_id): - """ returns {sub_lang: url} or {} if not available """ - """ Must be redefined by the subclasses """ + """ + returns {sub_lang: url} or {} if not available + Must be redefined 
by the subclasses + """ pass def _request_automatic_caption(self, video_id, webpage): - """ returns {sub_lang: sub} or {} if not available """ - """ Must be redefined by the subclasses """ - pass - - -class NoAutoSubtitlesInfoExtractor(SubtitlesInfoExtractor): - """ A subtitle class for the servers that don't support auto-captions""" - - def _request_automatic_caption(self, video_id, webpage): + """ + returns {sub_lang: sub} or {} if not available + Must be redefined by the subclasses that support automatic captions, + otherwise it will return {} + """ self._downloader.report_warning(u'Automatic Captions not supported by this server') return {} + + def extract_subtitles(self, video_id, video_webpage=None): + """ + Extract the subtitles and/or the automatic captions if requested. + Returns None or a dictionary in the format {sub_lang: sub} + """ + video_subtitles = None + if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): + video_subtitles = self._extract_subtitles(video_id) + elif self._downloader.params.get('writeautomaticsub', False): + video_subtitles = self._request_automatic_caption(video_id, video_webpage) + return video_subtitles + + diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0476f113e6..3bba45b799 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -707,12 +707,7 @@ def _real_extract(self, url): video_description = u'' # subtitles - video_subtitles = None - - if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): - video_subtitles = self._extract_subtitles(video_id) - elif self._downloader.params.get('writeautomaticsub', False): - video_subtitles = self._request_automatic_caption(video_id, video_webpage) + video_subtitles = self.extract_subtitles(video_id, video_webpage) if self._downloader.params.get('listsubtitles', False): self._list_available_subtitles(video_id) 
From 7fad1c6328b02ba9f23d37f374a05255abfe38a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 11 Sep 2013 16:24:47 +0200 Subject: [PATCH 17/17] [subtitles] Use self._download_webpage for extracting the subtitles It raises ExtractorError for the same exceptions we have to catch. --- youtube_dl/extractor/dailymotion.py | 10 ++++------ youtube_dl/extractor/subtitles.py | 12 +++--------- youtube_dl/extractor/youtube.py | 7 ++++--- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index abd6a36ee0..360113f9c4 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,14 +1,11 @@ import re import json import itertools -import socket from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( - compat_http_client, - compat_urllib_error, compat_urllib_request, compat_str, get_element_by_attribute, @@ -98,10 +95,11 @@ def _real_extract(self, url): }] def _get_available_subtitles(self, video_id): - request = compat_urllib_request.Request('https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id) try: - sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + sub_list = self._download_webpage( + 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, + video_id, note=False) + except ExtractorError as err: self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) return {} info = json.loads(sub_list) diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 5ae8b3b167..9a3c54b65a 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -1,12 +1,8 @@ -import socket - from .common import InfoExtractor from 
..utils import ( - compat_http_client, - compat_urllib_error, - compat_urllib_request, compat_str, + ExtractorError, ) @@ -52,8 +48,8 @@ def _extract_subtitles(self, video_id): def _request_subtitle_url(self, sub_lang, url): """ makes the http request for the subtitle """ try: - sub = compat_urllib_request.urlopen(url).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + sub = self._download_webpage(url, None, note=False) + except ExtractorError as err: self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) return if not sub: @@ -88,5 +84,3 @@ def extract_subtitles(self, video_id, video_webpage=None): elif self._downloader.params.get('writeautomaticsub', False): video_subtitles = self._request_automatic_caption(video_id, video_webpage) return video_subtitles - - diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3bba45b799..d06cc49c45 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -454,10 +454,11 @@ def _decrypt_signature_age_gate(self, s): return self._decrypt_signature(s) def _get_available_subtitles(self, video_id): - request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + sub_list = self._download_webpage( + 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, + video_id, note=False) + except ExtractorError as err: self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) return {} lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)