From aba8df23edf4f1078b163b490174c2d766432b55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 27 Apr 2013 10:41:52 +0200 Subject: [PATCH 1/4] YoutubePlaylistIE: don't crash with empty lists (related #808) The playlist_title wasn't initialized. --- test/test_youtube_lists.py | 7 +++++++ youtube_dl/InfoExtractors.py | 3 +-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index c7f00af32..b11e6ccaa 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -71,6 +71,13 @@ def test_youtube_playlist_with_deleted(self): ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']] self.assertFalse('pElCt5oNDuI' in ytie_results) self.assertFalse('KdPEApIVdWM' in ytie_results) + + def test_youtube_playlist_empty(self): + dl = FakeDownloader() + ie = YoutubePlaylistIE(dl) + result = ie.extract('https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx')[0] + self.assertIsPlaylist(result) + self.assertEqual(len(result['entries']), 0) def test_youtube_course(self): dl = FakeDownloader() diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 3450f0d17..967f6a100 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1723,12 +1723,11 @@ def _real_extract(self, url): if 'feed' not in response: self._downloader.report_error(u'Got a malformed response from YouTube API') return + playlist_title = response['feed']['title']['$t'] if 'entry' not in response['feed']: # Number of videos is a multiple of self._MAX_RESULTS break - playlist_title = response['feed']['title']['$t'] - videos += [ (entry['yt$position']['$t'], entry['content']['src']) for entry in response['feed']['entry'] if 'content' in entry ] From 4c9f7a9988f296eeedd0843cded5cbcec3392adb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 27 Apr 2013 11:03:34 +0200 
Subject: [PATCH 2/4] SteamIE: accept urls with agecheck --- youtube_dl/InfoExtractors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 967f6a100..936af9cb4 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3560,6 +3560,7 @@ def _real_extract(self, url): class SteamIE(InfoExtractor): _VALID_URL = r"""http://store.steampowered.com/ + (agecheck/)? (?P<urltype>video|app)/ #If the page is only for videos or for a game (?P<gameID>\d+)/? (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID From bd55852517a40d011b303559f4cd78773a2f3de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 27 Apr 2013 14:01:55 +0200 Subject: [PATCH 3/4] Allow to select videos to download by their upload dates (related #137) Only absolute dates. --- test/test_utils.py | 9 +++++++++ youtube_dl/FileDownloader.py | 6 ++++++ youtube_dl/__init__.py | 10 +++++++++- youtube_dl/utils.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 54 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index eeaaa7fad..f9d58268b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -14,6 +14,7 @@ from youtube_dl.utils import sanitize_filename from youtube_dl.utils import unescapeHTML from youtube_dl.utils import orderedSet +from youtube_dl.utils import DateRange if sys.version_info < (3, 0): _compat_str = lambda b: b.decode('unicode-escape') @@ -95,6 +96,14 @@ def test_ordered_set(self): def test_unescape_html(self): self.assertEqual(unescapeHTML(_compat_str('%20;')), _compat_str('%20;')) + + def test_daterange(self): + _20century = DateRange("19000101","20000101") + self.assertFalse("17890714" in _20century) + _ac = DateRange("00010101") + self.assertTrue("19690721" in _ac) + _firstmilenium = DateRange(end="10000101") + self.assertTrue("07110427" in _firstmilenium) if __name__ == '__main__': unittest.main() diff --git 
a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index d0378fb14..2db686d62 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -89,6 +89,7 @@ class FileDownloader(object): keepvideo: Keep the video file after post-processing min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size + daterange: A DateRange object, download only if the upload_date is in the range. """ params = None @@ -424,6 +425,11 @@ def _match_entry(self, info_dict): if rejecttitle: if re.search(rejecttitle, title, re.IGNORECASE): return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' + date = info_dict.get('upload_date', None) + if date is not None: + dateRange = self.params.get('daterange', DateRange()) + if date not in dateRange: + return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) return None def extract_info(self, url, download = True, ie_name = None): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index d491402c6..ce754ffd3 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -157,6 +157,9 @@ def _find_term_columns(): selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None) selection.add_option('--min-filesize', metavar='SIZE', dest='min_filesize', help="Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)", default=None) selection.add_option('--max-filesize', metavar='SIZE', dest='max_filesize', help="Do not download any videos larger than SIZE (e.g. 
50k or 44.6m)", default=None) + selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None) + selection.add_option('--datebefore', metavar='DATE', dest='datebefore', help='download only videos uploaded before this date', default=None) + selection.add_option('--dateafter', metavar='DATE', dest='dateafter', help='download only videos uploaded after this date', default=None) authentication.add_option('-u', '--username', @@ -447,6 +450,10 @@ def _real_main(argv=None): if opts.recodevideo is not None: if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg']: parser.error(u'invalid video recode format specified') + if opts.date is not None: + date = DateRange.day(opts.date) + else: + date = DateRange(opts.dateafter, opts.datebefore) if sys.version_info < (3,): # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems) @@ -513,7 +520,8 @@ def _real_main(argv=None): 'test': opts.test, 'keepvideo': opts.keepvideo, 'min_filesize': opts.min_filesize, - 'max_filesize': opts.max_filesize + 'max_filesize': opts.max_filesize, + 'daterange': date }) if opts.verbose: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 017f06c42..e5d756b8b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -12,6 +12,7 @@ import zlib import email.utils import json +import datetime try: import urllib.request as compat_urllib_request @@ -568,3 +569,32 @@ def http_response(self, req, resp): https_request = http_request https_response = http_response + +def date_from_str(date_str): + """Return a datetime object from a string in the format YYYYMMDD""" + return datetime.datetime.strptime(date_str, "%Y%m%d").date() + +class DateRange(object): + """Represents a time interval between two dates""" + def __init__(self, start=None, end=None): + """start and end must be strings in the format accepted by date""" + if start is not None: + self.start = date_from_str(start) + else: + 
self.start = datetime.datetime.min.date() + if end is not None: + self.end = date_from_str(end) + else: + self.end = datetime.datetime.max.date() + if self.start >= self.end: + raise ValueError('Date range: "%s" , the start date must be before the end date' % self) + @classmethod + def day(cls, day): + """Returns a range that only contains the given day""" + return cls(day,day) + def __contains__(self, date): + """Check if the date is in the range""" + date = date_from_str(date) + return self.start <= date and date <= self.end + def __str__(self): + return '%s - %s' % ( self.start.isoformat(), self.end.isoformat()) From bf50b0383e4d6728bbbf1d0ee70cf586a90efb40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 27 Apr 2013 15:14:20 +0200 Subject: [PATCH 4/4] Fix some IEs that didn't return the upload_date in the YYYYMMDD format Create a function unified_strdate in utils.py to fix these problems --- test/test_utils.py | 7 +++++++ youtube_dl/InfoExtractors.py | 16 ++++++---------- youtube_dl/utils.py | 17 ++++++++++++++++- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index f9d58268b..343409a7a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -15,6 +15,7 @@ from youtube_dl.utils import unescapeHTML from youtube_dl.utils import orderedSet from youtube_dl.utils import DateRange +from youtube_dl.utils import unified_strdate if sys.version_info < (3, 0): _compat_str = lambda b: b.decode('unicode-escape') @@ -104,6 +105,12 @@ def test_daterange(self): self.assertTrue("19690721" in _ac) _firstmilenium = DateRange(end="10000101") self.assertTrue("07110427" in _firstmilenium) + + def test_unified_dates(self): + self.assertEqual(unified_strdate('December 21, 2010'), '20101221') + self.assertEqual(unified_strdate('8/7/2009'), '20090708') + self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214') + self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), 
'20121011') if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 936af9cb4..88ea567f8 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -562,12 +562,7 @@ def _real_extract(self, url): mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) if mobj is not None: upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) - format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y'] - for expression in format_expressions: - try: - upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d') - except: - pass + upload_date = unified_strdate(upload_date) # description video_description = get_element_by_id("eow-description", video_webpage) @@ -2385,7 +2380,7 @@ def _real_extract(self, url): shortMediaId = mediaId.split(':')[-1] showId = mediaId.split(':')[-2].replace('.com', '') officialTitle = itemEl.findall('./title')[0].text - officialDate = itemEl.findall('./pubDate')[0].text + officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text) configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' 
+ compat_urllib_parse.urlencode({'uri': mediaId})) @@ -2695,12 +2690,13 @@ def _real_extract(self, url): streams = json.loads(stream_json) mediaURL = streams['http_mp3_128_url'] + upload_date = unified_strdate(info['created_at']) return [{ 'id': info['id'], 'url': mediaURL, 'uploader': info['user']['username'], - 'upload_date': info['created_at'], + 'upload_date': upload_date, 'title': info['title'], 'ext': u'mp3', 'description': info['description'], @@ -3759,7 +3755,7 @@ def _real_extract(self, url): self._downloader.report_warning(u'unable to extract video date') upload_date = None else: - upload_date = result.group('date').strip() + upload_date = unified_strdate(result.group('date').strip()) # Get the video uploader result = re.search(r'Submitted:(?P.*)', webpage) @@ -3866,7 +3862,7 @@ def _real_extract(self, url): if result is None: self._downloader.report_error(u'unable to extract video title') return - upload_date = result.group('date') + upload_date = unified_strdate(result.group('date')) info = {'id': video_id, 'url': video_url, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e5d756b8b..3a2f0022f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -569,7 +569,22 @@ def http_response(self, req, resp): https_request = http_request https_response = http_response - + +def unified_strdate(date_str): + """Return a string with the date in the format YYYYMMDD""" + upload_date = None + #Replace commas + date_str = date_str.replace(',',' ') + # %z (UTC offset) is only supported in python>=3.2 + date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) + format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S'] + for expression in format_expressions: + try: + upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') + except: + pass + return upload_date + def date_from_str(date_str): """Return a datetime object from a string in the format YYYYMMDD""" return 
datetime.datetime.strptime(date_str, "%Y%m%d").date()