From b7ab05908440915c6c5faa541abe00c62a88bc27 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 20 Jan 2014 11:36:47 +0100 Subject: [PATCH] Add infrastructure for paged lists This commit allows to download pages in playlists as needed instead of all at once. Before this commit, youtube-dl http://www.youtube.com/user/ANNnewsCH/videos --playlist-end 2 --skip-download took quite some time - now it's almost instantaneous. As an example, the youtube:user extractor has been converted. Fixes #2175 --- test/test_utils.py | 22 +++++++++++++++++ youtube_dl/YoutubeDL.py | 23 +++++++++++------ youtube_dl/extractor/youtube.py | 28 ++++++++------------- youtube_dl/utils.py | 44 +++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 25 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index bee355ee0e..349c1107f4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -18,6 +18,7 @@ find_xpath_attr, get_meta_content, orderedSet, + PagedList, parse_duration, sanitize_filename, shell_quote, @@ -200,5 +201,26 @@ def test_parse_duration(self): self.assertEqual(parse_duration('9:12:43'), 33163) self.assertEqual(parse_duration('x:y'), None) + def test_paged_list(self): + def testPL(size, pagesize, sliceargs, expected): + def get_page(pagenum): + firstid = pagenum * pagesize + upto = min(size, pagenum * pagesize + pagesize) + for i in range(firstid, upto): + yield i + + pl = PagedList(get_page, pagesize) + got = pl.getslice(*sliceargs) + self.assertEqual(got, expected) + + testPL(5, 2, (), [0, 1, 2, 3, 4]) + testPL(5, 2, (1,), [1, 2, 3, 4]) + testPL(5, 2, (2,), [2, 3, 4]) + testPL(5, 2, (4,), [4]) + testPL(5, 2, (0, 3), [0, 1, 2]) + testPL(5, 2, (1, 4), [1, 2, 3]) + testPL(5, 2, (2, 99), [2, 3, 4]) + testPL(5, 2, (20, 99), []) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a0ab89b3d6..2ad6f10286 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -39,6 +39,7 @@ locked_file, make_HTTPS_handler, MaxDownloadsReached, + PagedList, PostProcessingError, platform_name, preferredencoding, @@ -575,19 +576,27 @@ def make_result(embedded_info): playlist_results = [] - n_all_entries = len(ie_result['entries']) playliststart = self.params.get('playliststart', 1) - 1 playlistend = self.params.get('playlistend', None) # For backwards compatibility, interpret -1 as whole list if playlistend == -1: playlistend = None - entries = ie_result['entries'][playliststart:playlistend] - n_entries = len(entries) - - self.to_screen( - "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" % - (ie_result['extractor'], playlist, n_all_entries, n_entries)) + if isinstance(ie_result['entries'], list): + n_all_entries = len(ie_result['entries']) + entries = ie_result['entries'][playliststart:playlistend] + n_entries = len(entries) + self.to_screen( + "[%s] playlist %s: Collected %d video ids (downloading %d of them)" % + (ie_result['extractor'], playlist, n_all_entries, n_entries)) + else: + assert isinstance(ie_result['entries'], PagedList) + entries = ie_result['entries'].getslice( + playliststart, playlistend) + n_entries = len(entries) + self.to_screen( + "[%s] playlist %s: Downloading %d videos" % + (ie_result['extractor'], playlist, n_entries)) for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries)) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 248b30ffb3..dd1a58f3fc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,7 @@ get_element_by_id, get_element_by_attribute, ExtractorError, + PagedList, RegexNotFoundError, unescapeHTML, unified_strdate, @@ -1580,44 +1581,35 @@ def _real_extract(self, url): # page by page until there are no video ids - it means we got # all of them. - url_results = [] - - for pagenum in itertools.count(0): + def download_page(pagenum): start_index = pagenum * self._GDATA_PAGE_SIZE + 1 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) - page = self._download_webpage(gdata_url, username, - u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) + page = self._download_webpage( + gdata_url, username, + u'Downloading video ids from %d to %d' % ( + start_index, start_index + self._GDATA_PAGE_SIZE)) try: response = json.loads(page) except ValueError as err: raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) if 'entry' not in response['feed']: - # Number of videos is a multiple of self._MAX_RESULTS - break + return # Extract video identifiers entries = response['feed']['entry'] for entry in entries: title = entry['title']['$t'] video_id = entry['id']['$t'].split('/')[-1] - url_results.append({ + yield { '_type': 'url', 'url': video_id, 'ie_key': 'Youtube', 'id': 'video_id', 'title': title, - }) - - # A little optimization - if current page is not - # "full", ie. does not contain PAGE_SIZE video ids then - # we can assume that this page is the last one - there - # are no more ids on further pages - no need to query - # again. - - if len(entries) < self._GDATA_PAGE_SIZE: - break + } + url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) return self.playlist_result(url_results, playlist_title=username) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 73fe1ad0a3..ff124d9e8c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -6,6 +6,7 @@ import email.utils import errno import gzip +import itertools import io import json import locale @@ -1161,3 +1162,46 @@ def check_executable(exe, args=[]): except OSError: return False return exe + + +class PagedList(object): + def __init__(self, pagefunc, pagesize): + self._pagefunc = pagefunc + self._pagesize = pagesize + + def getslice(self, start=0, end=None): + res = [] + for pagenum in itertools.count(start // self._pagesize): + firstid = pagenum * self._pagesize + nextfirstid = pagenum * self._pagesize + self._pagesize + if start >= nextfirstid: + continue + + page_results = list(self._pagefunc(pagenum)) + + startv = ( + start % self._pagesize + if firstid <= start < nextfirstid + else 0) + + endv = ( + ((end - 1) % self._pagesize) + 1 + if (end is not None and firstid <= end <= nextfirstid) + else None) + + if startv != 0 or endv is not None: + page_results = page_results[startv:endv] + res.extend(page_results) + + # A little optimization - if current page is not "full", ie. does + # not contain page_size videos then we can assume that this page + # is the last one - there are no more ids on further pages - + # i.e. no need to query again. + if len(page_results) + startv < self._pagesize: + break + + # If we got the whole page, but the next page is not interesting, + # break out early as well + if end == nextfirstid: + break + return res