From 81c2f20b5386d89a62dc27293654d75b77f47473 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 9 Feb 2014 17:56:10 +0100 Subject: [PATCH] [youtube] Correct invalid JSON (Fixes #2353) --- youtube_dl/extractor/common.py | 5 ++++- youtube_dl/extractor/youtube.py | 10 +++++----- youtube_dl/utils.py | 6 ++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2c0c75604..84fca8ba0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -271,8 +271,11 @@ def _download_xml(self, url_or_request, video_id, def _download_json(self, url_or_request, video_id, note=u'Downloading JSON metadata', - errnote=u'Unable to download JSON metadata'): + errnote=u'Unable to download JSON metadata', + transform_source=None): json_string = self._download_webpage(url_or_request, video_id, note, errnote) + if transform_source: + json_string = transform_source(json_string) try: return json.loads(json_string) except ValueError as ve: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e038c7752..18a92e101 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -34,6 +34,7 @@ unified_strdate, orderedSet, write_json_file, + uppercase_escape, ) class YoutubeBaseInfoExtractor(InfoExtractor): @@ -1590,11 +1591,10 @@ def _real_extract(self, url): # Download all channel pages using the json-based channel_ajax query for pagenum in itertools.count(1): url = self._MORE_PAGES_URL % (pagenum, channel_id) - page = self._download_webpage(url, channel_id, - u'Downloading page #%s' % pagenum) - - page = json.loads(page) - + page = self._download_json( + url, channel_id, note=u'Downloading page #%s' % pagenum, + transform_source=uppercase_escape) + ids_in_page = self.extract_videos_from_page(page['content_html']) video_ids.extend(ids_in_page) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 01c8c017d..fa8f80e02 
# Added to youtube_dl/utils.py by this patch (hunk placed after getslice()).
def uppercase_escape(s):
    """Expand eight-digit uppercase-U escapes in *s* into real characters.

    An escape here is a backslash, a capital ``U`` and exactly eight
    hexadecimal digits (either case).  JSON only defines the lowercase,
    four-digit form, so a payload containing the eight-digit form is
    rejected by ``json.loads``; this function is meant to be applied to
    the raw text first (e.g. as a ``transform_source`` callback).

    Text that does not match the pattern -- including lowercase-u escapes
    and escapes with fewer than eight hex digits -- is returned unchanged.

    Args:
        s: The text to clean up.

    Returns:
        A copy of *s* with every matching escape replaced by the character
        it names.

    Raises:
        UnicodeDecodeError: (a ``ValueError`` subclass) if an escape names
            a code point beyond U+10FFFF.
    """
    # Function-scope import: the module's import block is not part of the
    # visible source, so the dependency is brought in locally.
    import codecs

    # Decode through the codec instead of chr()/unichr(): on narrow
    # Python 2 builds unichr() raises ValueError for code points above
    # U+FFFF, whereas the codec emits the matching surrogate pair.
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)