[YoutubeDL] Fix format selection with filters (Closes #10083)

2025-01-29 17:47:53 +00:00 · 2016-07-16 00:55:43 +07:00 · 2016-07-16 00:55:43 +07:00 · 317f7ab634
commit 317f7ab634
parent 23495d6a39
2 changed files with 81 additions and 16 deletions
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@ -335,6 +335,40 @@ def format_info(f_id):
            downloaded = ydl.downloaded_info_dicts[0]
            self.assertEqual(downloaded['format_id'], f1['format_id'])

+    def test_audio_only_extractor_format_selection(self):
+        # For extractors with incomplete formats (all formats are audio-only or
+        # video-only) best and worst should fallback to corresponding best/worst
+        # video-only or audio-only formats (as per
+        # https://github.com/rg3/youtube-dl/pull/5556)
+        formats = [
+            {'format_id': 'low', 'ext': 'mp3', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL},
+            {'format_id': 'high', 'ext': 'mp3', 'preference': 2, 'vcodec': 'none', 'url': TEST_URL},
+        ]
+        info_dict = _make_result(formats)
+
+        ydl = YDL({'format': 'best'})
+        ydl.process_ie_result(info_dict.copy())
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'high')
+
+        ydl = YDL({'format': 'worst'})
+        ydl.process_ie_result(info_dict.copy())
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'low')
+
+    def test_format_not_available(self):
+        formats = [
+            {'format_id': 'regular', 'ext': 'mp4', 'height': 360, 'url': TEST_URL},
+            {'format_id': 'video', 'ext': 'mp4', 'height': 720, 'acodec': 'none', 'url': TEST_URL},
+        ]
+        info_dict = _make_result(formats)
+
+        # This must fail since complete video-audio format does not match filter
+        # and extractor does not provide incomplete only formats (i.e. only
+        # video-only or audio-only).
+        ydl = YDL({'format': 'best[height>360]'})
+        self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
+
    def test_invalid_format_specs(self):
        def assert_syntax_error(format_spec):
            ydl = YDL({'format': format_spec})
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -5,6 +5,7 @@

 import collections
 import contextlib
+import copy
 import datetime
 import errno
 import fileinput
@ -1051,9 +1052,9 @@ def _build_selector_function(selector):
            if isinstance(selector, list):
                fs = [_build_selector_function(s) for s in selector]

-                def selector_function(formats):
+                def selector_function(ctx):
                    for f in fs:
-                        for format in f(formats):
+                        for format in f(ctx):
                            yield format
                return selector_function
            elif selector.type == GROUP:
@ -1061,17 +1062,17 @@ def selector_function(formats):
            elif selector.type == PICKFIRST:
                fs = [_build_selector_function(s) for s in selector.selector]

-                def selector_function(formats):
+                def selector_function(ctx):
                    for f in fs:
-                        picked_formats = list(f(formats))
+                        picked_formats = list(f(ctx))
                        if picked_formats:
                            return picked_formats
                    return []
            elif selector.type == SINGLE:
                format_spec = selector.selector

-                def selector_function(formats):
-                    formats = list(formats)
+                def selector_function(ctx):
+                    formats = list(ctx['formats'])
                    if not formats:
                        return
                    if format_spec == 'all':
@ -1084,9 +1085,10 @@ def selector_function(formats):
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            yield audiovideo_formats[format_idx]
-                        # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
-                        elif (all(f.get('acodec') != 'none' for f in formats) or
-                              all(f.get('vcodec') != 'none' for f in formats)):
+                        # for extractors with incomplete formats (audio only (soundcloud)
+                        # or video only (imgur)) we will fallback to best/worst
+                        # {video,audio}-only format
+                        elif ctx['incomplete_formats']:
                            yield formats[format_idx]
                    elif format_spec == 'bestaudio':
                        audio_formats = [
@ -1160,17 +1162,18 @@ def _merge(formats_info):
                    }
                video_selector, audio_selector = map(_build_selector_function, selector.selector)

-                def selector_function(formats):
-                    formats = list(formats)
-                    for pair in itertools.product(video_selector(formats), audio_selector(formats)):
+                def selector_function(ctx):
+                    for pair in itertools.product(
+                            video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
                        yield _merge(pair)

            filters = [self._build_format_filter(f) for f in selector.filters]

-            def final_selector(formats):
+            def final_selector(ctx):
+                ctx_copy = copy.deepcopy(ctx)
                for _filter in filters:
-                    formats = list(filter(_filter, formats))
-                return selector_function(formats)
+                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
+                return selector_function(ctx_copy)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
@ -1377,7 +1380,35 @@ def process_video_result(self, info_dict, download=True):
            req_format_list.append('best')
            req_format = '/'.join(req_format_list)
        format_selector = self.build_format_selector(req_format)
-        formats_to_download = list(format_selector(formats))
+
+        # While in format selection we may need to have an access to the original
+        # format set in order to calculate some metrics or do some processing.
+        # For now we need to be able to guess whether original formats provided
+        # by extractor are incomplete or not (i.e. whether extractor provides only
+        # video-only or audio-only formats) for proper formats selection for
+        # extractors with such incomplete formats (see
+        # https://github.com/rg3/youtube-dl/pull/5556).
+        # Since formats may be filtered during format selection and may not match
+        # the original formats the results may be incorrect. Thus original formats
+        # or pre-calculated metrics should be passed to format selection routines
+        # as well.
+        # We will pass a context object containing all necessary additional data
+        # instead of just formats.
+        # This fixes incorrect format selection issue (see
+        # https://github.com/rg3/youtube-dl/issues/10083).
+        incomplete_formats = all(
+            # All formats are video-only or
+            f.get('vcodec') != 'none' and f.get('acodec') == 'none' or
+            # all formats are audio-only
+            f.get('vcodec') == 'none' and f.get('acodec') != 'none'
+            for f in formats)
+
+        ctx = {
+            'formats': formats,
+            'incomplete_formats': incomplete_formats,
+        }
+
+        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)