Implement --add-header without modifying std_headers

Closes #2526, #1614
2025-02-20 12:09:46 +00:00 · 2022-01-29 03:25:35 +05:30 · 2022-01-29 03:25:35 +05:30 · 8b7539d27c
commit 8b7539d27c
parent e48b3875ec
10 changed files with 28 additions and 29 deletions
--- a/README.md
+++ b/README.md
@ -737,9 +737,6 @@ ## Workarounds:
    --prefer-insecure                Use an unencrypted connection to retrieve
                                     information about the video (Currently
                                     supported only for YouTube)
-    --user-agent UA                  Specify a custom user agent
-    --referer URL                    Specify a custom referer, use if the video
-                                     access is restricted to one domain
    --add-header FIELD:VALUE         Specify a custom HTTP header and its value,
                                     separated by a colon ":". You can use this
                                     option multiple times
@ -1866,6 +1863,8 @@ #### Redundant options
    --reject-title REGEX             --match-filter "title !~= (?i)REGEX"
    --min-views COUNT                --match-filter "view_count >=? COUNT"
    --max-views COUNT                --match-filter "view_count <=? COUNT"
+    --user-agent UA                  --add-header "User-Agent:UA"
+    --referer URL                    --add-header "Referer:URL"


 #### Not recommended
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -83,6 +83,7 @@
    make_dir,
    make_HTTPS_handler,
    MaxDownloadsReached,
+    merge_headers,
    network_exceptions,
    number_of_digits,
    orderedSet,
@ -332,6 +333,7 @@ class YoutubeDL(object):
    nocheckcertificate:  Do not verify SSL certificates
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
+    http_headers:      A dictionary of custom headers to be used for all requests
    proxy:             URL of the proxy server to use
    geo_verification_proxy:  URL of the proxy to use for IP address verification
                       on geo-restricted sites.
@ -647,6 +649,9 @@ def check_deprecated(param, option, suggestion):
            else self.params['format'] if callable(self.params['format'])
            else self.build_format_selector(self.params['format']))

+        # Set http_headers defaults according to std_headers
+        self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))
+
        self._setup_opener()

        if auto_init:
@ -2250,8 +2255,7 @@ def restore_last_token(self):
        return _build_selector_function(parsed_selector)

    def _calc_headers(self, info_dict):
-        res = std_headers.copy()
-        res.update(info_dict.get('http_headers') or {})
+        res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {})

        cookies = self._calc_cookies(info_dict)
        if cookies:
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@ -41,6 +41,7 @@
    SameFileError,
    setproctitle,
    std_headers,
+    traverse_obj,
    write_string,
 )
 from .update import run_update
@ -75,20 +76,15 @@ def _real_main(argv=None):
    parser, opts, args = parseOpts(argv)
    warnings, deprecation_warnings = [], []

-    # Set user agent
    if opts.user_agent is not None:
-        std_headers['User-Agent'] = opts.user_agent
-
-    # Set referer
+        opts.headers.setdefault('User-Agent', opts.user_agent)
    if opts.referer is not None:
-        std_headers['Referer'] = opts.referer
-
-    # Custom HTTP headers
-    std_headers.update(opts.headers)
+        opts.headers.setdefault('Referer', opts.referer)

    # Dump user agent
    if opts.dump_user_agent:
-        write_string(std_headers['User-Agent'] + '\n', out=sys.stdout)
+        ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent'])
+        write_string(f'{ua}\n', out=sys.stdout)
        sys.exit(0)

    # Batch file verification
@ -767,6 +763,7 @@ def report_deprecation(val, old, new=None):
        'legacyserverconnect': opts.legacy_server_connect,
        'nocheckcertificate': opts.no_check_certificate,
        'prefer_insecure': opts.prefer_insecure,
+        'http_headers': opts.headers,
        'proxy': opts.proxy,
        'socket_timeout': opts.socket_timeout,
        'bidi_workaround': opts.bidi_workaround,
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@ -17,7 +17,6 @@
    get_element_by_attribute,
    int_or_none,
    lowercase_escape,
-    std_headers,
    str_or_none,
    str_to_int,
    traverse_obj,
@ -503,7 +502,7 @@ def _extract_graphql(self, data, url):
                    '%s' % rhx_gis,
                    '',
                    '%s:%s' % (rhx_gis, csrf_token),
-                    '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
+                    '%s:%s:%s' % (rhx_gis, csrf_token, self.get_param('http_headers')['User-Agent']),
                ]

            # try all of the ways to generate a GIS query, and not only use the
--- a/yt_dlp/extractor/mildom.py
+++ b/yt_dlp/extractor/mildom.py
@ -8,7 +8,6 @@

 from .common import InfoExtractor
 from ..utils import (
-    std_headers,
    update_url_query,
    random_uuidv4,
    try_get,
@ -70,7 +69,7 @@ def _fetch_dispatcher_config(self):
                        'clu': '',
                        'wh': '1919*810',
                        'rtm': self.iso_timestamp(),
-                        'ua': std_headers['User-Agent'],
+                        'ua': self.get_param('http_headers')['User-Agent'],
                    }).encode('utf8')).decode('utf8').replace('\n', ''),
                }).encode('utf8'))
            self._DISPATCHER_CONFIG = self._parse_json(base64.b64decode(tmp['data']), 'initialization')
--- a/yt_dlp/extractor/openload.py
+++ b/yt_dlp/extractor/openload.py
@ -16,7 +16,6 @@
    ExtractorError,
    get_exe_version,
    is_outdated_version,
-    std_headers,
    Popen,
 )

@ -208,7 +207,7 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w

        replaces = self.options
        replaces['url'] = url
-        user_agent = headers.get('User-Agent') or std_headers['User-Agent']
+        user_agent = headers.get('User-Agent') or self.get_param('http_headers')['User-Agent']
        replaces['ua'] = user_agent.replace('"', '\\"')
        replaces['jscode'] = jscode

--- a/yt_dlp/extractor/rtve.py
+++ b/yt_dlp/extractor/rtve.py
@ -17,7 +17,6 @@
    qualities,
    remove_end,
    remove_start,
-    std_headers,
    try_get,
 )

@ -71,7 +70,7 @@ class RTVEALaCartaIE(InfoExtractor):
    }]

    def _real_initialize(self):
-        user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8')
+        user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode('utf-8')).decode('utf-8')
        self._manager = self._download_json(
            'http://www.rtve.es/odin/loki/' + user_agent_b64,
            None, 'Fetching manager info')['manager']
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@ -28,7 +28,6 @@
    parse_qs,
    sanitized_Request,
    smuggle_url,
-    std_headers,
    str_or_none,
    try_get,
    unified_timestamp,
@ -758,7 +757,7 @@ def _try_album_password(self, url):

    def _real_extract(self, url):
        url, data = unsmuggle_url(url, {})
-        headers = std_headers.copy()
+        headers = self.get_param('http_headers').copy()
        if 'http_headers' in data:
            headers.update(data['http_headers'])
        if 'Referer' not in headers:
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@ -860,17 +860,16 @@ def _dict_from_options_callback(
    workarounds.add_option(
        '--user-agent',
        metavar='UA', dest='user_agent',
-        help='Specify a custom user agent')
+        help=optparse.SUPPRESS_HELP)
    workarounds.add_option(
        '--referer',
        metavar='URL', dest='referer', default=None,
-        help='Specify a custom referer, use if the video access is restricted to one domain',
-    )
+        help=optparse.SUPPRESS_HELP)
    workarounds.add_option(
        '--add-header',
        metavar='FIELD:VALUE', dest='headers', default={}, type='str',
        action='callback', callback=_dict_from_options_callback,
-        callback_kwargs={'multiple_keys': False, 'process_key': None},
+        callback_kwargs={'multiple_keys': False},
        help='Specify a custom HTTP header and its value, separated by a colon ":". You can use this option multiple times',
    )
    workarounds.add_option(
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@ -1372,7 +1372,7 @@ def http_request(self, req):
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

-        for h, v in std_headers.items():
+        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
@ -5436,3 +5436,8 @@ def _cancel_all_tasks(loop):


 has_websockets = bool(compat_websockets)
+
+
+def merge_headers(*dicts):
+    """Merge dicts of HTTP headers case-insensitively into one dict with Title-Case keys (e.g. 'User-Agent'); later dicts take priority"""
+    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}