From d92f5d5a9005a7a6df7bc081f64d662c70a3f3cf Mon Sep 17 00:00:00 2001
From: coletdjnz <colethedj@protonmail.com>
Date: Wed, 7 Apr 2021 11:37:43 +0000
Subject: [PATCH] [youtube] Extract comments' approximate timestamp (#221)

Authored by: colethedj
---
 yt_dlp/extractor/youtube.py | 41 +++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 016750a70..c3d06b967 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -2,6 +2,7 @@
 
 from __future__ import unicode_literals
 
+import calendar
 import hashlib
 import itertools
 import json
@@ -27,6 +28,7 @@
     bool_or_none,
     clean_html,
     dict_get,
+    datetime_from_str,
     ExtractorError,
     format_field,
     float_or_none,
@@ -46,7 +48,7 @@
     update_url_query,
     url_or_none,
     urlencode_postdata,
-    urljoin,
+    urljoin
 )
 
 
@@ -1499,6 +1501,16 @@ def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
             (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
              regex), webpage, name, default='{}'), video_id, fatal=False)
 
+    @staticmethod
+    def parse_time_text(time_text):
+        """
+        Approximate a comment's publish datetime from relative time text such as 'X units ago (edited)'.
+        Returns None when time_text is empty/None or has fewer than three whitespace-separated words.
+        """
+        time_text_split = (time_text or '').split()  # None-safe; split() also tolerates repeated spaces
+        if len(time_text_split) >= 3:
+            return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
+
     @staticmethod
     def _join_text_entries(runs):
         text = None
@@ -1521,7 +1533,7 @@ def _extract_comment(self, comment_renderer, parent=None):
         text = self._join_text_entries(comment_text_runs) or ''
         comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
         time_text = self._join_text_entries(comment_time_text)
-
+        timestamp = calendar.timegm(self.parse_time_text(time_text).timetuple())
         author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
         author_id = try_get(comment_renderer,
                             lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
@@ -1532,11 +1544,10 @@ def _extract_comment(self, comment_renderer, parent=None):
 
         author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
         is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool)
-
         return {
             'id': comment_id,
             'text': text,
-            # TODO: This should be parsed to timestamp
+            'timestamp': timestamp,
             'time_text': time_text,
             'like_count': votes,
             'is_favorited': is_liked,
@@ -1624,12 +1635,12 @@ def extract_thread(parent_renderer):
                     comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
                     if page_num == 0:
                         if first_continuation:
-                            note_prefix = "Downloading initial comment continuation page"
+                            note_prefix = 'Downloading initial comment continuation page'
                         else:
-                            note_prefix = "    Downloading comment reply thread %d %s" % (comment_counts[2], comment_prog_str)
+                            note_prefix = '    Downloading comment reply thread %d %s' % (comment_counts[2], comment_prog_str)
                     else:
-                        note_prefix = "%sDownloading comment%s page %d %s" % (
-                            "       " if parent else "",
+                        note_prefix = '%sDownloading comment%s page %d %s' % (
+                            '       ' if parent else '',
                             ' replies' if parent else '',
                             page_num,
                             comment_prog_str)
@@ -1644,13 +1655,13 @@ def extract_thread(parent_renderer):
                 except ExtractorError as e:
                     if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413):
                         if e.cause.code == 413:
-                            self.report_warning("Assumed end of comments (received HTTP Error 413)")
+                            self.report_warning('Assumed end of comments (received HTTP Error 413)')
                             return
                         # Downloading page may result in intermittent 5xx HTTP error
                         # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                         last_error = 'HTTP Error %s' % e.cause.code
                         if e.cause.code == 404:
-                            last_error = last_error + " (this API is probably deprecated)"
+                            last_error = last_error + ' (this API is probably deprecated)'
                         if count < retries:
                             continue
                     raise
@@ -1668,7 +1679,7 @@ def extract_thread(parent_renderer):
 
                     # YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth)
                     if browse.get('reload'):
-                        raise ExtractorError("Invalid or missing params in continuation request", expected=False)
+                        raise ExtractorError('Invalid or missing params in continuation request', expected=False)
 
                     # TODO: not tested, merged from old extractor
                     err_msg = browse.get('externalErrorMessage')
@@ -1708,7 +1719,7 @@ def extract_thread(parent_renderer):
 
                     if expected_comment_count:
                         comment_counts[1] = str_to_int(expected_comment_count)
-                        self.to_screen("Downloading ~%d comments" % str_to_int(expected_comment_count))
+                        self.to_screen('Downloading ~%d comments' % str_to_int(expected_comment_count))
                         yield comment_counts[1]
 
                     # TODO: cli arg.
@@ -1724,7 +1735,7 @@ def extract_thread(parent_renderer):
                         continuation = YoutubeTabIE._build_continuation_query(
                             continuation=sort_continuation_renderer.get('continuation'),
                             ctp=sort_continuation_renderer.get('clickTrackingParams'))
-                        self.to_screen("Sorting comments by %s" % ('popular' if comment_sort_index == 0 else 'newest'))
+                        self.to_screen('Sorting comments by %s' % ('popular' if comment_sort_index == 0 else 'newest'))
                         break
 
                 for entry in known_continuation_renderers[key](continuation_renderer):
@@ -1757,7 +1768,7 @@ def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token):
                         continue
                     comments.append(comment)
                 break
-        self.to_screen("Downloaded %d/%d comments" % (len(comments), estimated_total))
+        self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
         return {
             'comments': comments,
             'comment_count': len(comments),
@@ -2979,7 +2990,7 @@ def extract_entries(parent_renderer):  # this needs to called again for continua
                     self.report_warning('%s. Retrying ...' % last_error)
                 try:
                     response = self._call_api(
-                        ep="browse", fatal=True, headers=headers,
+                        ep='browse', fatal=True, headers=headers,
                         video_id='%s page %s' % (item_id, page_num),
                         query={
                             'continuation': continuation['continuation'],