diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 5f874b72f..3fd5cadfd 100755
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -242,6 +242,18 @@ def htmlentity_transform(matchobj):
return (u'&%s;' % entity)
+def clean_html(html):
+ """Clean an HTML snippet into a readable string"""
+ # Newline vs <br />: flatten source newlines to spaces first, then turn each <br> tag (optional whitespace and "/", lowercase only) into a real newline
+ html = html.replace('\n', ' ')
+ html = re.sub('<\s*br\s*/?\s*>', '\n', html)
+ # Strip html tags: remove every remaining "<...>" span (non-greedy, so each tag is dropped on its own and the text between tags is kept)
+ html = re.sub('<.*?>', '', html)
+ # Replace html entities: each "&...;" match is handed to htmlentity_transform, whose return value replaces it
+ html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
+ return html
+
+
def sanitize_title(utitle):
"""Sanitizes a video title so it could be used as part of a filename."""
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
@@ -3343,8 +3355,6 @@ def report_config_download(self, showName):
self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
def _real_extract(self, url):
- htmlParser = HTMLParser.HTMLParser()
-
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3360,11 +3370,11 @@ def _real_extract(self, url):
return
descMatch = re.search('