Tweaks and bugfixes

LifeActor · May 13, 2022 · commit f0966a2
1 parent e5eb915

Showing 11 changed files with 77 additions and 68 deletions.
diff --git a/cykdl/__main__.py b/cykdl/__main__.py
@@ -336,11 +336,12 @@ def main(argv=None):
             video = input('YKDL> ').strip()
         except KeyboardInterrupt:
              sys.exit()
+        if not video:
+            continue
+        if video == 'exit':
+            sys.exit()
         try:
-            if video:
-                if video == 'exit':
-                    sys.exit()
-                handle_video(video)
+            handle_video(video)
         except KeyboardInterrupt:
             logger.warning('\nInterrupted by Ctrl-C, press Ctrl-C again to exit YKDL.')
         except Exception as e:

diff --git a/ykdl/extractors/douban/music.py b/ykdl/extractors/douban/music.py
@@ -42,8 +42,8 @@ def prepare(self):
             return info
 
     def list_only(self):
-        return 'site.douban' in self.url and not match(self.url, 's=(\d+)') or \
-                match(self.url, 'sid=\d+,(\d)')
+        return 'site.douban' in self.url and not match(self.url, 's=\d+') or \
+                match(self.url, 'sid=\d+,\d')
 
     def prepare_list(self):
 

diff --git a/ykdl/extractors/generalsimple.py b/ykdl/extractors/generalsimple.py
@@ -7,7 +7,7 @@
 # TODO: subtitles support
 # REF: https://developer.mozilla.org/zh-CN/docs/Web/API/WebVTT_API
 
-pattern1 = r'''(?ix)
+pattern_ext = r'''(?ix)
 ["'](
     (?:https?:|\\?/)[^"'#]+?\.
     (
@@ -22,13 +22,15 @@
     /?(?:\?.+?)?
 )["'#]
 '''
-pattern2 = r'''(?ix)
-<(?:video|audio|source)[^>]+?src=["'](
-    (?:https?:|\\?/)[^"']+
-)["']
-[^>]+?(?:type=["'](
-    (?:video|audio)/[^"']+
-)["'])?
+pattern_src = r'''(?ix)
+<(?:video|audio|source)[^>]+?
+src=["']((?:https?:|\\?/)[^"']+)["']
+[^>]+?
+(?:
+    type=["']((?:video|audio|application)/[^"']+)["']
+    |
+    [^>](?!type)*>
+)
 '''
 
 class GeneralSimple(Extractor):
@@ -44,11 +46,9 @@ def prepare(self):
 
         ext = ctype = None
         for i in range(2):
-            _ = match(html, pattern1)
-            url, ext = _ and _ or (_, _)
+            url, ctype = matchm(html, pattern_src)
             if url is None:
-                _ = match(html, pattern2)
-                url, ctype = _ and _ or (_, _)
+                url, ext = matchm(html, pattern_ext)
             if url:
                 if not i:
                     url = unescape(url)
@@ -63,11 +63,12 @@ def prepare(self):
                 url = self.url[:self.url.find('/')] + url
             elif url[0] == '/':
                 url = self.url[:self.url.find('/', 9)] + url
-            if ext is None:
+            if ext is None or ctype:
+                ctype = str(ctype).lower()
                 ext = contentTypes.get(ctype) or url_info(url)[1] or (
-                        str(ctype).lower().startswith('audio') and 'mp3' or 'mp4')
+                            ctype.startswith('audio') and 'mp3' or 'mp4')
             if ext[:3] == 'm3u':
-                info.streams = load_m3u8_playlist(url)
+                info.streams = load_m3u8_playlist(url, headers={'Referer': self.url})
             else:
                 info.streams['current'] = {
                     'container': ext,

diff --git a/ykdl/extractors/iqilu.py b/ykdl/extractors/iqilu.py
@@ -12,7 +12,7 @@ def init(self):
         self.url_pattern = '"mp4-wrapper"[^"]+"(http[^"]+)"'
 
     def l_assert(self):
-        assert match(self.url, '(https?://v\.iqilu\.com/\w+)')
+        assert match(self.url, 'https?://v\.iqilu\.com/\w+')
 
     def reprocess(self):
         self.info.title = '[{self.info.artist}] {self.info.title}'.format(**vars())

diff --git a/ykdl/extractors/iqiyi/video.py b/ykdl/extractors/iqiyi/video.py
@@ -115,8 +115,8 @@ def prepare(self):
         info = MediaInfo(self.name)
 
         if self.url and not self.vid:
-            vid = match(self.url, 'curid=([^_]+)_([\w]+)')
-            if vid:
+            vid = matchm(self.url, 'curid=([^_]+)_([\w]+)')
+            if vid[0]:
                 self.vid = vid
                 try:
                     info_json = get_response(

diff --git a/ykdl/extractors/ku6.py b/ykdl/extractors/ku6.py
@@ -12,7 +12,7 @@ def init(self):
         pass
 
     def list_only(self):
-        return match(self.url, 'https://www.ku6.com/detail/(\d+)')
+        return match(self.url, 'https://www.ku6.com/detail/\d+')
 
     def prepare_list(self):
         html = get_content(self.url)

diff --git a/ykdl/extractors/le/__init__.py b/ykdl/extractors/le/__init__.py
@@ -11,7 +11,7 @@ def get_extractor(url):
 
     if 'lunbo' in url:
         from . import lunbo as s
-    elif match(url, '(live[\./]|/izt/)'):
+    elif match(url, 'live[\./]|/izt/'):
         from . import live as s
     elif 'bcloud' in url:
         from . import letvcloud as s

diff --git a/ykdl/extractors/lizhi.py b/ykdl/extractors/lizhi.py
@@ -10,11 +10,11 @@ def prepare(self):
         info = MediaInfo(self.name)
 
         html = get_content(self.url)
-        self.vid, info.artist, _, info.title = \
-                    match(html, 'data-hidden-ph\s?=\s?"(.+?)" '
-                                'data-user-name\s?=\s?"(.+?)" '
-                                'data-radio-name\s?=\s?"(.+?)" '
-                                'data-title\s?=\s?"(.+?)"')
+        self.vid, info.artist, _, info.title = matchm(html,
+                'data-hidden-ph\s?=\s?"(.+?)" '
+                'data-user-name\s?=\s?"(.+?)" '
+                'data-radio-name\s?=\s?"(.+?)" '
+                'data-title\s?=\s?"(.+?)"')
         data = get_response('https://www.lizhi.fm/hidden_ph/' +
                             self.vid).json()
         assert data['rcode'] == 0, data['msg']

diff --git a/ykdl/util/http.py b/ykdl/util/http.py
@@ -549,7 +549,7 @@ def unbrotli(data):
 
 def get_response(url, headers={}, data=None, params=None, method='GET',
                       max_redirections=None, encoding=None,
-                      default_headers=fake_headers, cache=True):
+                      default_headers=fake_headers, cache=CACHED):
     '''Fetch the response of giving URL.
 
     Params: both `params` and `data` always use "UTF-8" as encoding.

diff --git a/ykdl/util/human.py b/ykdl/util/human.py
@@ -8,7 +8,7 @@ def _format_str(s):
         s = s.decode()
     s = s.lower()
     n = match1(s, '^((?:0x)?[0-9a-f])$')  #
-    if n and match(n, '([a-fx])'):        # only convert which is unambiguous
+    if n and match(n, '[a-fx]'):        # only convert which is unambiguous
         return str(int(n, 16))
     return s
 
@@ -33,22 +33,22 @@ def human_size(n, unit=None):
     if isinstance(n, (str, bytes)):
         n = _format_str(n)
         try:
-            n, nu = match(n, '''(?x)
-                             (?:
-                                 ^          |  # start
-                                 \De        |  # no scientific notation
-                                 [^\-\.\de]    # no negative, dot, number
-                             )
-                             (
-                                 \d+           # integer
-                                 (?:\.\d+)?    # float
-                                 (?!\.)        # bad float
-                                 (?:e\d+)?     # scientific notation
-                                 (?![\.\de])   # bad scientific notation
-                             )
-                             \s*
-                             (?:([kmgt])i?b)?  # unit
-                             ''')
+            n, nu = matchm(n, '''(?x)
+                              (?:
+                                  ^          |  # start
+                                  \De        |  # no scientific notation
+                                  [^\-\.\de]    # no negative, dot, number
+                              )
+                              (
+                                  \d+           # integer
+                                  (?:\.\d+)?    # float
+                                  (?!\.)        # bad float
+                                  (?:e\d+)?     # scientific notation
+                                  (?![\.\de])   # bad scientific notation
+                              )
+                              \s*
+                              (?:([kmgt])i?b)?  # unit
+                              ''')
         except TypeError:
             raise ValueError('invalid literal for human_size(): %r' % n)
         f = float(n)

diff --git a/ykdl/util/match.py b/ykdl/util/match.py
@@ -1,7 +1,7 @@
 import re
 
 
-__all__ = ['match', 'match1', 'matchall']
+__all__ = ['match', 'match1', 'matchm', 'matchall']
 
 def _format_str(pattern, string):
     '''Format the target which will be scanned, makes the worker happy.'''
@@ -33,49 +33,56 @@ def _format_str(pattern, string):
     return string
 
 def match(obj, *patterns):
-    '''Scans a object for matched some patterns with catch mode (matches first).
+    '''Scans a object for matched some patterns with capture mode (matches first).
 
     Params:
         `obj`, any object which contains string data.
         `patterns`, arbitrary number of regex patterns.
 
-    Returns all the catched substring of first match, or None.
+    Returns the first Match object, or None.
     '''
-
     for pattern in patterns:
         string = _format_str(pattern, obj)
-        match = re.search(pattern, string)
-        groups = match and match.groups()
-        if groups:
-            return groups
+        m = re.search(pattern, string)
+        if m:
+            return m
     return None
 
 def match1(obj, *patterns):
-    '''Scans a object for matched some patterns with catch mode (catches first).
+    '''Scans a object for matched some patterns with capture mode.
 
     Params: same as match()
 
-    Returns the first catched substring, or None.
+    Returns the first captured substring, or None.
     '''
+    m = match(obj, *patterns)
+    return m and m.groups()[0]
+
+def matchm(obj, *patterns):
+    '''Scans a object for matched some patterns with capture mode.
+
+    Params: same as match()
 
-    groups = match(obj, *patterns)
-    return groups and groups[0]
+    Returns all captured substrings of the first Match object, or same number of
+    None objects.
+    '''
+    m = match(obj, *patterns)
+    return m and m.groups() or (None,) * re.compile(patterns[0]).groups
 
 
 def matchall(obj, *patterns):
-    '''Scans a object for matched some patterns with catch mode (matches all).
+    '''Scans a object for matched some patterns with capture mode.
 
     Params: same as match()
 
-    Returns a list of all the catched substring of matches, or a empty list.
-    If a conformity form of catches in the list has be excepted, all the regex
-    patterns MUST include a similar catch mode.
+    Returns a list of all the captured substring of matches, or a empty list.
+    If a conformity form of captures in the list has be excepted, all the regex
+    patterns MUST include a similar capture mode.
     '''
-
     ret = []
     for pattern in patterns:
         string = _format_str(pattern, obj)
-        match = re.findall(pattern, string)
-        ret += match
+        m = re.findall(pattern, string)
+        ret += m
 
     return ret