diff options
author | James Taylor <user234683@users.noreply.github.com> | 2018-07-11 01:31:44 -0700 |
---|---|---|
committer | James Taylor <user234683@users.noreply.github.com> | 2018-07-11 01:31:44 -0700 |
commit | aea11c407f7be904037183da152c4cdf561cd65e (patch) | |
tree | 5fad52326feee68c3771becee44fdf24ceebca05 /youtube_dl/jsinterp.py | |
parent | 2b308ec9f0fd1ba296db7bcdcea9f93aebf6eaa1 (diff) | |
download | yt-local-aea11c407f7be904037183da152c4cdf561cd65e.tar.lz yt-local-aea11c407f7be904037183da152c4cdf561cd65e.tar.xz yt-local-aea11c407f7be904037183da152c4cdf561cd65e.zip |
track custom youtube-dl distribution
Diffstat (limited to 'youtube_dl/jsinterp.py')
-rw-r--r-- | youtube_dl/jsinterp.py | 262 |
1 files changed, 262 insertions, 0 deletions
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py new file mode 100644 index 0000000..7bda596 --- /dev/null +++ b/youtube_dl/jsinterp.py @@ -0,0 +1,262 @@ +from __future__ import unicode_literals + +import json +import operator +import re + +from .utils import ( + ExtractorError, + remove_quotes, +) + +_OPERATORS = [ + ('|', operator.or_), + ('^', operator.xor), + ('&', operator.and_), + ('>>', operator.rshift), + ('<<', operator.lshift), + ('-', operator.sub), + ('+', operator.add), + ('%', operator.mod), + ('/', operator.truediv), + ('*', operator.mul), +] +_ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS] +_ASSIGN_OPERATORS.append(('=', lambda cur, right: right)) + +_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' + + +class JSInterpreter(object): + def __init__(self, code, objects=None): + if objects is None: + objects = {} + self.code = code + self._functions = {} + self._objects = objects + + def interpret_statement(self, stmt, local_vars, allow_recursion=100): + if allow_recursion < 0: + raise ExtractorError('Recursion limit reached') + + should_abort = False + stmt = stmt.lstrip() + stmt_m = re.match(r'var\s', stmt) + if stmt_m: + expr = stmt[len(stmt_m.group(0)):] + else: + return_m = re.match(r'return(?:\s+|$)', stmt) + if return_m: + expr = stmt[len(return_m.group(0)):] + should_abort = True + else: + # Try interpreting it as an expression + expr = stmt + + v = self.interpret_expression(expr, local_vars, allow_recursion) + return v, should_abort + + def interpret_expression(self, expr, local_vars, allow_recursion): + expr = expr.strip() + if expr == '': # Empty expression + return None + + if expr.startswith('('): + parens_count = 0 + for m in re.finditer(r'[()]', expr): + if m.group(0) == '(': + parens_count += 1 + else: + parens_count -= 1 + if parens_count == 0: + sub_expr = expr[1:m.start()] + sub_result = self.interpret_expression( + sub_expr, local_vars, allow_recursion) + remaining_expr = expr[m.end():].strip() + if not remaining_expr: + return sub_result + else: + expr = json.dumps(sub_result) + remaining_expr + break + else: + raise ExtractorError('Premature end of parens in %r' % expr) + + for op, opfunc in _ASSIGN_OPERATORS: + m = re.match(r'''(?x) + (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])? + \s*%s + (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr) + if not m: + continue + right_val = self.interpret_expression( + m.group('expr'), local_vars, allow_recursion - 1) + + if m.groupdict().get('index'): + lvar = local_vars[m.group('out')] + idx = self.interpret_expression( + m.group('index'), local_vars, allow_recursion) + assert isinstance(idx, int) + cur = lvar[idx] + val = opfunc(cur, right_val) + lvar[idx] = val + return val + else: + cur = local_vars.get(m.group('out')) + val = opfunc(cur, right_val) + local_vars[m.group('out')] = val + return val + + if expr.isdigit(): + return int(expr) + + var_m = re.match( + r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE, + expr) + if var_m: + return local_vars[var_m.group('name')] + + try: + return json.loads(expr) + except ValueError: + pass + + m = re.match( + r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) + if m: + val = local_vars[m.group('in')] + idx = self.interpret_expression( + m.group('idx'), local_vars, allow_recursion - 1) + return val[idx] + + m = re.match( + r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE, + expr) + if m: + variable = m.group('var') + member = remove_quotes(m.group('member') or m.group('member2')) + arg_str = m.group('args') + + if variable in local_vars: + obj = local_vars[variable] + else: + if variable not in self._objects: + self._objects[variable] = self.extract_object(variable) + obj = self._objects[variable] + + if arg_str is None: + # Member access + if member == 'length': + return len(obj) + return obj[member] + + assert expr.endswith(')') + # Function call + if arg_str == '': + argvals = tuple() + else: + argvals = tuple([ + self.interpret_expression(v, local_vars, allow_recursion) + for v in arg_str.split(',')]) + + if member == 'split': + assert argvals == ('',) + return list(obj) + if member == 'join': + assert len(argvals) == 1 + return argvals[0].join(obj) + if member == 'reverse': + assert len(argvals) == 0 + obj.reverse() + return obj + if member == 'slice': + assert len(argvals) == 1 + return obj[argvals[0]:] + if member == 'splice': + assert isinstance(obj, list) + index, howMany = argvals + res = [] + for i in range(index, min(index + howMany, len(obj))): + res.append(obj.pop(index)) + return res + + return obj[member](argvals) + + for op, opfunc in _OPERATORS: + m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr) + if not m: + continue + x, abort = self.interpret_statement( + m.group('x'), local_vars, allow_recursion - 1) + if abort: + raise ExtractorError( + 'Premature left-side return of %s in %r' % (op, expr)) + y, abort = self.interpret_statement( + m.group('y'), local_vars, allow_recursion - 1) + if abort: + raise ExtractorError( + 'Premature right-side return of %s in %r' % (op, expr)) + return opfunc(x, y) + + m = re.match( + r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) + if m: + fname = m.group('func') + argvals = tuple([ + int(v) if v.isdigit() else local_vars[v] + for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple() + if fname not in self._functions: + self._functions[fname] = self.extract_function(fname) + return self._functions[fname](argvals) + + raise ExtractorError('Unsupported JS expression %r' % expr) + + def extract_object(self, objname): + _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' + obj = {} + obj_m = re.search( + r'''(?x) + (?<!this\.)%s\s*=\s*{\s* + (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*) + }\s*; + ''' % (re.escape(objname), _FUNC_NAME_RE), + self.code) + fields = obj_m.group('fields') + # Currently, it only supports function definitions + fields_m = re.finditer( + r'''(?x) + (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)} + ''' % _FUNC_NAME_RE, + fields) + for f in fields_m: + argnames = f.group('args').split(',') + obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code')) + + return obj + + def extract_function(self, funcname): + func_m = re.search( + r'''(?x) + (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s* + \((?P<args>[^)]*)\)\s* + \{(?P<code>[^}]+)\}''' % ( + re.escape(funcname), re.escape(funcname), re.escape(funcname)), + self.code) + if func_m is None: + raise ExtractorError('Could not find JS function %r' % funcname) + argnames = func_m.group('args').split(',') + + return self.build_function(argnames, func_m.group('code')) + + def call_function(self, funcname, *args): + f = self.extract_function(funcname) + return f(args) + + def build_function(self, argnames, code): + def resf(args): + local_vars = dict(zip(argnames, args)) + for stmt in code.split(';'): + res, abort = self.interpret_statement(stmt, local_vars) + if abort: + break + return res + return resf |