From b2c1066734b59f69666a3cfc0f82ed9718685d4b Mon Sep 17 00:00:00 2001 From: James Taylor Date: Fri, 5 Mar 2021 23:12:41 -0800 Subject: proto_debug: improve recursive_pb function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1) Fix bug where it would decode as base64 but not indicate that in the structure 2) Use a list of patterns of youtube object ids so it will not base64 decode those 3) Detect the base64 type. If not possible, outputs base64? 4) Remove unnecessary try-except clause 5) Remove unused filt parameter Signed-off-by: Jesús --- youtube/proto_debug.py | 117 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 36 deletions(-) diff --git a/youtube/proto_debug.py b/youtube/proto_debug.py index 75c09a4..d793fe1 100644 --- a/youtube/proto_debug.py +++ b/youtube/proto_debug.py @@ -38,18 +38,18 @@ Example usage: The function recursive_pb will try to do dec/pb recursively automatically. It's a dumb function (so might try to dec or pb something that isn't really -base64 or protobuf) and it's a mess right now so disclaimer. +base64 or protobuf) so be careful. The function pp will pretty print the recursive structure: >>> pp(recursive_pb('4qmFsgJcEhhVQ1lPX2phYl9lc3VGUlY0YjE3QUp0QXcaQEVnWjJhV1JsYjNNWUF5QUFNQUU0QWVvREdFTm5Ua1JSVlVWVFEzZHBYM2gwTTBaeFRuRkZiRFZqUWclM0QlM0Q%3D')) -('base64', +('base64p', [ [2, 80226972, [ [2, 2, b'UCYO_jab_esuFRV4b17AJtAw'], [2, 3, - ('base64', + ('base64p', [ [2, 2, b'videos'], [0, 3, 3], @@ -57,7 +57,7 @@ The function pp will pretty print the recursive structure: [0, 6, 1], [0, 7, 1], [2, 61, - ('base64', + ('base64?', [ [2, 1, b'CAA'], [2, 2, @@ -76,14 +76,15 @@ The function pp will pretty print the recursive structure: ] ) -make_proto will take a recursive_pb structure and make a ctoken out of it: + - base64 means a base64 encode with equals sign paddings - base64s means a base64 encode without padding - base64p means a url base64 encode with equals signs replaced with %3D +- base64? means the base64 type cannot be inferred because of the length -recursive_pb cannot detect between base64 or base64p or base64s so -those must be manually specified if recreating the token. Will not have -make_proto(recursive_pb(x)) == x if x is using base64p or base64s +make_proto is the inverse function. It will take a recursive_pb structure and +make a ctoken out of it, so in general, +x == make_proto(recursive_pb(x)) There are some other functions I wrote while reverse engineering stuff that may or may not be useful. @@ -117,14 +118,12 @@ def varint_encode(offset): This encoding is used in youtube parameters to encode offsets and to encode the length for length-prefixed data. See https://developers.google.com/protocol-buffers/docs/encoding#varints for more info.''' - # (0).bit_length() returns 0, but we need 1 in that case. - needed_bytes = ceil(offset.bit_length()/7) or 1 + needed_bytes = ceil(offset.bit_length()/7) or 1 # (0).bit_length() returns 0, but we need 1 in that case. encoded_bytes = bytearray(needed_bytes) for i in range(0, needed_bytes - 1): encoded_bytes[i] = (offset & 127) | 128 # 7 least significant bits offset = offset >> 7 - # leave first bit as zero for last byte - encoded_bytes[-1] = offset & 127 + encoded_bytes[-1] = offset & 127 # leave first bit as zero for last byte return bytes(encoded_bytes) @@ -179,9 +178,7 @@ def read_varint(data): except IndexError: if i == 0: raise EOFError() - raise Exception( - 'Unterminated varint starting at ' + str(data.tell() - i) - ) + raise Exception('Unterminated varint starting at ' + str(data.tell() - i)) result |= (byte & 127) << 7*i if not byte & 128: break @@ -214,6 +211,7 @@ base64_enc_funcs = { 'base64': base64.urlsafe_b64encode, 'base64s': unpadded_b64encode, 'base64p': percent_b64encode, + 'base64?': base64.urlsafe_b64encode, } @@ -294,6 +292,22 @@ def b64_to_bytes(data): dec = b64_to_bytes +def get_b64_type(data): + '''return base64, base64s, base64p, or base64?''' + if isinstance(data, str): + data = data.encode('ascii') + if data.endswith(b'='): + return 'base64' + if data.endswith(b'%3D'): + return 'base64p' + # Length of data means it wouldn't have an equals sign, + # so we can't tell which type it is. + if len(data) % 4 == 0: + return 'base64?' + + return 'base64s' + + def enc(t): return base64.urlsafe_b64encode(t).decode('ascii') @@ -329,8 +343,7 @@ fromhex = bytes.fromhex def aligned_ascii(data): - return ' '.join(' ' + chr(n) if n in range( - 32, 128) else ' _' for n in data) + return ' '.join(' ' + chr(n) if n in range(32, 128) else ' _' for n in data) def parse_protobuf(data, mutable=False, spec=()): @@ -483,33 +496,63 @@ def dec32(data): return b32decode(data + "="*((8 - len(data)%8)%8)) -def recursive_pb(data, filt=True): - b64 = False - if isinstance(data, str) or all(i > 32 for i in data): - try: - if len(data) > 11 and data[0:2] != b'UC': - data = b64_to_bytes(data) - b64 = True +_patterns = [ + (b'UC', 24), # channel + (b'PL', 34), # playlist + (b'LL', 24), # liked videos playlist + (b'UU', 24), # user uploads playlist + (b'RD', 15), # radio mix + (b'RD', 43), # radio mix + (b'', 11), # video + (b'Ug', 26), # comment + (b'Ug', 49), # comment reply (of form parent_id.reply_id) + (b'9', 22), # comment reply id +] +def is_youtube_object_id(data): + try: + if isinstance(data, str): + data = data.encode('ascii') + except Exception: + return False + + for start_sequence, length in _patterns: + if len(data) == length and data.startswith(start_sequence): + return True + + return False + + +def recursive_pb(data): + try: + # check if this fits the basic requirements for base64 + if isinstance(data, str) or all(i > 32 for i in data): + if len(data) > 11 and not is_youtube_object_id(data): + raw_data = b64_to_bytes(data) + b64_type = get_b64_type(data) + + rpb = recursive_pb(raw_data) + if rpb == raw_data: + # could not interpret as protobuf, probably not b64 + return data + return (b64_type, rpb) else: return data - except Exception as e: - return data + except Exception as e: + return data try: result = pb(data, mutable=True) except Exception as e: return data + for tuple in result: if tuple[0] == 2: - try: - tuple[2] = recursive_pb(tuple[2]) - except Exception: - pass - if b64: - return ('base64', result) + tuple[2] = recursive_pb(tuple[2]) + return result + def indent_lines(lines, indent): return re.sub(r'^', ' '*indent, lines, flags=re.MULTILINE) @@ -524,13 +567,15 @@ def _pp(obj, indent): # not my best work + ')') elif isinstance(obj, list): # [wire_type, field_number, data] - if (len(obj) == 3 and not any( - isinstance(x, (list, tuple)) for x in obj)): + if (len(obj) == 3 + and not any(isinstance(x, (list, tuple)) for x in obj) + ): return obj.__repr__() # [wire_type, field_number, [...]] - elif (len(obj) == 3 and not any( - isinstance(x, (list, tuple)) for x in obj[0:2])): + elif (len(obj) == 3 + and not any(isinstance(x, (list, tuple)) for x in obj[0:2]) + ): return ('[' + obj[0].__repr__() + ', ' + obj[1].__repr__() + ',\n' + indent_lines(_pp(obj[2], indent), indent) + '\n' + ']') -- cgit v1.2.3