3

I'm creating a tool that analyses code used within definition of user defined function, for example:

Test 1 ✅

def help_fun(a, b): return a * b
def test1(numbers):
    r1, r2 = np.multiply.reduce(numbers), sum(numbers)/len(numbers)
    r = math.sin(help_fun(r1, r2))
    return math.sqrt(r)

I understand, however, how to interpret results of dis.dis(test1):

3           0 LOAD_GLOBAL              0 (np)
            2 LOAD_ATTR                1 (multiply)
            4 LOAD_METHOD              2 (reduce)
            6 LOAD_FAST                0 (numbers)
            8 CALL_METHOD              1
           10 LOAD_GLOBAL              3 (sum)
           ...
5          46 LOAD_GLOBAL              5 (math)
           48 LOAD_METHOD              8 (sqrt)
           50 LOAD_FAST                3 (r)
           52 CALL_METHOD              1
           54 RETURN_VALUE

My expect output is:

{0: {'functions': ['sum', 'len', 'help_fun'], 
     'methods': ['np.multiply.reduce', 'math.sin', 'math.sqrt']}} #frame 0: scope of test1

In order to collect function and method names, I implement a wrapper for contents of stack of disassemler and use a specific way to extract these names from stack archive.

import types
import dis
def get_frames(code):
    '''given <function>.__code__ instance, iterate each code frame
    >>> [type(x) for x in get_frames(test1.__code__)]
    [code]
    '''
    yield code
    for c in code.co_consts:
        if isinstance(c, types.CodeType):
            yield from get_frames(c)
            break

def get_calls(instructions, stack):
    '''get called functions and methods in CALL_FUNCTION and CALL_METHOD opnames'''
    functions, methods = [], []
    for idx, instr in enumerate(instructions):
        if instr.opname == 'CALL_FUNCTION':
            functions.append(stack[idx - 1][- 1 - instr.arg])
        elif instr.opname == 'CALL_METHOD':
            methods.append(stack[idx - 1][- 1 - instr.arg])
    return {'functions': functions, 'methods': methods}

def get_stack(instructions):
    '''Wrapper for stack contents'''
    stack = []
    for n in instructions:
        if n.opname in ('LOAD_FAST', 'LOAD_GLOBAL', 'LOAD_CONST'):
            stack.append(n.argrepr) #global var
        elif n.opname in ('LOAD_METHOD', 'LOAD_ATTR'):
            stack[-1] = f'{stack[-1]}.{n.argrepr}'
        elif n.opname in ('CALL_FUNCTION', 'CALL_METHOD'):
            args = stack[-n.arg:]
            del stack[-n.arg:]
            stack[-1] = f'{stack[-1]}({", ".join(args)})'
        elif n.opname == 'BINARY_TRUE_DIVIDE':
            stack[-2:] = [' / '.join(stack[-2:])]
        elif n.opname == 'STORE_FAST':
            del stack[-1]
        elif n.opname == 'ROT_TWO':
            stack[-1], stack[-2] = stack[-2], stack[-1]
        elif n.opname == 'GET_ITER':
            stack[-1] = f'iter({stack[-1]})'
        yield stack.copy()

code = list(get_frames(test1.__code__))
out = dict()
for i, c in enumerate(code):
    instructions = dis.Bytecode(c)
    stack = list(get_stack(instructions))
    out[i] = get_calls(instructions, stack)
out
>>> {0: {'functions': ['sum', 'len', 'help_fun'], 'methods': ['np.multiply.reduce', 'math.sin', 'math.sqrt']}}

In my approach names of functions and methods are extracted from stack column of table:

|   line | opname             |   arg | argrepr   | stack                                                    |
|--------|--------------------|-------|-----------|----------------------------------------------------------|
|      3 | LOAD_GLOBAL        |     0 | np        | np                                                       |
|        | LOAD_ATTR          |     1 | multiply  | np.multiply                                              |
|        | LOAD_METHOD        |     2 | reduce    | np.multiply.reduce                                       |
|        | LOAD_FAST          |     0 | numbers   | np.multiply.reduce, numbers                              |
|        | CALL_METHOD        |     1 |           | np.multiply.reduce(numbers)                              |
|        | LOAD_GLOBAL        |     3 | sum       | np.multiply.reduce(numbers), sum                         |
|        | LOAD_FAST          |     0 | numbers   | np.multiply.reduce(numbers), sum, numbers                |
|        | CALL_FUNCTION      |     1 |           | np.multiply.reduce(numbers), sum(numbers)                |
|        | LOAD_GLOBAL        |     4 | len       | np.multiply.reduce(numbers), sum(numbers), len           |
|        | LOAD_FAST          |     0 | numbers   | np.multiply.reduce(numbers), sum(numbers), len, numbers  |
|        | CALL_FUNCTION      |     1 |           | np.multiply.reduce(numbers), sum(numbers), len(numbers)  |
|        | BINARY_TRUE_DIVIDE |       |           | np.multiply.reduce(numbers), sum(numbers) / len(numbers) |
|        | ROT_TWO            |       |           | sum(numbers) / len(numbers), np.multiply.reduce(numbers) |
|        | STORE_FAST         |     1 | r1        | sum(numbers) / len(numbers)                              |
|        | STORE_FAST         |     2 | r2        |                                                          |
|      4 | LOAD_GLOBAL        |     5 | math      | math                                                     |
|        | LOAD_METHOD        |     6 | sin       | math.sin                                                 |
|        | LOAD_GLOBAL        |     7 | help_fun  | math.sin, help_fun                                       |
|        | LOAD_FAST          |     1 | r1        | math.sin, help_fun, r1                                   |
|        | LOAD_FAST          |     2 | r2        | math.sin, help_fun, r1, r2                               |
|        | CALL_FUNCTION      |     2 |           | math.sin, help_fun(r1, r2)                               |
|        | CALL_METHOD        |     1 |           | math.sin(help_fun(r1, r2))                               |
|        | STORE_FAST         |     3 | r         |                                                          |
|      5 | LOAD_GLOBAL        |     5 | math      | math                                                     |
|        | LOAD_METHOD        |     8 | sqrt      | math.sqrt                                                |
|        | LOAD_FAST          |     3 | r         | math.sqrt, r                                             |
|        | CALL_METHOD        |     1 |           | math.sqrt(r)                                             |
|        | RETURN_VALUE       |       |           | math.sqrt(r)                                             |

However, things get more complicated if other kind of opnames in my instructions are included. For instance, I'm not sure about behaviour of stack if there are any list comprehensions used. Getting names of methods and functions might display incorrect results:

Test 2 ❌

(<listcomp> is not a function but my wrapper things it is)

def test2(x): 
    return [[math.sqrt(m) for m in list(n)] for n in x]

Expected output:

{0: {'functions': [], 'methods': []}, #frame 0: scope of test2
 1: {'functions': ['list'], 'methods': [], #frame 1: scope of <listcomp>
 2: {'functions': [], 'methods': ['math.sqrt']}} #frame 2: scope of <listcomp>

Output of current code:

{0: {'functions': ["'test6.<locals>.<listcomp>'"], 'methods': []},
 1: {'functions': ['list', "'test6.<locals>.<listcomp>.<listcomp>'"], 'methods': []}, 
 2: {'functions': [], 'methods': ['math.sqrt']}}

Test 3 ❌

It might also crash sometimes because I am not sure what's happening with stack:

def test3(x,y):
    return [pow(i,j) for i,j in zip(x,y)]

Expected output:

{0: {'functions': ['zip'], 'methods': []},
 1: {'functions': ['pow'], 'methods': []}}

It crashes after second STORE_FAST command tries to pop item from empty stack. The instructions of second scope looks like:

|   line | opname          |   arg | argrepr   | stack   |
|--------|-----------------|-------|-----------|---------|
|    104 | BUILD_LIST      |     0 |           |         |
|        | LOAD_FAST       |     0 | .0        | .0      |
|        | FOR_ITER        |    18 | to 24     | .0      |
|        | UNPACK_SEQUENCE |     2 |           | .0      |
|        | STORE_FAST      |     1 | i         |         |
|        | STORE_FAST      |     2 | j         | ???     | ### stuck here
|        | LOAD_GLOBAL     |     0 | pow       | ???     |
|        | LOAD_FAST       |     1 | i         | ???     |
|        | LOAD_FAST       |     2 | j         | ???     |
|        | CALL_FUNCTION   |     2 |           | ???     |
|        | LIST_APPEND     |     2 |           | ???     |
|        | JUMP_ABSOLUTE   |     4 |           | ???     |
|        | RETURN_VALUE    |       |           | ???     |

Is there any easier way to get names of methods and functions used inside a caller? Are there any better ways to get archive of stack? I know, my implementation of get_stack is poor at the moment, I'm looking for a different approach or better documentation of stack control.

Remarks

  • I can add more tests and wrapper corrections if needed
  • Attributes (or methods) of class instances also involves, e.g. ls.append
mathfux
  • 5,759
  • 1
  • 14
  • 34
  • What output are you expecting for test2? – Hack5 Dec 12 '22 at 11:26
  • 1
    And what output would you expect for `a = math.sqrt; a(9)`? – Hack5 Dec 12 '22 at 11:46
  • @Hack5 Nice catch. For any `test` I'm running `list(get_frames(test.__code__))` on my script for further processing of `test` (as it is shown in my code). If I try to assign `a = math.sqrt` instead of function definition, `a` becomes `builtin_function_or_method`, not a `function`. It does not have attribute `__code__`. Hence I do not expect to further process global variables like `a = math.sqrt`. Updated a question. – mathfux Dec 12 '22 at 14:40
  • Could you clarify your question? Your approach to viewing the stack seems valid, but this looks like a potential XY problem. What are you actually trying to do? You may not need to do any disassembly at all. And what is wrong with your existing code? If you filter out unwanted invocations of listcomps, it works perfectly on my machine. – Hack5 Dec 12 '22 at 15:47
  • At the moment I'm trying to find names of methods and functions used within scope of a caller. I'm convinced that viewing stack items solves it and share my efforts. Disassembling won't necessarily be the only way for solving it. If someone finds a different way, I'll accept that answer too. – mathfux Dec 12 '22 at 18:09
  • Ultimately, the user can access any functions or methods they want, for example by using `globals()["function"]`, `__import__`, or the `eval` function. Also, your code worked for me when I ran it, so I need to know what you actually want to change. – Hack5 Dec 12 '22 at 18:10
  • It's hard to extend to all possible cases. If you know how to use these commands to capture all functions and methods used locally within scope of any user defined function, please share it. Note that I added `test3` on my question and some remarks – mathfux Dec 12 '22 at 18:32

1 Answers1

2

If you want your code to be more portable across Python versions, you should consider using ast. Although the AST does change across versions, it usually does so in ways that cause easy-to-understand errors.

To fix your code, you will need to implement a lot more operations. I did this by ignoring most things and just applying their stack effect (the number of elements placed or removed from the stack).

import dis
import inspect


def iterate_stack(instructions):
    stack = []
    called = []
    for instruction in instructions:
        old_stack_len = len(stack)
        if instruction.opname == "ROT_TWO":
            stack[-1], stack[-2] = stack[-2], stack[-1]
        elif instruction.opname == "ROT_THREE":
            stack[-1], stack[-2], stack[-3] = stack[-2], stack[-3], stack[-1]
        elif instruction.opname == "ROT_FOUR":
            stack[-1], stack[-2], stack[-3], stack[-4] = stack[-2], stack[-3], stack[-4], stack[-1]
        elif instruction.opname == "DUP_TOP":
            stack.append(stack[-1])
        elif instruction.opname == "DUP_TOP_TWO":
            stack.extend(stack[-2:])
        elif instruction.opname == "LOAD_ASSERTION_ERROR":
            stack.append("AssertionError")
        elif instruction.opname == "LOAD_NAME":
            stack.append(instruction.argrepr)
        elif instruction.opname == "LOAD_ATTR" or instruction.opname == "LOAD_METHOD":
            if stack[-1]:
                stack[-1] = stack[-1] + "." + instruction.argrepr
        elif instruction.opname == "LOAD_GLOBAL":
            stack.append(instruction.argrepr)
        elif instruction.opname == "LOAD_FAST" or instruction.opname == "LOAD_CLOSURE" or instruction.opname == "LOAD_DEREF" or instruction.opname == "LOAD_CLASSDEREF":
            if inspect.iscode(instruction.argval):
                stack.append(None)
            else:
                stack.append(instruction.argrepr)
        elif instruction.opname == "CALL_FUNCTION":
            args = stack[-instruction.arg:]
            del stack[-instruction.arg:]
            if stack[-1] is not None:
                called.append(f'{stack[-1]}({", ".join(args)})')
                stack.pop()
        elif instruction.opname == "CALL_FUNCTION_KW":
            # TODO get the names of keyword arguments
            called.append(stack[-1 - instruction.arg])
            del stack[-1 - instruction.arg:]
            stack.append(None)
        elif instruction.opname == "CALL_FUNCTION_EX":
            # TODO get the arguments
            if instruction.arg & 0x01:
                stack.pop()
            stack.pop()
            called.append(stack.pop())
        elif instruction.opname == "CALL_METHOD":
            # TODO get the arguments
            called.append(stack[-2 - instruction.arg])
            del stack[-2 - instruction.arg:]
            stack.append(None)
        elif instruction.opname == "ROT_N":
            tos = stack.pop()
            stack.insert(1 - instruction.arg, tos)
        stack_effect = dis.stack_effect(instruction.opcode, instruction.arg)
        while old_stack_len + stack_effect < len(stack):
            stack.pop()
        while old_stack_len + stack_effect > len(stack):
            stack.append(None)
    return called


def get_frames(code):
    yield code
    for c in code.co_consts:
        if inspect.iscode(c):
            yield from get_frames(c)


def help_fun(a, b):
    return a * b


def test1(numbers):
    r1, r2 = np.multiply.reduce(numbers), sum(numbers)/len(numbers)
    r = math.sin(help_fun(r1, r2))
    return math.sqrt(r)


def test2(x):
    return [[math.sqrt(m) for m in list(n)] for n in x]

def test3(x,y):
    return [pow(i,j) for i,j in zip(x,y)]




def main():
    code = list(get_frames(test1.__code__))
    out = dict()
    for i, c in enumerate(code):
        instructions = dis.get_instructions(c)
        out[i] = iterate_stack(instructions)
    print(out)
main()

This gives the expected result for all three of your examples, under Python 3.10.8 (main, Nov 14 2022, 00:00:00) [GCC 12.2.1 20220819 (Red Hat 12.2.1-2)] on linux. It will not work on Python 3.11 because bytecode changes every version.

I would really recommend against using this code, because it will break all the time. You should probably just build a pylint checker, or at least use a library like astroid.

Hack5
  • 3,244
  • 17
  • 37