Current approach I have going for enabling both 'import x' and 'from x import y' dependency bundling. One drawback of this current implementation is that it creates copies of the methods in each module that uses them, in contrast to the code's origin, where each usage is just a reference to the same method in memory (though I have conflicting results here — see the section after the code).
/// analysis_script.py /// (dependencies excluded for brevity)
import test_module
from third_level_module import z
def f():
    """Exercise both import styles: call test_module.g four times, then z."""
    for _ in range(1, 5):
        test_module.g('blah string used by g')
    z()
/// driver.py ///
# Driver: serialize analysis_script together with its discovered
# dependencies into functions.pickle (see modutil).
import modutil
import analysis_script

modutil.serialize_module_with_dependencies(analysis_script)
/// modutil.py ///
import sys
import modulefinder
import os
import inspect
import marshal
def dump_module(funcfile, name, module):
    """Marshal-dump `name` followed by the code object of every function
    visible as an attribute of `module` into the open file `funcfile`.

    NOTE(review): inspect.getmembers(module) returns functions *imported*
    into the module as well as those defined in it, so the same function
    (e.g. z in the example output) is dumped once per module that
    references it -- this is the duplication drawback described above.
    """
    # Every (name, function) pair reachable as a module attribute.
    functions_list = [o for o in inspect.getmembers(module) if inspect.isfunction(o[1])]
    print 'module name:' + name
    # The name string is written before the code objects; the reader
    # relies on this ordering to delimit modules in the stream.
    marshal.dump(name, funcfile)
    for func in functions_list:
        print func
        # marshal can serialize code objects but not function objects, so
        # defaults, closures and function attributes are lost here.
        marshal.dump(func[1].func_code, funcfile)
def serialize_module_with_dependencies(module):
    """Serialize `module` plus every non-stdlib module it imports.

    Writes 'functions.pickle' as a stream of alternating module-name
    strings and function code objects (see dump_module) for the given
    module and each dependency found by modulefinder.
    """
    # PYTHONPATH may legitimately be unset; don't raise KeyError.
    python_path_env = os.environ.get('PYTHONPATH', '')
    python_path = python_path_env.split(os.pathsep) if python_path_env else []
    module_path = os.path.dirname(module.__file__)
    # Search for modules only on the python path and under the module's own
    # directory; standard libraries should be expected to be installed on
    # the target platform.
    # BUGFIX: ModuleFinder wants a flat list of directories; the previous
    # [python_path, module_path] nested the PYTHONPATH list inside it.
    search_dir = python_path + [module_path]
    mf = modulefinder.ModuleFinder(search_dir)
    # __file__ points at the .pyc after the first run; mf.run_script needs
    # the .py source. Use endswith rather than a substring replace so a
    # path that merely *contains* '.pyc' is not corrupted.
    src_file = module.__file__
    if src_file.endswith('.pyc'):
        src_file = src_file[:-1]
    mf.run_script(src_file)
    funcfile = open("functions.pickle", "wb")
    try:
        dump_module(funcfile, 'sandbox', module)
        for name, mod in mf.modules.iteritems():
            # The sys module is included by default but has no file and we
            # don't want it anyway (it should be on the remote system's
            # path). __main__ is also skipped: it is the virtually empty
            # driver that invoked this function.
            if name not in ('sys', '__main__'):
                dump_module(funcfile, name, sys.modules[name])
    finally:
        # Close the output even if a dump raises (was leaked before).
        funcfile.close()
/// sandbox_reader.py ///
import marshal
import types
import imp
# Rebuild the serialized module graph from functions.pickle and invoke f().
sandbox_module = imp.new_module('sandbox')
# name -> freshly created module object for each non-sandbox module.
# NOTE(review): entries are written below but never read afterwards, and
# these modules are never attached to sandbox_module -- the exec below
# binds the *real* imported module instead. Worth confirming this is the
# cause of the surprising addresses reported after the code.
dynamic_modules = {}
# Holds either the literal string "sandbox" or the module object whose
# functions are currently being loaded.
current_module = ''
with open("functions.pickle", "rb") as funcfile:
    # The stream alternates: a module-name string, then that module's
    # function code objects, until EOF (matches dump_module's layout).
    while True:
        try:
            code = marshal.load(funcfile)
        except EOFError:
            break
        if isinstance(code,types.StringType):
            # A string marks the start of a new module's section.
            print "module name:" + code
            if code == 'sandbox':
                current_module = "sandbox"
            else:
                current_module = imp.new_module(code)
                dynamic_modules[code] = current_module
                # Imports the real module by name into the sandbox
                # namespace so sandbox code can reference it.
                exec 'import '+code in sandbox_module.__dict__
        elif isinstance(code,types.CodeType):
            print "func"
            if current_module == "sandbox":
                # Rehydrate the code object as a function whose globals
                # are the sandbox module's namespace.
                func = types.FunctionType(code, sandbox_module.__dict__, code.co_name)
                setattr(sandbox_module, code.co_name, func)
            else:
                # Same, but bound into the reconstructed module's namespace.
                func = types.FunctionType(code, current_module.__dict__, code.co_name)
                setattr(current_module, code.co_name, func)
        else:
            raise Exception( "unknown type received")
# yaa! actually invoke the method
sandbox_module.f()
del sandbox_module
For instance, the function graph looks like this before serialization:
module name:sandbox
('f', <function f at 0x15e07d0>)
('z', <function z at 0x7f47d719ade8>)
module name:test_module
('g', <function g at 0x15e0758>)
('z', <function z at 0x7f47d719ade8>)
module name:third_level_module
('z', <function z at 0x7f47d719ade8>)
Specifically, looking at the function z we can see that all the references point to the same address, i.e. 0x7f47d719ade8.
On the remote process after sandbox reconstruction we have:
print sandbox_module.z
<function z at 0x1a071b8>
print sandbox_module.third_level_module.z
<function z at 0x1a072a8>
print sandbox_module.test_module.z
<function z at 0x1a072a8>
This blows my mind! I would have thought all addresses here would be unique after reconstruction but for some reason sandbox_module.test_module.z and sandbox_module.third_level_module.z have the same address?