I am trying to understand what's the size difference between a numpy masked array and a normal array with nans.
import numpy as np
g = np.random.random((5000,5000))
indx = np.random.randint(0,4999,(500,2))
mask = np.full((5000,5000),False,dtype=bool)
mask[indx] = True
g_mask = np.ma.array(g,mask=mask)
I used the following answer to compute the size of the object:
import sys
from types import ModuleType, FunctionType
from gc import get_referents
# Custom objects know their class.
# Function objects seem to know way too much, including modules.
# Exclude modules as well.
BLACKLIST = type, ModuleType, FunctionType
def getsize(obj):
"""sum size of object & members."""
if isinstance(obj, BLACKLIST):
raise TypeError('getsize() does not take argument of type: '+ str(type(obj)))
seen_ids = set()
size = 0
objects = [obj]
while objects:
need_referents = []
for obj in objects:
if not isinstance(obj, BLACKLIST) and id(obj) not in seen_ids:
seen_ids.add(id(obj))
size += sys.getsizeof(obj)
need_referents.append(obj)
objects = get_referents(*need_referents)
return size
That gives me the following result:
getsize(g)
>>>200000112
getsize(g_mask)
>>>25000924
Why the unmasked array is bigger compared to the masked array? How can I estimate the real size of the masked array vs the unmasked array?