While learning to create a hashmap using Numpy and Python 3, I came up with the following code which uses a Numpy structured array data
.
However, the time it takes to select a value from a key is quite slow, as shown in the timeit
runs comparing 13.3 secs for Numpy structured array data
with 0.008 secs for Python dictionary d
.
val = data[data['keys'] == key]['values'][0]
Is there a faster way to get the item for a particular key?
import numpy as np
import timeit
N = 1000*1000
keyArr = np.random.randint(0, 1000*1000*1000*4, N)
valArr = np.random.rand(N)
key = keyArr[0] # Select an existing key value
# Numpy structured array
data = np.empty(keyArr.shape[0], dtype=[('keys', keyArr.dtype), ('values', valArr.dtype)])
data['keys'] = keyArr
data['values'] = valArr
val = data[data['keys'] == key]['values'][0]
print(key, '=>', val) # 558520981 => 0.17948995177905835
print( timeit.Timer("data[data['keys'] == key]['values'][0]",
globals=globals()).timeit(10*1000) , 'secs' ) # 13.256318201000001 secs
# Python built-in dictionary
d = {}
for k, v in zip(keyArr, valArr):
d[k] = v
print(key, '=>', d[key]) # 558520981 => 0.17948995177905835
print( timeit.Timer("d[key]",
globals=globals()).timeit(10*1000) , 'secs' ) # 0.0008061910000000116 secs
Numpy Lookup Method 1: 13.3 secs
val = data[data['keys'] == key]['values'][0]
Numpy Lookup Method 2: 13.4 secs
val = data['values'][np.where(data['keys'] == key)][0]
pandas.Series
: 6.8 secs
import pandas as pd
# Pandas Series
s = pd.Series(valArr, index=keyArr, dtype=valArr.dtype)
val = s[key]
print(key, '=>', val)
print( timeit.Timer("s[key]",
globals=globals()).timeit(10*1000) , 'secs' ) # 6.782590246000002 secs