I have a problem where I have to do the following calculation. I wanted to avoid the loop version, so I vectorized it. Why is the loop version actually fast than the vectorized version? Does anybody have an explanation for this.
thx
import numpy as np
from numpy.core.umath_tests import inner1d
num_vertices = 40000
num_pca_dims = 1000
num_vert_coords = 3
a = np.arange(num_vert_coords * num_vertices * num_pca_dims).reshape((num_pca_dims, num_vertices*num_vert_coords)).T
#n-by-3
norms = np.arange(num_vertices * num_vert_coords).reshape(num_vertices,-1)
#Loop version
def slowversion(a,norms):
res_list = []
for c_idx in range(a.shape[1]):
curr_col = a[:,c_idx].reshape(-1,3)
res = inner1d(curr_col, norms)
res_list.append(res)
res_list_conc = np.column_stack(res_list)
return res_list_conc
#Fast version
def fastversion(a,norms):
a_3 = a.reshape(num_vertices, 3, num_pca_dims)
fast_res = np.sum(a_3 * norms[:,:,None], axis=1)
return fast_res
res_list_conc = slowversion(a,norms)
fast_res = fastversion(a,norms)
assert np.all(res_list_conc == fast_res)