Here is an implementation with various approaches. Without resorting to C or C++, the fastest method seems to be datatable
:
def match(x, y, method = "dt"):
'''
x and y are two numpy 1d arrays containing only finite values.
method = 'dt': use datatable
method = 'pandas': use pandas
method = 'numpy': use numpy
method = 'dict': use hashing.
'''
if method == 'dt': # Use datatable
xdf = datatable.Frame({'val': x})
ydf = datatable.Frame({'val': y, 'ind': np.arange(y.shape[0]) })[
:, datatable.min(datatable.f.ind), datatable.by(datatable.f.val)]
ydf.key = 'val'
rst = xdf[:, :, datatable.join(ydf)]['ind'].to_numpy()
return rst.filled(-1 - y.shape[0]).ravel()
if method == 'pandas': # Use pandas dataframe.
xdf = pd.DataFrame({'val': x})
ydf = pd.DataFrame({'val': y, 'ind': np.arange(y.shape[0]) }).groupby(
['val']).min()
joined = xdf.join(ydf, on = 'val', lsuffix = '_x', rsuffix = '_y')
rst = joined['ind'].to_numpy()
rst[np.isnan(rst)] = -1 - y.shape[0]
return rst.astype(int)
rst = np.zeros(x.shape[0], dtype = np.int32) - (y.shape[0] + 1)
if method == 'numpy':
yorder = y.argsort()
ysorted = y[yorder]
ind = np.searchsorted(ysorted, x)
outofBound = ind >= y.shape[0]
ind[outofBound] = 0
eq = ysorted[ind] == x
eq[outofBound] = False
rst[eq] = yorder[ind][eq]
else: # Hashing.
D = dict(zip(y[::-1], np.arange(y.shape[0] - 1, -1, -1)))
for i, u in enumerate(x):
val = D.get(u)
if val is not None: rst[i] = val
return rst
Test code:
import datatable
import pandas
import time
import numpy as np
N = int(1e9)
k = int(1e7)
x = np.random.choice(N, k)
y = np.random.choice(N, k)
timeCosts = {}
st = time.time()
ind = match(x, y, "dt")
timeCosts['datatable'] = time.time() - st
np.all(x[ind >= 0] == y[ind[ind >= 0]])
st = time.time()
ind = match(x, y, "pandas")
timeCosts['pandas'] = time.time() - st
np.all(x[ind >= 0] == y[ind[ind >= 0]])
st = time.time()
ind = match(x, y, "numpy")
timeCosts['numpy'] = time.time() - st
np.all(x[ind >= 0] == y[ind[ind >= 0]])
st = time.time()
ind = match(x, y, "hashing")
timeCosts['hashing'] = time.time() - st
np.all(x[ind >= 0] == y[ind[ind >= 0]])
The time costs in seconds:
{'datatable': 1.55, 'pandas': 8.01, 'numpy': 14.91, 'hashing': 6.04}
But the fastest is still slower than R's match
: 1.05s
R must have used some hashing technique similar to that in radix sort..