There are several ways to manipulate index sets of x1
and x2
in order to arrive at the mappings you need. A use case could be summarised as follows:
def find_mappings(sidx1, sidx2):
'''Input: two set of indices that sorts arrays x1 and x2
Output: both maps x1->x2 and x2->x1'''
x1_to_x2 = np.empty(len(x1), dtype=x1.dtype)
x2_to_x1 = np.empty(len(x1), dtype=x1.dtype)
x1_to_x2[sidx2] = sidx1
x2_to_x1[sidx1] = sidx2
return x1_to_x2, x2_to_x1
I will review the most common ways to pass two index sets sidx1
and sidx2
Way 1 using np.lexsort
as sorting indices:
sidx1 = np.lexsort(x1.T[::-1])
sidx2 = np.lexsort(x2.T[::-1])
find_mappings(sidx1, sidx2)
>>> (array([1, 3, 0, 2, 4]), array([2, 0, 3, 1, 4]))
Way 2 using np.argsort
of reduced dimensions as sorting indices:
x1_dimreduce = np.ravel_multi_index(x1.T, np.max(x1, axis=0)+1)
x2_dimreduce = np.ravel_multi_index(x2.T, np.max(x2, axis=0)+1)
sidx1 = np.argsort(x1_dimreduce)
sidx2 = np.argsort(x2_dimreduce)
find_mappings(sidx1, sidx2)
>>> (array([1, 3, 0, 2, 4]), array([2, 0, 3, 1, 4]))
Another variation of way 2 (reducing dimensions by hands works faster and could be optimised further in numba
)
x1_dimreduce = x1[:,0] * (np.max(x1[:,1])+1) + x1[:,1]
x2_dimreduce = x2[:,0] * (np.max(x2[:,1])+1) + x2[:,1]
Way 3 using np.searchsorted
in order to find x1_to_x2
and then find inverse mapping x2_to_x1
automatically (credits to @yatu as well):
x1_view = x1.view(dtype=np.dtype([('', x1.dtype), ('', x1.dtype)])).ravel()
x2_view = x2.view(dtype=np.dtype([('', x2.dtype), ('', x2.dtype)])).ravel()
argidx = np.argsort(x1_view)
sidx = np.searchsorted(x1_view, x2_view, sorter=argidx)
sidx1 = argidx[sidx]
sidx2 = np.arange(len(sidx))
find_mappings(sidx1, sidx2)
>>> (array([1, 3, 0, 2, 4]), array([2, 0, 3, 1, 4]))
Another variation of way 3 (simplification of way 3 using sidx
and argidx
arguments instead of sidx1
and sidx2
)
def find_mappings2(sidx, argidx):
'''reduces some excessive commands met in `find_mapping`'''
x1_to_x2 = argidx[sidx]
x2_to_x1 = np.empty(len(x1), dtype=x1.dtype)
x2_to_x1[x1_to_x2] = np.arange(len(sidx))
return x1_to_x2, x2_to_x1
x1_view = x1.view(dtype=np.dtype([('', x1.dtype), ('', x1.dtype)])).ravel()
x2_view = x2.view(dtype=np.dtype([('', x2.dtype), ('', x2.dtype)])).ravel()
argidx = np.argsort(x1_view)
sidx = np.searchsorted(x1_view, x2_view, sorter=argidx)
np.array_equal(find_mapping(sidx1, sidx2), find_mapping2(sidx, argidx))
>>> True
Note that every of these ways could be minimized in case any of arrays x1
and x2
preserves order.
Use case for better understanding
x1_to_x2, x2_to_x1 = find_mapping(sidx1, sidx2)
>>> np.array_equal(x1[x1_to_x2], x2)
True
>>> np.array_equal(x2[x2_to_x1], x1)
True
Runtime
Variant of Way 2 appears to be a winner. Moreover, it's open for further optimizations:
import benchit
%matplotlib inline
benchit.setparams(rep=5)
sizes = [3, 10, 30, 100, 300, 900, 3000, 9000, 30000, 90000, 300000, 900000]
def _generate_x1_x2(N):
'''return two randomly ordered arrays x1 and x2 of length N'''
arr = np.transpose(np.unravel_index(np.arange(N), (int(N**0.5)+1, int(N**0.5)+1)))
rng = np.random.default_rng()
return arr[rng.permutation(N)], arr[rng.permutation(N)]
def _find_mappings(sidx1, sidx2):
x1_to_x2, x2_to_x1 = np.empty(len(sidx1), dtype=sidx1.dtype), np.empty(len(sidx2), dtype=sidx2.dtype)
x1_to_x2[sidx2] = sidx1
x2_to_x1[sidx1] = sidx2
return x1_to_x2, x2_to_x1
def _find_mappings2(sidx, argidx):
x1_to_x2 = argidx[sidx]
x2_to_x1 = np.empty(len(x1_to_x2), dtype=x1_to_x2.dtype)
x2_to_x1[x1_to_x2] = np.arange(len(sidx))
return x1_to_x2, x2_to_x1
def lexsort_map(x1, x2):
sidx1 = np.lexsort(x1.T[::-1])
sidx2 = np.lexsort(x2.T[::-1])
return _find_mappings(sidx1, sidx2)
def argsort_map(x1, x2):
x1_dimreduce = np.ravel_multi_index(x1.T, np.max(x1, axis=0)+1)
x2_dimreduce = np.ravel_multi_index(x2.T, np.max(x2, axis=0)+1)
sidx1 = np.argsort(x1_dimreduce)
sidx2 = np.argsort(x2_dimreduce)
return _find_mappings(sidx1, sidx2)
def argsort_map_v2(x1, x2):
x1_dimreduce = x1[:,0] * (np.max(x1[:,1])+1) + x1[:,1]
x2_dimreduce = x2[:,0] * (np.max(x2[:,1])+1) + x2[:,1]
sidx1 = np.argsort(x1_dimreduce)
sidx2 = np.argsort(x2_dimreduce)
return _find_mappings(sidx1, sidx2)
def searchsort_map(x1, x2):
x1_view = x1.view(dtype=np.dtype([('', x1.dtype), ('', x1.dtype)])).ravel()
x2_view = x2.view(dtype=np.dtype([('', x2.dtype), ('', x2.dtype)])).ravel()
argidx = np.argsort(x1_view)
sidx = np.searchsorted(x1_view, x2_view, sorter=argidx)
sidx1 = argidx[sidx]
sidx2 = np.arange(len(sidx))
return _find_mappings(sidx1, sidx2)
def searchsort_map_v2(x1, x2):
x1_view = x1.view(dtype=np.dtype([('', x1.dtype), ('', x1.dtype)])).ravel()
x2_view = x2.view(dtype=np.dtype([('', x2.dtype), ('', x2.dtype)])).ravel()
argidx = np.argsort(x1_view)
sidx = np.searchsorted(x1_view, x2_view, sorter=argidx)
return _find_mappings2(sidx, argidx)
fns = [lexsort_map, argsort_map, argsort_map_v2, searchsort_map, searchsort_map_v2]
in_ = {s: generate_x1_x2(s) for s in sizes}
t = benchit.timings(fns, in_, multivar=True, input_name='Length of x1 and x2')
t.plot(logx=True, figsize=(12, 6), fontsize=14)

Update on 02/25/2022
There is a nice module numpy_indexed
by @EelcoHoogendoorn (pip install numpy-indexed
) written on top of numpy
that extends to more advanced actions of indexing, grouping and set manipulations. You might like also to check out numpy_indexed.indices
, Line 115 method:
numpy_indices as npi
x1_to_x2 = npi.indices(x1, x2)
x2_to_x1 = np.empty_like(x1_to_x2)
x2_to_x1[x1_to_x2] = np.arange(len(x2_to_x1))