I'm a bit late to the party but here is an approach that leverages numpy
speed at only moderate algorithmic cost: O(n log n). It turns out that for many sizes of the input arrays runtime is dominated by the cost of type conversions. See benchmarks below:
from timeit import timeit
import numpy as np
def make_inputs(N, ncol):
global arr1, arr2, list1, list2, lot1, lot2
# create making sure there are no duplicates *within* arr1 or arr2
all_ = np.array(list(set(map(tuple, np.random.randint(0, 2 * N, (N + N//2, ncol))))))
# create input of various data types
arr1 = all_[np.random.choice(len(all_), N, False)]
arr2 = all_[np.random.choice(len(all_), N, False)]
list1 = arr1.tolist()
list2 = arr2.tolist()
lot1 = list(map(tuple, list1))
lot2 = list(map(tuple, list2))
def np_unique_preserve_order(a, b):
c = np.r_[a, b]
cr = c.view(np.dtype("|S" + str(c.shape[-1] * c.dtype.itemsize)))
uniq, inv, count = np.unique(cr.ravel(), return_inverse=True,
return_counts=True)
return c[(count==1)[inv], :]
def np_unique(a, b):
c = np.r_[a, b]
cr = c.view(np.dtype("|S" + str(c.shape[-1] * c.dtype.itemsize)))
uniq, count = np.unique(cr.ravel(), return_counts=True)
return uniq[count==1, None].view(c.dtype)
def np_sort(a, b):
c = np.r_[a, b]
cr = np.sort(c.view(np.dtype("|S" + str(c.shape[-1] * c.dtype.itemsize))).ravel())
m = np.empty(cr.shape, bool)
m[0] = True
m[1:] = cr[:-1] != cr[1:]
m[:-1] &= m[1:]
return cr[m, None].view(c.dtype)
# check
make_inputs(1000, 2)
assert set(map(tuple, lot1)).symmetric_difference(set(map(tuple, lot2))) == set(map(tuple, np_sort(arr1, arr2)))
assert set(map(tuple, lot1)).symmetric_difference(set(map(tuple, lot2))) == set(map(tuple, np_unique(arr1, arr2)))
assert set(map(tuple, lot1)).symmetric_difference(set(map(tuple, lot2))) == set(map(tuple, np_unique_preserve_order(arr1, arr2)))
for N, ncol in ((10, 2), (10000, 2), (100000, 20)):
make_inputs(N, ncol)
results = []
for inputs in 'lot', 'list', 'arr':
res = []
if inputs == 'lot':
res.append('{:11.5f} ms'.format(timeit(f'list(set({inputs}1).symmetric_difference(set({inputs}2)))',
f'from __main__ import {inputs}1, {inputs}2', number=10) * 100))
else:
res.append('{:11.5f} ms'.format(timeit(f'list(set(map(tuple, {inputs}1)).symmetric_difference(set(map(tuple, {inputs}2))))',
f'from __main__ import {inputs}1, {inputs}2', number=10) * 100))
res.append('{:11.5f} ms'.format(timeit(f'np_sort({inputs}1, {inputs}2)', f'from __main__ import {inputs}1, {inputs}2, np_sort',
number=10) * 100))
res.append('{:11.5f} ms'.format(timeit(f'np_unique({inputs}1, {inputs}2)', f'from __main__ import {inputs}1, {inputs}2, np_unique',
number=10) * 100))
res.append('{:11.5f} ms'.format(timeit(f'np_unique_preserve_order({inputs}1, {inputs}2)', f'from __main__ import {inputs}1, {inputs}2, np_unique_preserve_order',
number=10) * 100))
results.append(res)
results = zip(*results)
appmin = lambda l: l + (min(l),)
print(f'\nno rows {N}, no colums {ncol}')
print('input type lot list array best')
print(f'symm_diff ', *appmin(next(results)))
print(f'np_sort ', *appmin(next(results)))
print(f'np_unique ', *appmin(next(results)))
print(f'np_unique_preserve_order ', *appmin(next(results)))
Output:
no rows 10, no colums 2
input type lot list array best
symm_diff 0.00232 ms 0.00453 ms 0.02034 ms 0.00232 ms
np_sort 0.03890 ms 0.03269 ms 0.03060 ms 0.03060 ms
np_unique 0.04209 ms 0.04118 ms 0.04136 ms 0.04118 ms
np_unique_preserve_order 0.05289 ms 0.05253 ms 0.05253 ms 0.05253 ms
no rows 10000, no colums 2
input type lot list array best
symm_diff 3.71800 ms 8.30081 ms 21.92841 ms 3.71800 ms
np_sort 7.98128 ms 8.09973 ms 2.80091 ms 2.80091 ms
np_unique 8.49229 ms 8.25701 ms 3.00654 ms 3.00654 ms
np_unique_preserve_order 10.12377 ms 8.67133 ms 3.36172 ms 3.36172 ms
no rows 100000, no colums 20
input type lot list array best
symm_diff 97.83141 ms 211.20736 ms 590.15145 ms 97.83141 ms
np_sort 317.73268 ms 316.50081 ms 89.97820 ms 89.97820 ms
np_unique 324.68922 ms 326.89891 ms 98.06377 ms 98.06377 ms
np_unique_preserve_order 339.18597 ms 339.00286 ms 120.94041 ms 120.94041 ms
For very small arrays symm_diff
is fastest, but for larger ones np_sort
has a small edge when all methods are allowed to use the data type they are most comfortable with.