Latest Update
I tweaked the code below to use numba; it is about 2 times faster for larger inputs and can handle larger inputs without being killed on my local machine.
This converts the function decorated with numba.njit to machine code, which shines when the inputs are huge. Subsequent runs are always faster than the first run because of compilation. I understand this might not be necessary for 10k-20k elements, but it might be needed for 2**64. (P.S. I use numba at work for speed; refer here to see how it can speed things up.)
Using numba
import numpy as np
import numba as nb
# Convert the Python lists to numpy arrays so numba can compile the
# accumulation loop against typed, contiguous data.
ojb_A = np.array(ojb_A)
ojb_B = np.array(ojb_B)
cnt_A = np.array(cnt_A)
cnt_B = np.array(cnt_B)
@nb.njit
def count(out, ojb, cnt, i):
    """Accumulate cnt[index] into out under the key ojb[index].

    `i` is kept only for signature compatibility with existing callers;
    it is unused after the fix below.
    """
    # NOTE: nb.prange only parallelizes under njit(parallel=True); here it
    # behaves like range, which is what we want because numba's typed Dict
    # is not thread-safe.
    for index in nb.prange(ojb.shape[0]):
        # BUGFIX: the key must be ojb[index], not ojb[i] -- indexing with the
        # constant argument i collapsed every count onto a single key.
        key = ojb[index]
        out[key] = out.get(key, 0) + cnt[index]
def split_out(out):
    """Split the {object: count} dict into parallel key/value lists."""
    return list(out.keys()), list(out.values())
# A numba typed Dict so count() can be fully compiled.
out = nb.typed.Dict.empty(
    key_type=nb.types.int64,    # nb.types is the public API (nb.core is internal)
    value_type=nb.types.int64,
)
count(out, ojb_A, cnt_A, 0)
count(out, ojb_B, cnt_B, 1)
ojb_C, cnt_C = split_out(out)
new_test.py
import random
import sys
import time
import numba as nb
# Fixed seed so every run benchmarks the same random data.
random.seed(31212223)
# Input length: 2**k, where k is the second command-line argument.
MAX = 2 ** int(sys.argv[2])
def f3():
    """Benchmark quamrana's plain-Python defaultdict accumulation."""
    # Generate fresh input data; data generation is excluded from the timing.
    keys_a = random.sample(range(1, 2**30), MAX)
    vals_a = random.sample(range(1, 2**30), MAX)
    keys_b = random.sample(range(1, 2**30), MAX)
    vals_b = random.sample(range(1, 2**30), MAX)
    start = time.time()
    from collections import defaultdict

    def accumulate(totals, keys, vals):
        # Add each value onto its key's running total.
        for key, amount in zip(keys, vals):
            totals[key] += amount

    totals = defaultdict(int)
    accumulate(totals, keys_a, vals_a)
    accumulate(totals, keys_b, vals_b)
    # Split the merged totals into parallel key/value lists.
    ojb_C, cnt_C = list(totals.keys()), list(totals.values())
    # print(ojb_C, cnt_C)
    print('quamrana', time.time() - start)
def f3_1():
    """Benchmark the numba typed-Dict approach (eroot163pi)."""
    # Generate fresh input data; generation is excluded from the timing.
    ojb_A = random.sample(range(1, 2**30), MAX)
    cnt_A = random.sample(range(1, 2**30), MAX)
    ojb_B = random.sample(range(1, 2**30), MAX)
    cnt_B = random.sample(range(1, 2**30), MAX)
    s1 = time.time()
    import numpy as np
    import numba as nb
    # numpy arrays give numba typed, contiguous input it can compile against.
    ojb_A = np.array(ojb_A)
    ojb_B = np.array(ojb_B)
    cnt_A = np.array(cnt_A)
    cnt_B = np.array(cnt_B)
    @nb.njit
    def count(out, ojb, cnt, i):
        # `i` retained for signature compatibility; unused after the fix.
        # prange degrades to range without njit(parallel=True), which is
        # correct here: numba's typed Dict is not thread-safe.
        for index in nb.prange(ojb.shape[0]):
            # BUGFIX: key must be ojb[index], not ojb[i] -- the constant
            # argument i collapsed every count onto a single key.
            key = ojb[index]
            out[key] = out.get(key, 0) + cnt[index]
    def split_out(out):
        # Dict -> parallel (keys, values) lists.
        return list(out.keys()), list(out.values())
    out = nb.typed.Dict.empty(
        key_type=nb.types.int64,    # public API; nb.core.types is internal
        value_type=nb.types.int64,
    )
    count(out, ojb_A, cnt_A, 0)
    count(out, ojb_B, cnt_B, 1)
    ojb_C, cnt_C = split_out(out)
    # print(ojb_C, cnt_C)
    s2 = time.time()
    print('eroot163pi', s2 - s1)
if __name__ == '__main__':
    # Look the benchmark function up by name instead of eval()'ing raw
    # argv, which would execute arbitrary expressions from the command line.
    bench = globals()[sys.argv[1]]
    # Two runs to show the subsequent (post-JIT-compilation) run is faster.
    bench()
    bench()
Running on mammoth cases 2^22-27
- The numba process successfully finishes for 2^26, but normal Python is killed
- Numba is consistently faster for these inputs and is able to handle larger inputs, limited by 2^27
(base) xxx@xxx:~$ python test.py f3 22
quamrana 3.2768890857696533
quamrana 3.2760112285614014
(base) xxx@xxx:~$ python test.py f3_1 22
eroot163pi 2.4150922298431396
eroot163pi 1.8658664226531982
(base) xxx@xxx:~$ python test.py f3 23
quamrana 6.903605937957764
quamrana 7.187666654586792
(base) xxx@xxx:~$ python test.py f3_1 23
eroot163pi 4.326314926147461
eroot163pi 3.6970062255859375
(base) xxx@xxx:~$ python test.py f3 24
quamrana 14.135217905044556
quamrana 14.102455615997314
(base) xxx@xxx:~$ python test.py f3_1 24
eroot163pi 8.097218751907349
eroot163pi 7.514840602874756
(base) xxx@xxx:~$ python test.py f3 25
quamrana 29.825793743133545
quamrana 30.300193786621094
(base) xxx@xxx:~$ python test.py f3_1 25
eroot163pi 16.243808031082153
eroot163pi 15.114825010299683
(base) xxx@xxx:~$ python test.py f3 26
Killed
(base) xxx@xxx:~$ python test.py f3_1 26
eroot163pi 35.73880386352539
eroot163pi 34.74332834847338
(base) xxx@xxx:~$ python test.py f3_1 27
Killed
Performance of other methods
This is a comparison of some of the above methods on mammoth test cases. I was curious about performance myself, and quamrana's solution performed really well.
For array lengths greater than 2**26, all of the solution attempts above were killed (due to my own system limitations: 16 GB, 12 cores).
For the rest, quamrana's solution consistently performed best and Kapil's solution was second best. Each solution was consistently almost 2-3 times faster than the next.
test.py
import random
import sys
import time
# Fixed seed so every contender is benchmarked on identical data.
random.seed(31212223)
# Input length: 2**k, where k is the second command-line argument.
MAX = 2 ** int(sys.argv[2])
# Shared module-level test data: MAX unique ints in [1, 2**30) per list.
ojb_A = random.sample(range(1, 2**30), MAX)
cnt_A = random.sample(range(1, 2**30), MAX)
ojb_B = random.sample(range(1, 2**30), MAX)
cnt_B = random.sample(range(1, 2**30), MAX)
def f1():
    """Benchmark Kapil's pandas concat/groupby approach."""
    start = time.time()
    import pandas as pd
    # One two-column frame per input: column 0 = object, column 1 = count.
    frame_a = pd.DataFrame(zip(ojb_A, cnt_A))
    frame_b = pd.DataFrame(zip(ojb_B, cnt_B))
    # Stack both frames and sum counts per object.
    merged = pd.concat([frame_a, frame_b]).groupby(0).sum()
    ojb_C = list(merged.index.values)
    cnt_C = list(merged[1])
    print('Kapil', time.time() - start)
def f2():
    """Benchmark fsimonjetz's collections.Counter approach."""
    start = time.time()
    from collections import Counter
    # Seed the counter with A's counts, then add B's (update() sums counts).
    totals = Counter(dict(zip(ojb_A, cnt_A)))
    totals.update(dict(zip(ojb_B, cnt_B)))
    ojb_C, cnt_C = zip(*totals.items())
    print('fsimonjetz', time.time() - start)
def f3():
    """Benchmark quamrana's plain defaultdict accumulation."""
    start = time.time()
    from collections import defaultdict

    def accumulate(totals, keys, counts):
        # Add each count onto its key's running total.
        for key, amount in zip(keys, counts):
            totals[key] += amount

    totals = defaultdict(int)
    accumulate(totals, ojb_A, cnt_A)
    accumulate(totals, ojb_B, cnt_B)
    # Split the merged totals into parallel key/value lists.
    ojb_C, cnt_C = list(totals.keys()), list(totals.values())
    # print(ojb_C, cnt_C)
    print('quamrana', time.time() - start)
if __name__ == '__main__':
    # Look the benchmark function up by name instead of eval()'ing raw
    # argv, which would execute arbitrary expressions from the command line.
    globals()[sys.argv[1]]()
Performance on 2^20, 2^21, 2^22
(base) xxx@xxx:~$ python test.py f1 20
Kapil 2.0021448135375977
(base) xxx@xxx:~$ python test.py f2 20
fsimonjetz 2.720785617828369
(base) xxx@xxx:~$ python test.py f3 20
quamrana 0.717628002166748
(base) xxx@xxx:~$ python test.py f1 21
Kapil 4.06165337562561
(base) xxx@xxx:~$ python test.py f2 21
fsimonjetz 6.2198286056518555
(base) xxx@xxx:~$ python test.py f3 21
quamrana 1.563591718673706
(base) xxx@xxx:~$ python test.py f1 22
Kapil 8.361187934875488
(base) xxx@xxx:~$ python test.py f2 22
fsimonjetz 14.354418992996216
(base) xxx@xxx:~$ python test.py f3 22
quamrana 3.355391025543213