I have a Numpy array of length 500 and a Pandas dataframe containing two columns of integers A and B. For every row of the dataframe, I want to increment the slice of the numpy array by a fraction inversely proportional to the size of the slice.
The best I could come up with is:
import pandas as pd
import numpy as np
def do(n_rows):
df = pd.DataFrame()
df['A'] = np.random.choice(range(0, 400), n_rows)
df['B'] = df['A'] + np.random.choice(range(1, 50), n_rows)
y = np.zeros(500)
for row in df[['A', 'B']].itertuples():
_, l, r = row
y[l:r] += 1 / (r - l)
if __name__=='__main__':
from timeit import Timer
for n_rows in [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000]:
t = Timer("do(%d)" % n_rows, "from __main__ import do")
print("Time with %d rows:" % n_rows, t.timeit(number=1))
which, on my machine, outputs:
Time with 1 rows: 0.0033148399088531733
Time with 10 rows: 0.0027232079301029444
Time with 100 rows: 0.0028421489987522364
Time with 1000 rows: 0.0047673441004008055
Time with 10000 rows: 0.023656874895095825
Time with 100000 rows: 0.21090111997909844
Time with 1000000 rows: 1.9661296689882874
Time with 10000000 rows: 19.77073140698485
Time with 100000000 rows: 198.70571927190758
I need to run this code on ~70 million lines, about 150 times, every time I want to re-analyse my data. Can anyone think of some way to significantly improve performance here?
Timings + final code (left pandas out of it and cleaned a bit)
import numpy as np
def do(n_rows, solver):
s0 = np.random.choice(range(0, 400), n_rows)
s1 = s0 + np.random.choice(range(1, 50), n_rows)
return solver(s0, s1, 500)
def sol1(s0, s1, points):
y = np.zeros(points)
for l, r in zip(s0, s1):
y[l:r] += 1 / (r - l)
return y
def sol2(s0, s1, points):
out = np.zeros(points)
v = 1.0 / (s1 - s0)
np.add.at(out,s0,+v)
np.add.at(out,s1,-v)
np.cumsum(out,out=out)
out[np.isclose(out, 0)] = 0
return out
def sol3(s0, s1, points):
v = 1.0 / (s1 - s0)
out = np.bincount(s0, v, minlength=points) - np.bincount(s1, v, minlength=points)
np.cumsum(out, out=out)
out[np.isclose(out, 0)] = 0
return out
if __name__=='__main__':
# Test for correctness
s0 = np.array([2,2,4])
s1 = np.array([4,5,6])
r1 = sol1(s0, s1, 10)
r2 = sol2(s0, s1, 10)
r3 = sol3(s0, s1, 10)
print(r1.tolist())
print(r2.tolist())
print(r3.tolist())
print(np.allclose(r1, r2))
print(np.allclose(r1, r3))
# Test for speed
from timeit import Timer
for n_rows in [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000]:
print("-------------------------------------------------------------")
t = Timer("do(%d, sol1)" % n_rows, "from __main__ import do, sol1")
print("Old App : Time with %d rows:" % n_rows, t.timeit(number=1))
t = Timer("do(%d, sol2)" % n_rows, "from __main__ import do, sol2")
print("New App : Time with %d rows:" % n_rows, t.timeit(number=1))
t = Timer("do(%d, sol3)" % n_rows, "from __main__ import do, sol3")
print("Newer App : Time with %d rows:" % n_rows, t.timeit(number=1))
Results
[0.0, 0.0, 0.8333333333333333, 0.8333333333333333, 0.8333333333333333, 0.5, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.8333333333333333, 0.8333333333333333, 0.8333333333333333, 0.49999999999999994, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.8333333333333333, 0.8333333333333333, 0.8333333333333333, 0.49999999999999994, 0.0, 0.0, 0.0, 0.0]
True
True
-------------------------------------------------------------
Old App : Time with 1 rows: 0.00015379209071397781
New App : Time with 1 rows: 0.00018249684944748878
Newer App : Time with 1 rows: 0.00016245199367403984
-------------------------------------------------------------
Old App : Time with 10 rows: 0.0001194290816783905
New App : Time with 10 rows: 0.00017586094327270985
Newer App : Time with 10 rows: 0.0001628710888326168
-------------------------------------------------------------
Old App : Time with 100 rows: 0.0002433289773762226
New App : Time with 100 rows: 0.00019087712280452251
Newer App : Time with 100 rows: 0.00016685202717781067
-------------------------------------------------------------
Old App : Time with 1000 rows: 0.001470518996939063
New App : Time with 1000 rows: 0.00034397002309560776
Newer App : Time with 1000 rows: 0.0001920647919178009
-------------------------------------------------------------
Old App : Time with 10000 rows: 0.016746808076277375
New App : Time with 10000 rows: 0.0035001228097826242
Newer App : Time with 10000 rows: 0.0006547670345753431
-------------------------------------------------------------
Old App : Time with 100000 rows: 0.15572935109958053
New App : Time with 100000 rows: 0.017682339064776897
Newer App : Time with 100000 rows: 0.0035442619118839502
-------------------------------------------------------------
Old App : Time with 1000000 rows: 1.3663988099433482
New App : Time with 1000000 rows: 0.1856136831920594
Newer App : Time with 1000000 rows: 0.044601815985515714
-------------------------------------------------------------
Old App : Time with 10000000 rows: 14.203968763817102
New App : Time with 10000000 rows: 2.116381539963186
Newer App : Time with 10000000 rows: 0.6719322800636292
-------------------------------------------------------------
Old App : Time with 100000000 rows: 142.28248666599393
New App : Time with 100000000 rows: 21.45379024511203
Newer App : Time with 100000000 rows: 5.622305189957842
25x speedup, not bad ;)