OK, I have one really fast version, inspired by this answer, which deals with the same question but with 1D arrays.
import numpy as np

def findIndex(A, B):
    a = A.values      # ndarray of the data in A
    b = B.values      # same for B
    af = a.flatten()  # same ndarray, but flattened
    bf = b.flatten()  # same for B
    # This creates a rolling window (without actually building the array; it is just a "view")
    # of len(bf)-sized slices of af, with a "stride" (how many bytes to advance between
    # consecutive windows) being the stride of a (or b), that is, the size of one row
    rollwin = np.lib.stride_tricks.as_strided(af, shape=(len(a)-len(b)+1, len(bf)), strides=a.strides)
    # Now that we have that, we just have to find a "line" of this rolling window equal to bf
    lineMask = np.all(rollwin == bf, axis=1)
    if lineMask.any():
        return np.argmax(lineMask)
    else:
        return None
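To visualize what the view contains, here is a toy example of mine (values invented for illustration, not taken from the benchmark): searching a 2-row pattern in a 4×2 array gives 3 windows of 4 flattened values each, consecutive windows being shifted by one row.

import numpy as np

a = np.arange(8).reshape(4, 2)   # [[0, 1], [2, 3], [4, 5], [6, 7]]
af = a.flatten()
# 3 windows of 4 values; consecutive windows shift by one row of a (2 values here)
rollwin = np.lib.stride_tricks.as_strided(af, shape=(4-2+1, 2*2), strides=a.strides)
print(rollwin)
# [[0 1 2 3]
#  [2 3 4 5]
#  [4 5 6 7]]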
The == here is a numpy operator that compares, very fast, all values of rollwin with bf. np.all, with axis=1, then searches for the lines where all values are equal, and np.argmax finds the first True occurrence.
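On the same kind of toy values (again invented for illustration), the mask logic looks like this:

import numpy as np

rollwin = np.array([[0, 1], [2, 3], [2, 3]])  # toy windows
bf = np.array([2, 3])                         # toy flattened pattern
lineMask = np.all(rollwin == bf, axis=1)      # [False, True, True]
print(np.argmax(lineMask))                    # 1: the first True wins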
timeit says that this version is 150 times faster than my previous answer (which was already 5 times faster than a direct "for loop" approach). I am not sure it is elegant, but it is fast, and it does use the power of vectorization.
A variant, in case the dataframe is so big that memory becomes a problem (the == comparison creates a real array; the rest are just "views"), would be to compute by batches. So, a for loop, but one that does only a fraction of the real iterations. The last lines could be replaced by this batch computation:
BatchSize = 10000

def findIndex(A, B):
    a = A.values
    b = B.values
    af = a.flatten()
    bf = b.flatten()
    rollwin = np.lib.stride_tricks.as_strided(af, shape=(len(a)-len(b)+1, len(bf)), strides=a.strides)
    for start in range(0, len(rollwin), BatchSize):
        lineMask = np.all(rollwin[start:start+BatchSize] == bf, axis=1)
        if lineMask.any():
            return start + np.argmax(lineMask)
BatchSize is a parameter giving the size of the batches: the higher it is, the more "vectorized" the computations are, but the more memory they take; the smaller it is, the closer we get to a classical for loop.
With BatchSize=10000 for example, memory usage should be really small (depending on the real number of columns; see the estimate below), and timings are similar to the previous single-batch version.
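As a rough back-of-the-envelope estimate (my own computation, with hypothetical sizes): the only real array a batch creates is the boolean result of ==, at one byte per element in numpy, so about BatchSize × len(bf) bytes per batch.

BatchSize = 10000
rows_of_B, cols = 5, 2                      # hypothetical sizes of B
batch_bytes = BatchSize * rows_of_B * cols  # bool array created by '=='
print(batch_bytes / 1e6, "MB")              # 0.1 MB for these sizes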
Timings with a benchmark of mine: A=pd.DataFrame(np.random.randint(x,y,(n,w))); B=A[k:k+m], with n, w, m being size parameters, and k the secret answer that the computations should return. x and y are the range of the random values; the closer they are, the more "near misses" occur. A sketch of this setup with placeholder values is shown just below, and the timings follow.
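Sketch (placeholder values; the exact script I used is at the end of this answer):

import numpy as np
import pandas as pd

n, w, m, k = 10000, 2, 5, 8000   # sizes, and the expected answer k
x, y = 0, 100                    # random range: the narrower, the more near misses
A = pd.DataFrame(np.random.randint(x, y, (n, w)))
B = A[k:k+m]   # the block to search for; all methods use .values, so its index does not matter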
| Version         | Timing (s per 1k runs) |
| --------------- | ---------------------: |
| np.vectorize    |                 164.15 |
| rollwin         |                   0.58 |
| Batch rollwin   |                   0.59 |
| bitflip's apply |                2201.71 |
| naive for loop  |                 657.75 |
So, those timing differences are not an "optimization vs obfuscation" tradeoff case. It is really a game changer, so it is worth the effort to make it work.
With BatchSize=10000 you have a good compromise that should not cause any memory problem, while still being a thousand times faster than the easier-to-write versions. Besides, if your Jupyter crashed, it is because memory ran out. If memory ran out, it is because your real dataframe is way bigger than the one I used for the benchmark. And if it is that big, those timing differences will be very big as well.
Former answer
I have no elegant solution, but since nobody else seems to be posting one, let's go with what I have.
I fail to see any way to do it without a for loop. But at least we can avoid coding the for loop ourselves, using numpy's vectorize.
import numpy as np

def comp(i, a, b):
    return np.array_equal(a[i:i+len(b)], b)

vcomp = np.vectorize(comp, signature="(),(m,n),(p,n)->()")

matchPos = vcomp(np.arange(len(A)-len(B)+1), A.values, B.values)
# matchPos is an array of booleans, with True at any position where B is found.
# If you want an index, you can use argmax
if matchPos.any():
    indexPos = np.argmax(matchPos)
else:
    indexPos = None
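About the signature "(),(m,n),(p,n)->()": it tells numpy that comp maps a scalar (the index) plus two 2D arrays to a scalar, so only the index argument is vectorized, while a and b are passed whole to each call. A toy run, reusing comp/vcomp from above (data invented for illustration):

import numpy as np

a = np.array([[0, 1], [2, 3], [4, 5]])
b = np.array([[2, 3]])
# vcomp as defined just above
print(vcomp(np.arange(len(a)-len(b)+1), a, b))   # [False  True False]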
Naive for loop version
Just for reference and timings, here is the naive for loop version:
def findIndex(A, B):
    for i in range(len(A)-len(B)+1):
        ok = True  # Will remain True if, after comparing all A[i+k,j]==B[k,j], we find no diff, in which case i is the answer
        for k in range(len(B)):
            for j in range(A.shape[1]):
                if A.iloc[i+k, j] != B.iloc[k, j]:
                    ok = False
                    break
            if not ok: break
        if ok: return i
My whole code for those experiments
import pandas as pd
import numpy as np
import timeit
# Note: the lower the second parameter (100), the more "near misses" there are.
# Apparently, this impacts only the naive for loop (which makes sense: the sooner the break occurs, the better,
# so the smaller this parameter is, the worse the naive for loop gets)
# Beware of having r^(w×len(b)) way bigger than len(a)
# (r being the 2nd parameter of randint, and w, aka a.shape[1], the 2nd element of the shape)
# Otherwise, there is a risk that a match is found sooner than expected
aa=np.random.randint(0,100,(10000,2))
A=pd.DataFrame(aa)
# B is just a slice from A. We know that 1st match should be 8000
bb=aa[8000:8005]
B=pd.DataFrame(bb)
# Vectorization method
def comp(i, a, b):
    return np.array_equal(a[i:i+len(b)], b)

vcomp = np.vectorize(comp, signature="(),(m,n),(p,n)->()")

def m1(A, B):
    return np.argmax(vcomp(np.arange(len(A)-len(B)+1), A.values, B.values))
# For loop with numpy (I haven't posted it, so I haven't posted its timing either)
def m2(A, B):
    for i in range(len(A)-len(B)+1):
        if np.array_equal(A.iloc[i:i+len(B)].values, B.values):
            return i
    return None
# Same, but getting the array from the df then slicing the array, instead of slicing the df then getting the array
def m3(A, B):
    for i in range(len(A)-len(B)+1):
        if np.array_equal(A.values[i:i+len(B)], B.values):
            return i
    return None
# Rolling window method
def m4(A, B):
    a = A.values
    b = B.values
    af = a.flatten()
    bf = b.flatten()
    rollwin = np.lib.stride_tricks.as_strided(af, shape=(len(a)-len(b)+1, len(bf)), strides=a.strides)
    return np.argmax(np.all(rollwin == bf, axis=1))
# Same, but using batches of BatchSize slices of rollwin, to ensure that rollwin==bf (which creates an actual array
# of booleans, where rollwin was just a "view") is small enough to avoid a memory explosion, and big enough to
# allow good vectorization
BatchSize = 10000

def m5(A, B):
    a = A.values
    b = B.values
    af = a.flatten()
    bf = b.flatten()
    rollwin = np.lib.stride_tricks.as_strided(af, shape=(len(a)-len(b)+1, len(bf)), strides=a.strides)
    for start in range(0, len(rollwin), BatchSize):
        lineMask = np.all(rollwin[start:start+BatchSize] == bf, axis=1)
        if lineMask.any():
            return start + np.argmax(lineMask)
# bitflip's apply method
def m6(A, B):
    return np.where(A.apply(lambda row: B.equals(A[row.name:row.name+len(B.index)].reset_index(drop=True)), axis=1))
# Naive for/iloc method
def m7(A, B):
    for i in range(len(A)-len(B)+1):
        ok = True
        for k in range(len(B)):
            for j in range(A.shape[1]):
                if A.iloc[i+k, j] != B.iloc[k, j]:
                    ok = False
                    break
            if not ok: break
        if ok: return i
# Timings per 1k runs (so ×100 when counted on 10 runs, ×0.1 when counted on 10k runs)
print("| Method | Timings | Answer |")
print("| ------------------ | ------- | ------ |")
print("| np.vectorize | %7.2f | %6d |"%(timeit.timeit(lambda: m1(A,B), number=10)*100, m1(A,B)))
print("| df.values[x:y].eq | %7.2f | %6d |"%(timeit.timeit(lambda: m2(A,B), number=10)*100, m2(A,B)))
print("| df[x:y].values.eq | %7.2f | %6d |"%(timeit.timeit(lambda: m3(A,B), number=10)*100, m3(A,B)))
print("| rollwin | %7.2f | %6d |"%(timeit.timeit(lambda: m4(A,B), number=10000)*0.1, m4(A,B)))
print("| batch rollwin | %7.2f | %6d |"%(timeit.timeit(lambda: m5(A,B), number=10000)*0.1, m5(A,B)))
print("| apply | %7.2f | %6d |"%(timeit.timeit(lambda: m6(A,B), number=10)*100, m6(A,B)[0]))
print("| naive for .iloc | %7.2f | %6d |"%(timeit.timeit(lambda: m7(A,B), number=10)*100, m7(A,B)))