Using NumPy, one possible solution is the following (based largely on this answer).
import pandas as pd
# Sample data: two integer columns in which runs of the value 2 are searched.
d = {
    "A": [3, 3, 3, 2, 3, 3, 2, 2, 2, 3, 3, 2],
    "B": [3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3],
}
df = pd.DataFrame(d)
import numpy as np
def runs_of_ones_array(bits):
    """Locate every run of consecutive truthy entries in a 1-D sequence.

    Parameters
    ----------
    bits : array-like
        One-dimensional sequence of flags (booleans, 0/1 integers, or a
        boolean pandas Series). Any non-zero entry counts as a "one".

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_runs, 2)``. Each row is ``(start, end)``
        where ``start`` is the position of the first element of a run and
        ``end`` is one past its last element (half-open interval). Positions
        are 0-based offsets into ``bits``, not index labels.
    """
    # Normalise to 0/1 first: a truthy value other than 1 (e.g. 2) would
    # otherwise produce several positive steps for one run and leave the
    # start/end arrays with different lengths.
    flags = np.asarray(bits, dtype=bool).astype(np.int8)
    # make sure all runs of ones are well-bounded
    bounded = np.hstack(([0], flags, [0]))
    # get 1 at run starts and -1 at run ends
    difs = np.diff(bounded)
    run_starts, = np.where(difs > 0)
    run_ends, = np.where(difs < 0)
    return np.vstack((run_starts, run_ends)).T
interesting_value = 2
# Boolean mask of column "A" -> one (start, exclusive_end) pair per run.
runs = runs_of_ones_array(df["A"] == interesting_value)
for start, end in runs:
    # run ends are exclusive; step back to the position of the last element
    end -= 1
    # since we don't seem to be interested in single-element runs
    if start == end:
        continue
    print("Value {} was observed at instance {} and continued till instance {}.".format(
        interesting_value, start, end))
The output of the above is:
Value 2 was observed at instance 6 and continued till instance 8.
EDIT: modified the code to only output runs with length greater than 1.
EDIT2: Regarding the speed of the two posted (and quite similar) methods, I ran some benchmarks in IPython.
EDIT3: If you include the boolean mask generation time in the benchmark, the groupby
method outperforms the others by nearly an order of magnitude:
In [28]:
%%timeit -n 10000
# Timed body: diff-based run detection on every column, mask creation included.
mask = df == 2
for col_name in mask:
    column = mask[col_name]
    runs = runs_of_ones_array(column)
    for start, end in runs:
        # convert the exclusive end to the position of the last element
        end -= 1
        # skip single-element runs
        if start == end:
            continue
        # placeholder for the reporting step, left out of the timing
        pass
10000 loops, best of 3: 452 µs per loop
In [29]:
%%timeit -n 10000
# Timed body: np.split-based run detection on every column, mask creation included.
mask = df == 2
for col_name in mask:
    column = mask[col_name]
    # index labels of the rows where the mask is True
    ind = column[column].index.values
    # cut the labels wherever two neighbours are not consecutive
    for sub in np.split(ind, np.where(np.diff(ind) != 1)[0] + 1):
        # only runs longer than one element are of interest
        if sub.size > 1:
            pass
        pass
10000 loops, best of 3: 585 µs per loop
In [30]:
from itertools import groupby
In [31]:
%%timeit -n 10000
# Timed body: itertools.groupby-based run detection on every column.
for col in df:
    # ind: number of elements consumed so far; prev: its value before the
    # current group was consumed
    ind = prev = 0
    # distinct names for the column label and the group key, so the inner
    # loop no longer rebinds the outer loop variable
    for is_match, group in groupby(df[col], key=lambda x: x == 2):
        ind += sum(1 for _ in group)
        # a matching group that advanced the counter by more than one
        # element is a run of length > 1
        if is_match and prev + 1 != ind:
            pass
        prev = ind
10000 loops, best of 3: 73.4 µs per loop