Here is a NumPy example that should be fast. The example that includes both the higher and lower replacement assumes that you want the replaced values split evenly (50-50) between the low range and the high range. If that is not the case, you can change the `p=[.5, .5]` argument of the `np.random.choice` call that builds `mask_high` to whatever you want.
# Sample data: eight readings, two on 2016-11-10 and six on 2016-11-11.
d = {
    'date': ["2016-11-10"] * 2 + ["2016-11-11"] * 6,
    'time': [
        "22:00:00", "23:00:00", "00:00:00", "01:00:00",
        "02:00:00", "03:00:00", "04:00:00", "04:00:00",
    ],
    'value': [90, 91, 80, 87, 84, 94, 91, 94],
}
df = pd.DataFrame(data=d)
# create a function
def myFunc(df, replace_pct, start_range, stop_range, replace_col):
    """Randomly overwrite a share of one column with random integers.

    Each row of ``df[replace_col]`` is selected independently with
    probability ``replace_pct``; every selected row is overwritten with an
    integer drawn by ``np.random.randint(start_range, stop_range)``, i.e.
    from the half-open interval ``[start_range, stop_range)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``replace_col`` is reassigned on this object.
    replace_pct : float
        Per-row replacement probability, between 0 and 1.
    start_range : int
        Lowest replacement value (inclusive).
    stop_range : int
        Upper bound of the replacement values (exclusive).
    replace_col : str
        Name of the column to modify.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Work on a private, writable copy of the column. The array behind a
    # column can be read-only (pandas copy-on-write), and writing into it
    # directly could also leak changes into other views of the same data.
    val = df[replace_col].to_numpy(copy=True)
    # Boolean mask, True where the value is to be replaced. Sampling from
    # [False, True] yields a bool array directly, avoiding the ``np.bool``
    # alias that was removed in NumPy 1.24.
    mask = np.random.choice(
        [False, True], p=[1 - replace_pct, replace_pct], size=val.shape
    )
    # One random replacement per selected row.
    rand = np.random.randint(
        start_range, stop_range, size=np.count_nonzero(mask)
    )
    val[mask] = rand
    # Write the modified copy back into the frame.
    df[replace_col] = val
    return df
# Give each 'value' row a 20% chance of being replaced by a random int in [1, 50).
myFunc(df, .2, 1, 50, 'value')
date time value
0 2016-11-10 22:00:00 90
1 2016-11-10 23:00:00 91
2 2016-11-11 00:00:00 80
3 2016-11-11 01:00:00 87
4 2016-11-11 02:00:00 46
5 2016-11-11 03:00:00 94
6 2016-11-11 04:00:00 91
7 2016-11-11 04:00:00 94
timeit
%%timeit
myFunc(df, .2, 1, 50, 'value')
397 µs ± 27.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Example of both high and low replacement
# create a function
def myFunc2(df, replace_pct, start_range_low, stop_range_low,
            start_range_high, stop_range_high, replace_col):
    """Randomly overwrite a share of one column with low or high integers.

    Each row of ``df[replace_col]`` is selected independently with
    probability ``replace_pct``. Every selected row first receives an
    integer from ``[start_range_low, stop_range_low)``; each of those
    replacements is then, with probability 0.5, swapped for an integer from
    ``[start_range_high, stop_range_high)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``replace_col`` is reassigned on this object.
    replace_pct : float
        Per-row replacement probability, between 0 and 1.
    start_range_low, stop_range_low : int
        Inclusive start / exclusive stop of the low replacement range.
    start_range_high, stop_range_high : int
        Inclusive start / exclusive stop of the high replacement range.
    replace_col : str
        Name of the column to modify.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Work on a private, writable copy of the column. The array behind a
    # column can be read-only (pandas copy-on-write), and writing into it
    # directly could also leak changes into other views of the same data.
    val = df[replace_col].to_numpy(copy=True)
    # Boolean mask, True where the value is to be replaced. Sampling from
    # [False, True] yields a bool array directly, avoiding the ``np.bool``
    # alias that was removed in NumPy 1.24.
    mask = np.random.choice(
        [False, True], p=[1 - replace_pct, replace_pct], size=val.shape
    )
    # Start with a low-range replacement for every selected row.
    rand = np.random.randint(
        start_range_low, stop_range_low, size=np.count_nonzero(mask)
    )
    # Pick which replacements are promoted to the high range; change ``p``
    # here for a split other than 50-50.
    mask_high = np.random.choice([False, True], p=[.5, .5], size=rand.shape)
    rand_high = np.random.randint(
        start_range_high, stop_range_high, size=np.count_nonzero(mask_high)
    )
    rand[mask_high] = rand_high
    val[mask] = rand
    # Write the modified copy back into the frame.
    df[replace_col] = val
    return df
# Give each 'value' row a 20% chance of being replaced, by an int from [1, 50) or from [200, 300).
myFunc2(df, .2, 1, 50, 200, 300, 'value')
date time value
0 2016-11-10 22:00:00 90
1 2016-11-10 23:00:00 216
2 2016-11-11 00:00:00 80
3 2016-11-11 01:00:00 49
4 2016-11-11 02:00:00 84
5 2016-11-11 03:00:00 94
6 2016-11-11 04:00:00 270
7 2016-11-11 04:00:00 94
timeit
%%timeit
myFunc2(df, .2, 1, 50, 200, 300, 'value')
493 µs ± 41.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)