Basically I have some gridded meteorological data with dimensions (time, lat, lon).
- I need to go through each timeseries at each gridsquare, identify consecutive days ("events") when the variable is above a threshold and store that to a new variable (THdays)
- Then I look through the new variable and find the events which are longer than a certain duration (THevents)
Currently I have a super scrappy (non-vectorised) iteration and I'd appreciate your advice on how to speed this up. Thanks!
import numpy as np
import itertools as it
##### Parameters
lg = 2000 # length to initialise array (must be long to store large number of events)
rl = 180 # e.g latitude
cl = 360 # longitude
pcts = [95, 97, 99] # percentiles which are the thresholds that will be compared
dt = [1,2,3] #duration thresholds, i.e. consecutive values (days) above threshold
##### Data
data # this is the gridded data that is (time,lat,lon) , e.g. data = np.random.rand(1000,rl,cl)
# From this data calculate the percentiles at each gridsquare (lat/lon combination) which will act as our thresholds
histpcts = np.percentile(data, q=pcts, axis = 0)
##### Initialize arrays to store the results
THdays = np.ndarray((rl, cl, lg, len(pcts)), dtype='int16') #Array to store consecutive threshold timesteps
THevents = np.ndarray((rl,cl,lg,len(pcts),len(dt)),dtype='int16')
##### Start iteration to identify events
for p in range(len(pcts)): # for each threshold value
br = data>histpcts[p,:,:] # Make boolean array where data is bigger than threshold
# for every lat/lon combination
for r,c in it.product(range(rl),range(cl)):
if br[:,r,c].any()==True: # This is to skip timeseries with nans only and so the iteration is skipped. Important to keep this or something that ignores an array of nans
a = [ sum( 1 for _ in group ) for key, group in it.groupby( br[:,r,c] ) if key ] # Find the consecutive instances
tm = np.full(lg-len(a), np.nan) # create an array of nans to fill in the rest
# Assign to new array
THdays[r,c,0:len(a),p] = a # Consecutive Thresholds days
THdays[r,c,len(a):,p] = tm # Fill the rest of array
# Now cycle through and identify events
# (consecutive values) longer than a certain duration (dt)
for d in range(len(dt)):
b = THdays[r,c,THdays[r,c,:,p]>=dt[d],p]
THevents[r,c,0:len(b),p,d] = b