
I have one DataFrame which has three columns: time, lon, and lat.

The goal is to predict the location of each point at end_time, using a lookup Dataset (ds, with three dimensions [time, lon, lat]) that has two DataArrays: wdir and wspd.

Here's the prediction process:

  1. iterate each row of DataFrame
  2. interpolate ds to the location determined by the row values
  3. predict the new lon and lat using the interpolated wdir, wspd, and the time step (delta_wind) between time and end_time (an illustrative sketch of such an update follows this list).
  4. repeat until end_time is reached, then save the predicted lon and lat.
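
(The update in step 3 is the part that the example below simplifies to lon/2, lat/2. For illustration only, a realistic predict_loc could look roughly like the sketch below; it assumes wdir is a meteorological wind direction in degrees, wspd is in m/s, and uses a crude metres-to-degrees conversion. The exact formula is not important for the performance question.)

import numpy as np

def predict_loc_sketch(lon, lat, wdir, wspd, delta):
    # meteorological convention: wdir is the direction the wind blows FROM (degrees)
    u = -wspd * np.sin(np.deg2rad(wdir))   # eastward wind component (m/s)
    v = -wspd * np.cos(np.deg2rad(wdir))   # northward wind component (m/s)
    # displacement over `delta` seconds, crudely converted to degrees (~111 km per degree)
    lon2 = lon + u * delta / (111e3 * np.cos(np.deg2rad(lat)))
    lat2 = lat + v * delta / 111e3
    return lon2, lat2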

To make it easier to understand, I wrote this simple example.

The core part is the iteration over each row after the # --- Sample Data end --- line.

import pandas as pd
import xarray as xr
import numpy as np

# !!!! edit len_time to test the speed !!!!
len_time = 2000

# --- Sample Data ---

# Two functions for creating sample data
def random_dates(start, end, n=10):
    # generate n random timestamps between start and end
    start_u = start.value//10**9
    end_u = end.value//10**9

    return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

def predict_loc(lon, lat, wdir, wspd, delta):
    # some calculation depending on the inputs; it is simplified here
    lon2 = lon/2; lat2 = lat/2

    return lon2, lat2

# create 3d DataArray as the look up array
da = xr.DataArray(np.abs(np.random.randn(600).reshape(6, 10, 10)),
                [("time",  pd.date_range("20130101", periods=6, freq="1H")),
                 ("lon", range(10)),
                 ("lat", range(10))
                 ],
)

# create two DataArrays (wind direction and wind speed) and merge them into one Dataset
ds = xr.merge([da.rename('wdir')/2, da.rename('wspd')])

# the end time of prediction
end_time = pd.Timestamp('2013-01-01 03:10')

# create times
times = random_dates(pd.to_datetime('2013-01-01 00:00'),
                     pd.to_datetime('2013-01-01 02:00'),
                     n=len_time)

# create DataFrame
df = pd.DataFrame(times, columns=['time'])
df['lon'] = np.random.randint(low=0, high=9, size=(len_time))
df['lat'] = np.random.randint(low=0, high=9, size=(len_time))

# --- Sample Data end ---

# create empty lists for saving results
lons, lats = [], []

# iterate each row
for _, row in df.iterrows():
    # because the ds is hourly data, we need to create the hourly time steps
    times = np.concatenate(([row.time.to_pydatetime()],
                            pd.date_range(row.time.ceil('h'),
                                          end_time.floor('h'),
                                          freq='H').to_pydatetime(),
                            [end_time.to_pydatetime()]))

    # calculate the delta seconds
    delta_wind = [t.total_seconds() for t in np.diff(times)]

    # get the beginning location (lon/lat) of the row
    lat, lon = row.lat, row.lon

    # predict the location by each time step
    for t_index, time in enumerate(times[:-1]):
        # interpolate to the location at each time
        data = ds.interp(time=time, lon=lon, lat=lat)
        lon, lat = predict_loc(lon, lat, data['wdir'], data['wspd'], delta_wind[t_index])

    # save the new location at the end of time
    lons.append(lon)
    lats.append(lat)

# add prediction results to DataFrame
df['lon_pred'] = lons
df['lat_pred'] = lats

When len_time is increased to 1000 or more, it gets really slow.

Any idea how to improve it?


1 Answer


TL;DR

  1. Use scipy.interpolate.interpn according to this answer.

  2. Use parallel_apply provided by pandarallel.

This cuts the time from ~55 s to ~3 s for 2000 rows.

Method 1: xr.interp

df['delta'] = df.apply(lambda row: get_delta_time(row['time']), axis=1)
df = df.apply(lambda row: predict_loc_xr_apply(row, wdir, wspd), axis=1)

Cost Time:

54.1 s ± 697 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Method 2: scipy.interpolate.interpn

df['delta'] = df.apply(lambda row: get_delta_time(row['time']), axis=1)
df = df.apply(lambda row: predict_loc_scipy_apply(row, wdir, wspd), axis=1)

Cost Time:

12 s ± 623 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Method 3: scipy.interpolate.interpn with pandarallel
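
pandarallel has to be imported and initialized once before parallel_apply is available on a DataFrame; a minimal setup (using the defaults) looks like this:

from pandarallel import pandarallel

# registers DataFrame.parallel_apply; by default it uses all available CPU cores
pandarallel.initialize()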

df['delta'] = df.parallel_apply(lambda row: get_delta_time(row['time']), axis=1)
df = df.parallel_apply(lambda row: predict_loc_scipy_apply(row, wdir, wspd), axis=1)

Cost Time:

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

3.41 s ± 78.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

I also tested Swifter, but it was slower than the plain apply.

Used functions

from scipy.interpolate import interpn

def get_delta_time(begin_time):
    # because the ds is hourly data, we need to create the hourly time steps
    # (end_time is the global defined in the question)
    times = np.concatenate(([pd.Timestamp(begin_time).to_pydatetime()],
                            pd.date_range(begin_time.ceil('h'),
                                          end_time.floor('h'),
                                          freq='H').to_pydatetime(),
                            [end_time.to_pydatetime()]))
    # calculate the delta seconds between consecutive steps
    delta_wind = [t.total_seconds() for t in np.diff(times)]

    return delta_wind


def predict_loc_xr_apply(row, wdir, wspd):
    # copy the initial location
    # (the wdir/wspd arrays are unused here; ds itself is interpolated)
    lon_pred = row['lon']
    lat_pred = row['lat']
    time_pred = row['time']

    for delta in row['delta']:
        # interpolate at the current time and location
        ds_interp = ds.interp(time=time_pred, lon=lon_pred, lat=lat_pred)
        wdir_interp = ds_interp['wdir']
        wspd_interp = ds_interp['wspd']

        # update the location (simplified stand-in for the real predict_loc,
        # which would use wdir_interp, wspd_interp and delta)
        lon_pred /= 2
        lat_pred /= 2

        time_pred = time_pred + pd.Timedelta(seconds=delta)

    row['lon_pred'] = lon_pred
    row['lat_pred'] = lat_pred

    return row

def predict_loc_scipy_apply(row, wdir, wspd):
    # copy the initial location
    lon_pred = row['lon']
    lat_pred = row['lat']
    time_pred = row['time']

    for delta in row['delta']:
        # interpolate at the current time and location
        # (time is cast to the same float [ns] representation used in coords[0])
        interp_points = [xr.DataArray(time_pred).astype('float').values, lon_pred, lat_pred]
        wdir_pred = interpn(coords, wdir, interp_points)
        wspd_pred = interpn(coords, wspd, interp_points)

        # update the location (simplified stand-in for the real predict_loc,
        # which would use wdir_pred, wspd_pred and delta)
        lon_pred /= 2
        lat_pred /= 2

        time_pred = time_pred + pd.Timedelta(seconds=delta)

    row['lon_pred'] = lon_pred
    row['lat_pred'] = lat_pred

    return row


# the grid and values below must be defined before calling the functions above;
# the time coordinate is cast to float (nanoseconds) so interpn gets a numeric grid
coords = [ds.time.astype(float).values, ds.lon.values, ds.lat.values]
wdir = ds['wdir'].values
wspd = ds['wspd'].values
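
As a quick sanity check on my assumption that the coordinate order and the float time conversion are consistent, a single interpn call should agree with ds.interp at the same point (using the coords and wdir defined above):

t0 = ds.time[0]
point = [t0.astype(float).values, 2.5, 3.5]
print(interpn(coords, wdir, point))                          # scipy result
print(ds['wdir'].interp(time=t0, lon=2.5, lat=3.5).values)   # xarray result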