
I have one DataFrame which has three columns: time, lon, and lat.

The goal is to predict the location of each point at end_time, using a lookup Dataset (ds, with three dimensions [time, lon, lat]) that has two DataArrays: wdir and wspd.

Here's the prediction process:

  1. iterate each row of DataFrame
  2. interpolate ds to the location determined by the row values
  3. predict the new lon and lat using the interpolated wdir, wspd, and the time step (delta_wind) between time and end_time (an illustrative sketch of such an update follows this list).
  4. repeat until end_time is reached, then save the predicted lon and lat.
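
(The update in step 3 is the part that the example below simplifies to lon/2, lat/2. For illustration only, a realistic predict_loc could look roughly like the sketch below; it assumes wdir is a meteorological wind direction in degrees, wspd is in m/s, and uses a crude metres-to-degrees conversion. The exact formula is not important for the performance question.)

import numpy as np

def predict_loc_sketch(lon, lat, wdir, wspd, delta):
    # meteorological convention: wdir is the direction the wind blows FROM (degrees)
    u = -wspd * np.sin(np.deg2rad(wdir))   # eastward wind component (m/s)
    v = -wspd * np.cos(np.deg2rad(wdir))   # northward wind component (m/s)
    # displacement over `delta` seconds, crudely converted to degrees (~111 km per degree)
    lon2 = lon + u * delta / (111e3 * np.cos(np.deg2rad(lat)))
    lat2 = lat + v * delta / 111e3
    return lon2, lat2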

To make it easier to understand, I wrote this simple example.

The core part is the iteration over each row after the # --- Sample Data end --- line.

import pandas as pd
import xarray as xr
import numpy as np

# !!!! edit len_time to test the speed !!!!
len_time = 2000

# --- Sample Data ---

# Two functions for creating sample data
def random_dates(start, end, n=10):
    # generate n random timestamps between start and end
    start_u = start.value//10**9
    end_u = end.value//10**9

    return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

def predict_loc(lon, lat, wdir, wspd, delta):
    # some calculation depending on the inputs; it is simplified here
    lon2 = lon/2; lat2 = lat/2

    return lon2, lat2

# create 3d DataArray as the look up array
da = xr.DataArray(np.abs(np.random.randn(600).reshape(6, 10, 10)),
                [("time",  pd.date_range("20130101", periods=6, freq="1H")),
                 ("lon", range(10)),
                 ("lat", range(10))
                 ],
)

# create two DataArrays (wind direction and wind speed) and merge them into one Dataset
ds = xr.merge([da.rename('wdir')/2, da.rename('wspd')])

# the end time of prediction
end_time = pd.Timestamp('2013-01-01 03:10')

# create times
times = random_dates(pd.to_datetime('2013-01-01 00:00'),
                     pd.to_datetime('2013-01-01 02:00'),
                     n=len_time)

# create DataFrame
df = pd.DataFrame(times, columns=['time'])
df['lon'] = np.random.randint(low=0, high=9, size=(len_time))
df['lat'] = np.random.randint(low=0, high=9, size=(len_time))

# --- Sample Data end ---

# create empty lists for saving results
lons, lats = [], []

# iterate each row
for _, row in df.iterrows():
    # because the ds is hourly data, we need to create the hourly time steps
    times = np.concatenate(([row.time.to_pydatetime()],
                            pd.date_range(row.time.ceil('h'),
                                          end_time.floor('h'),
                                          freq='H').to_pydatetime(),
                            [end_time.to_pydatetime()]))

    # calculate the delta seconds
    delta_wind = [t.total_seconds() for t in np.diff(times)]

    # get the beginning location (lon/lat) of the row
    lat, lon = row.lat, row.lon

    # predict the location by each time step
    for t_index, time in enumerate(times[:-1]):
        # interpolate to the location at each time
        data = ds.interp(time=time, lon=lon, lat=lat)
        lon, lat = predict_loc(lon, lat, data['wdir'], data['wspd'], delta_wind[t_index])

    # save the new location at the end of time
    lons.append(lon)
    lats.append(lat)

# add prediction results to DataFrame
df['lon_pred'] = lons
df['lat_pred'] = lats

When len_time is increased to 1000 or more, it gets really slow.

Any idea how to improve it?


1 Answer


TL;DR

  1. Use scipy.interpolate.interpn according to this answer.

  2. Use parallel_apply provided by pandarallel.

This cuts the time from ~55 s to ~3 s for 2000 rows.

Method 1: xr.interp

df['delta'] = df.apply(lambda row: get_delta_time(row['time']), axis=1)
df = df.apply(lambda row: predict_loc_xr_apply(row, wdir, wspd), axis=1)

Cost Time:

54.1 s ± 697 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Method 2: scipy.interpolate.interpn

df['delta'] = df.apply(lambda row: get_delta_time(row['time']), axis=1)
df = df.apply(lambda row: predict_loc_scipy_apply(row, wdir, wspd), axis=1)

Cost Time:

12 s ± 623 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Method 3: scipy.interpolate.interpn with pandarallel
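
pandarallel has to be imported and initialized once before parallel_apply is available on a DataFrame; a minimal setup (using the defaults) looks like this:

from pandarallel import pandarallel

# registers DataFrame.parallel_apply; by default it uses all available CPU cores
pandarallel.initialize()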

df['delta'] = df.parallel_apply(lambda row: get_delta_time(row['time']), axis=1)
df = df.parallel_apply(lambda row: predict_loc_scipy_apply(row, wdir, wspd), axis=1)

Cost Time:

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

3.41 s ± 78.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

I also tested Swifter, but it was slower than the plain apply.

Used functions

from scipy.interpolate import interpn

def get_delta_time(begin_time):
    # because the ds is hourly data, we need to create the hourly time steps
    # (end_time is the global defined in the question)
    times = np.concatenate(([pd.Timestamp(begin_time).to_pydatetime()],
                            pd.date_range(begin_time.ceil('h'),
                                          end_time.floor('h'),
                                          freq='H').to_pydatetime(),
                            [end_time.to_pydatetime()]))
    # calculate the delta seconds between consecutive steps
    delta_wind = [t.total_seconds() for t in np.diff(times)]

    return delta_wind


def predict_loc_xr_apply(row, wdir, wspd):
    # copy the initial location
    # (the wdir/wspd arrays are unused here; ds itself is interpolated)
    lon_pred = row['lon']
    lat_pred = row['lat']
    time_pred = row['time']

    for delta in row['delta']:
        # interpolate at the current time and location
        ds_interp = ds.interp(time=time_pred, lon=lon_pred, lat=lat_pred)
        wdir_interp = ds_interp['wdir']
        wspd_interp = ds_interp['wspd']

        # update the location (simplified stand-in for the real predict_loc,
        # which would use wdir_interp, wspd_interp and delta)
        lon_pred /= 2
        lat_pred /= 2

        time_pred = time_pred + pd.Timedelta(seconds=delta)

    row['lon_pred'] = lon_pred
    row['lat_pred'] = lat_pred

    return row

def predict_loc_scipy_apply(row, wdir, wspd):
    # copy the initial location
    lon_pred = row['lon']
    lat_pred = row['lat']
    time_pred = row['time']

    for delta in row['delta']:
        # interpolate at the current time and location
        # (time is cast to the same float [ns] representation used in coords[0])
        interp_points = [xr.DataArray(time_pred).astype('float').values, lon_pred, lat_pred]
        wdir_pred = interpn(coords, wdir, interp_points)
        wspd_pred = interpn(coords, wspd, interp_points)

        # update the location (simplified stand-in for the real predict_loc,
        # which would use wdir_pred, wspd_pred and delta)
        lon_pred /= 2
        lat_pred /= 2

        time_pred = time_pred + pd.Timedelta(seconds=delta)

    row['lon_pred'] = lon_pred
    row['lat_pred'] = lat_pred

    return row


# the grid and values below must be defined before calling the functions above;
# the time coordinate is cast to float (nanoseconds) so interpn gets a numeric grid
coords = [ds.time.astype(float).values, ds.lon.values, ds.lat.values]
wdir = ds['wdir'].values
wspd = ds['wspd'].values
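
As a quick sanity check on my assumption that the coordinate order and the float time conversion are consistent, a single interpn call should agree with ds.interp at the same point (using the coords and wdir defined above):

t0 = ds.time[0]
point = [t0.astype(float).values, 2.5, 3.5]
print(interpn(coords, wdir, point))                          # scipy result
print(ds['wdir'].interp(time=t0, lon=2.5, lat=3.5).values)   # xarray result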