I'm fairly new to xarray and I'm currently trying to use it to subset some NetCDF files. I'm running this on a shared server and would like to know how best to limit the processing power used by xarray so that it plays nicely with others. I've read through the dask and xarray documentation a bit, but it isn't clear to me how to set a cap on CPUs/threads. Here's an example of a spatial subset:
import glob
import os
import xarray as xr
from multiprocessing.pool import ThreadPool
import dask
wd = os.getcwd()
test_data = os.path.join(wd, 'test_data')
lat_bnds = (43, 50)
lon_bnds = (-67, -80)
output = 'test_data_subset'
def subset_nc(ncfile, lat_bnds, lon_bnds, output):
if not glob.os.path.exists(output):
glob.os.makedirs(output)
outfile = os.path.join(output, os.path.basename(ncfile).replace('.nc', '_subset.nc'))
with dask.config.set(scheduler='threads', pool=ThreadPool(5)):
ds = xr.open_dataset(ncfile, decode_times=False)
ds_sub = ds.where(
(ds.lon >= min(lon_bnds)) & (ds.lon <= max(lon_bnds)) & (ds.lat >= min(lat_bnds)) & (ds.lat <= max(lat_bnds)),
drop=True)
comp = dict(zlib=True, complevel=5)
encoding = {var: comp for var in ds.data_vars}
ds_sub.to_netcdf(outfile, format='NETCDF4', encoding=encoding)
list_files = glob.glob(os.path.join(test_data, '*'))
print(list_files)
for i in list_files:
subset_nc(i, lat_bnds, lon_bnds, output)
I've tried a few variations on this by moving the ThreadPool configuration around, but I still see far too much activity in the server's `top` output (>3000% CPU). I'm not sure where the issue lies.