I want to create features(additional columns) from a dataframe and I have the following structure for many functions.
Following this documentation https://docs.dask.org/en/stable/delayed-best-practices.html I have come up with the code below.
However I get the error message: concurrent.futures._base.CancelledError and many times I get the warning: distributed.utils_perf - WARNING - full garbage collections took 10% CPU time recently (threshold: 10%)
I understand that the object I am appending to delay is very large(it works ok when I use the commented out df) which is why the program crashes but is there a better way of doing it?
import pandas as pd
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
import numpy as np
import dask
def main():
#df = pd.DataFrame({"col1": np.random.randint(1, 100, 100000), "col2": np.random.randint(101, 200, 100000), "col3": np.random.uniform(0, 4, 100000)})
df = pd.DataFrame({"col1": np.random.randint(1, 100, 100000000), "col2": np.random.randint(101, 200, 100000000), "col3": np.random.uniform(0, 4, 100000000)})
ddf = dd.from_pandas(df, npartitions=100)
ddf = ddf.set_index("col1")
delay = []
def create_col_sth():
group = ddf.groupby("col1")["col3"]
@dask.delayed
def small_fun(lag):
return f"col_{lag}", group.transform(lambda x: x.shift(lag), meta=('x', 'float64')).apply(lambda x: np.log(x), meta=('x', 'float64'))
for lag in range(5):
x = small_fun(lag)
delay.append(x)
create_col_sth()
delayed = dask.compute(*delay)
for data in delayed:
ddf[data[0]] = data[1]
ddf.to_parquet("test", engine="fastparquet")
if __name__ == "__main__":
cluster = LocalCluster(n_workers=6,
threads_per_worker=2,
memory_limit='8GB')
client = Client(cluster)
main()