I have a working code with a for-loop that I would like to optimise in terms of speed (the project has been rejected because it's slow).
This is my situation: I have an Excel file from which I import a table with thousands of rows and a few columns into a Pandas DataFrame. The first column is a monotonic increasing series of timestamps, with 15 minutes frequency.
I have to use this table to calculate few extra columns, append them to the original table and save the resulting DataFrame into a new Excel file.
What is slowing the code is the core calculation of the extra columns. Here is a snippet of the working code:
import pandas as pd
from datetime import timedelta as td
cons_prod = pd.read_csv("sample.csv", index_col=0, parse_dates=True)
soc_dct = {} # State of charge (kW)
charge_dct = {} # Charge (kW)
discharge_dct = {} # Discharge (kW)
acc_dct = {} # Auto-consumption NEW (kW)
lst_dct = {} # Lost injection due to battery efficiency (kW)
inj_dct = {} # Injection NEW (kW)
gridcons_dct = {} # Grid Consumption NEW (kW)
agg_dct = {} # Additional Auto-consumption through battery (kW)
battery_parameters = {
"power": 50,
"energy": 130,
"efficiency": 0.9,
"minsoc": 0.1,
"soct0": 65.0,
}
bp_energy = battery_parameters["energy"]
bp_power = battery_parameters["power"]
soct0 = 0.5 * bp_energy
for t in cons_prod.index:
L = cons_prod.loc[t, "Injection (kW)"]
m = cons_prod.loc[t, "Grid Consumption (kW)"]
k = cons_prod.loc[t, "Auto-consumption (kW)"]
f = cons_prod.loc[t, "Consumption (kW)"]
if t == cons_prod.index[0]:
# State of charge (kW)
soc_dct.setdefault(t, soct0)
# Charge (kW)
charge_dct.setdefault(
t,
min(
L,
(bp_energy - soc_dct[t]) * 4,
bp_power,
)
* battery_parameters["efficiency"]
if L >= 0
else 0,
)
# Discharge (kW)
discharge_dct.setdefault(
t,
-min(
m,
4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
bp_power,
)
if m >= 0
else 0,
)
# Auto-consumption NEW (kW)
acc_dct.setdefault(t, k - discharge_dct[t])
# Lost injection due to battery efficiency (kW)
lst_dct.setdefault(
t,
(charge_dct[t] / battery_parameters["efficiency"]) - charge_dct[t],
)
# Injection NEW (kW)
inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])
# Grid Consumption NEW (kW)
gridcons_dct.setdefault(t, f - acc_dct[t])
# Additional Auto-consumption through battery (kW)
agg_dct.setdefault(t, acc_dct[t] - k)
else:
# State of charge (kW)
soc_dct.setdefault(
t,
soc_dct[t - td(minutes=15)]
+ (charge_dct[t - td(minutes=15)] + discharge_dct[t - td(minutes=15)]) / 4,
)
# Charge (kW)
charge_dct.setdefault(
t,
min(
L,
(bp_energy - soc_dct[t]) * 4,
bp_power,
)
* battery_parameters["efficiency"]
if L >= 0
else 0,
)
# Discharge (kW)
discharge_dct.setdefault(
t,
-min(
m,
4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
bp_power,
)
if m >= 0
else 0,
)
# Auto-consumption NEW (kW)
acc_dct.setdefault(t, k - discharge_dct[t])
# Lost injection due to battery efficiency (kW)
lst_dct.setdefault(
t, charge_dct[t] / battery_parameters["efficiency"] - charge_dct[t]
)
# Injection NEW (kW)
inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])
# Grid Consumption NEW (kW)
gridcons_dct.setdefault(t, f - acc_dct[t])
# Additional Auto-consumption through battery (kW)
agg_dct.setdefault(t, acc_dct[t] - k)
# Creating a DataFrame with all the values
output_df = pd.DataFrame(
data=[
soc_dct,
charge_dct,
discharge_dct,
acc_dct,
lst_dct,
inj_dct,
gridcons_dct,
agg_dct,
]
).T
output_df.columns = [
"State of charge (kW)",
"Charge (kW)",
"Discharge (kW)",
"Auto-consumption NEW (kW)",
"Lost injection due to battery efficiency (kW)",
"Injection NEW (kW)",
"Grid Consumption NEW (kW)",
"Additional Auto-consumption through battery (kW)",
]
charge_dct = {} # Charge (kW)
discharge_dct = {} # Discharge (kW)
acc_dct = {} # Auto-consumption NEW (kW)
lst_dct = {} # Lost injection due to battery efficiency (kW)
inj_dct = {} # Injection NEW (kW)
gridcons_dct = {} # Grid Consumption NEW (kW)
agg_dct = {} # Additional Auto-consumption through battery (kW)
for t in cons_prod.index:
L = cons_prod.loc[t, "Injection (kW)"]
m = cons_prod.loc[t, "Grid Consumption (kW)"]
k = cons_prod.loc[t, "Auto-consumption (kW)"]
f = cons_prod.loc[t, "Consumption (kW)"]
if t == cons_prod.index[0]:
# State of charge (kW)
soc_dct.setdefault(t, soct0)
# Charge (kW)
charge_dct.setdefault(
t,
min(
L,
(bp_energy - soc_dct[t]) * 4,
bp_power,
)
* battery_parameters["efficiency"]
if L >= 0
else 0,
)
# Discharge (kW)
discharge_dct.setdefault(
t,
-min(
m,
4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
bp_power,
)
if m >= 0
else 0,
)
# Auto-consumption NEW (kW)
acc_dct.setdefault(t, k - discharge_dct[t])
# Lost injection due to battery efficiency (kW)
lst_dct.setdefault(
t,
(charge_dct[t] / battery_parameters["efficiency"]) - charge_dct[t],
)
# Injection NEW (kW)
inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])
# Grid Consumption NEW (kW)
gridcons_dct.setdefault(t, f - acc_dct[t])
# Additional Auto-consumption through battery (kW)
agg_dct.setdefault(t, acc_dct[t] - k)
else:
# State of charge (kW)
soc_dct.setdefault(
t,
soc_dct[t - td(minutes=15)]
+ (charge_dct[t - td(minutes=15)] + discharge_dct[t - td(minutes=15)]) / 4,
)
# Charge (kW)
charge_dct.setdefault(
t,
min(
L,
(bp_energy - soc_dct[t]) * 4,
bp_power,
)
* battery_parameters["efficiency"]
if L >= 0
else 0,
)
# Discharge (kW)
discharge_dct.setdefault(
t,
-min(
m,
4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
bp_power,
)
if m >= 0
else 0,
)
# Auto-consumption NEW (kW)
acc_dct.setdefault(t, k - discharge_dct[t])
# Lost injection due to battery efficiency (kW)
lst_dct.setdefault(
t, charge_dct[t] / battery_parameters["efficiency"] - charge_dct[t]
)
# Injection NEW (kW)
inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])
# Grid Consumption NEW (kW)
gridcons_dct.setdefault(t, f - acc_dct[t])
# Additional Auto-consumption through battery (kW)
agg_dct.setdefault(t, acc_dct[t] - k)
# Creating a DataFrame with all the values
output_df = pd.DataFrame(
data=[
soc_dct,
charge_dct,
discharge_dct,
acc_dct,
lst_dct,
inj_dct,
gridcons_dct,
agg_dct,
]
).T
output_df.columns = [
"State of charge (kW)",
"Charge (kW)",
"Discharge (kW)",
"Auto-consumption NEW (kW)",
"Lost injection due to battery efficiency (kW)",
"Injection NEW (kW)",
"Grid Consumption NEW (kW)",
"Additional Auto-consumption through battery (kW)",
]
cons_prod
is the table imported into a DataFrame.
As you can see, we have two situations: when t == cons_prod.index[0]
(i.e., the first item of the timestamps), the calculations use the values at the same t
. However, from the second timestamp on, some of the calculations refer to the previous value (in here referred to the 15 minutes before using the index t - td(minutes=15)
).
These are the reasons why I am struggling to move away from the for-loop.
A few explanations to possible questions
- Q: Why are you using dictionaries? A: Because I figured that they fill faster than other data types, and I can use them to create a DataFrame later on.
- Q: Are the timestamps coherent, e.g. do they have missing values? A: No missing values, as there is a previous function I wrote to make sure that the timestamps are filled completely.
- Q: The calculations do not currently refer to a previous row in a DataFrame, but in a dictionary! Why the misleading title? A: This is the best solution I could come up with so far, but I am wondering whether my not-so-complete knowledge of Pandas is hiding a simpler, faster solution.
Hope the framework is clear.
Thank you in advance!
EDIT: as per request, added a 100-lines sample of cons_prod
and modified the previous code to meet the requirements of a MRE.
EDIT 2: I have tried moving from dictionaries to a Pandas lookup, trying to optimise as much as possible. This is the code I came up with:
from time import time as tt
cp = cons_prod.copy(deep=True)
# Initialise the columns filling them with zeroes
cp["State of charge (kW)"] = 0
cp["Charge (kW)"] = 0
cp["Discharge (kW)"] = 0
# Storing the position of the columns in variables
cp_soc = cp.columns.get_loc("State of charge (kW)")
cp_charge = cp.columns.get_loc("Charge (kW)")
cp_discharge = cp.columns.get_loc("Discharge (kW)")
cp_inj = cp.columns.get_loc("Injection (kW)")
cp_gridcons = cp.columns.get_loc("Grid Consumption (kW)")
# Storing the values of the battery dictionary lookups in variables
bp_energy = dct_bp["energy"]
bp_power = dct_bp["power"]
bp_efficiency = dct_bp["efficiency"]
bp_soct0 = dct_bp["soct0"]
bp_minsoc = dct_bp["minsoc"]
start1 = tt() # Measuring time
for row in cp.itertuples(name=None): # Using itertuples to gain some speed
L = cp.loc[row[0], "Injection (kW)"]
m = cp.loc[row[0], "Grid Consumption (kW)"]
k = cp.loc[row[0], "Auto-consumption (kW)"]
f = cp.loc[row[0], "Consumption (kW)"]
if row[0] == cp.index[0]:
cp.iloc[0, cp_soc] = bp_soct0
cp.iloc[0, cp_charge] = float(
min(L, (bp_energy - bp_soct0) * 4, bp_power) * bp_efficiency
if L >= 0
else 0,
)
cp.iloc[0, cp_discharge] = float(
-min(
m,
4 * bp_soct0 - 4 * (bp_minsoc * bp_energy),
bp_power,
)
if m >= 0
else 0
)
else:
t = pd.Index(cp.index).get_loc(row[0])
cp.iloc[t, cp_soc] = float(
cp.iloc[t - 1, cp_soc]
+ (cp.iloc[t - 1, cp_charge] + cp.iloc[t - 1, cp_discharge]) / 4
)
cp.iloc[t, cp_charge] = float(
min(L, (bp_energy - cp.iloc[t, cp_soc]) * 4, bp_power) * bp_efficiency
if L >= 0
else 0,
)
cp.iloc[t, cp_discharge] = float(
-min(
m,
4 * cp.iloc[t, cp_soc] - 4 * (dct_bp["minsoc"] * bp_energy),
bp_power,
)
if m >= 0
else 0
)
end1 = tt() - start1
print(f"Pandas lookup took {end1:.2f} seconds")
With this code, I got an average of 42 seconds per complete task, while I used to have <20 seconds with dictionaries.