Polars > 0.13.50
(Python)
We can combine the diff
, when/then/otherwise
, and over
in the same context.
(
df
.sort(["ts"])
.with_column(
pl.when(pl.col("value").diff(1) >= 0)
.then(pl.col("value").diff(1))
.otherwise(pl.col("value"))
.cumsum()
.over("group")
.alias("valueCounter")
)
)
shape: (12, 4)
┌───────┬─────────────────────┬───────┬──────────────┐
│ group ┆ ts ┆ value ┆ valueCounter │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ datetime[μs] ┆ i64 ┆ i64 │
╞═══════╪═════════════════════╪═══════╪══════════════╡
│ a ┆ 2022-06-08 00:00:01 ┆ 0 ┆ 0 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 2022-06-08 00:00:06 ┆ 2 ┆ 2 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 2022-06-08 00:00:11 ┆ 3 ┆ 3 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 2022-06-08 00:00:16 ┆ 6 ┆ 6 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 2022-06-08 00:00:21 ┆ 2 ┆ 8 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 2022-06-08 00:00:26 ┆ 4 ┆ 10 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:31 ┆ 7 ┆ 7 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:36 ┆ 9 ┆ 9 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:41 ┆ 3 ┆ 12 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:46 ┆ 4 ┆ 13 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:51 ┆ 6 ┆ 15 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:56 ┆ 7 ┆ 16 │
└───────┴─────────────────────┴───────┴──────────────┘
For polars <= 0.13.50
(Python)
We can use the partition_by
feature to partition our dataset into a list of DataFrames, one DataFrame for each group. We perform our calculations on each DataFrame and then concatenate the results into a single DataFrame.
pl.concat(
[
sub_df
.sort(["ts"])
.with_columns(
pl.when(pl.col("value").diff(1) >= 0)
.then(pl.col("value").diff(1))
.otherwise(pl.col("value"))
.cumsum()
.alias("valueCounter")
)
for sub_df in df.partition_by(groups=["group"])
]
)
shape: (12, 4)
┌───────┬─────────────────────┬───────┬──────────────┐
│ group ┆ ts ┆ value ┆ valueCounter │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ datetime[μs] ┆ i64 ┆ i64 │
╞═══════╪═════════════════════╪═══════╪══════════════╡
│ a ┆ 2022-06-08 00:00:01 ┆ 0 ┆ 0 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 2022-06-08 00:00:06 ┆ 2 ┆ 2 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 2022-06-08 00:00:11 ┆ 3 ┆ 3 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 2022-06-08 00:00:16 ┆ 6 ┆ 6 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 2022-06-08 00:00:21 ┆ 2 ┆ 8 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 2022-06-08 00:00:26 ┆ 4 ┆ 10 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:31 ┆ 7 ┆ 7 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:36 ┆ 9 ┆ 9 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:41 ┆ 3 ┆ 12 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:46 ┆ 4 ┆ 13 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:51 ┆ 6 ┆ 15 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2022-06-08 00:00:56 ┆ 7 ┆ 16 │
└───────┴─────────────────────┴───────┴──────────────┘
You can use the maintain_order=True
option on the partition_by
to keep the groups in the same order as your input if you need, but it does come with a performance penalty.
Edit
The proposed solution first sorts and then groups via over(). The requirement is first group then sort within each group. Does the over() maintains previously sorted order within each group?
Yes, the over
windowing expression will maintain the previously sorted order within each group. Indeed, over
will also maintain the ordering of the entire DataFrame itself.
For example, let's change our starting DataFrame, so that the timestamps within each group are identical (so that they overlap). We'll choose the timestamps so that the observations maintain the relative ordering within each group as above, so that the result in the valueCounter
field should be the same as the results above. And we'll order the DataFrame so that the observations in the groups are intermingled.
We'll also add a row number so that we can see the effects of the over
on the ordering of the DataFrame.
df = (
pl.DataFrame(
{
"group": ["a", "b"] * 6,
"ts": [
"2022-06-08T00:00:01",
"2022-06-08T00:00:01",
"2022-06-08T00:00:06",
"2022-06-08T00:00:06",
"2022-06-08T00:00:11",
"2022-06-08T00:00:11",
"2022-06-08T00:00:16",
"2022-06-08T00:00:16",
"2022-06-08T00:00:21",
"2022-06-08T00:00:21",
"2022-06-08T00:00:26",
"2022-06-08T00:00:26",
],
"value": [0, 7, 2, 9, 3, 3, 6, 4, 2, 6, 4, 7, ],
}
)
.with_column(pl.col("ts").str.strptime(pl.Datetime))
.with_row_count()
)
df
shape: (12, 4)
┌────────┬───────┬─────────────────────┬───────┐
│ row_nr ┆ group ┆ ts ┆ value │
│ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ datetime[μs] ┆ i64 │
╞════════╪═══════╪═════════════════════╪═══════╡
│ 0 ┆ a ┆ 2022-06-08 00:00:01 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1 ┆ b ┆ 2022-06-08 00:00:01 ┆ 7 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ a ┆ 2022-06-08 00:00:06 ┆ 2 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3 ┆ b ┆ 2022-06-08 00:00:06 ┆ 9 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 4 ┆ a ┆ 2022-06-08 00:00:11 ┆ 3 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 5 ┆ b ┆ 2022-06-08 00:00:11 ┆ 3 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 6 ┆ a ┆ 2022-06-08 00:00:16 ┆ 6 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 7 ┆ b ┆ 2022-06-08 00:00:16 ┆ 4 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 8 ┆ a ┆ 2022-06-08 00:00:21 ┆ 2 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 9 ┆ b ┆ 2022-06-08 00:00:21 ┆ 6 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 10 ┆ a ┆ 2022-06-08 00:00:26 ┆ 4 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 11 ┆ b ┆ 2022-06-08 00:00:26 ┆ 7 │
└────────┴───────┴─────────────────────┴───────┘
I've purposely constructed the above DataFrame so that it is already sorted only by ts
(and not by group
and ts
). That way, we do not need to sort, and can focus on the effects of over
on the order.
Now, I'll comment out the sort
and run the algorithm.
result = (
df
# .sort(["ts"])
.with_column(
pl.when(pl.col("value").diff(1) >= 0)
.then(pl.col("value").diff(1))
.otherwise(pl.col("value"))
.cumsum()
.over("group")
.alias("valueCounter")
)
)
result
>>> result
shape: (12, 5)
┌────────┬───────┬─────────────────────┬───────┬──────────────┐
│ row_nr ┆ group ┆ ts ┆ value ┆ valueCounter │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ datetime[μs] ┆ i64 ┆ i64 │
╞════════╪═══════╪═════════════════════╪═══════╪══════════════╡
│ 0 ┆ a ┆ 2022-06-08 00:00:01 ┆ 0 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ b ┆ 2022-06-08 00:00:01 ┆ 7 ┆ 7 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ a ┆ 2022-06-08 00:00:06 ┆ 2 ┆ 2 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ b ┆ 2022-06-08 00:00:06 ┆ 9 ┆ 9 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ a ┆ 2022-06-08 00:00:11 ┆ 3 ┆ 3 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ b ┆ 2022-06-08 00:00:11 ┆ 3 ┆ 12 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ a ┆ 2022-06-08 00:00:16 ┆ 6 ┆ 6 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ b ┆ 2022-06-08 00:00:16 ┆ 4 ┆ 13 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ a ┆ 2022-06-08 00:00:21 ┆ 2 ┆ 8 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 9 ┆ b ┆ 2022-06-08 00:00:21 ┆ 6 ┆ 15 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 10 ┆ a ┆ 2022-06-08 00:00:26 ┆ 4 ┆ 10 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 11 ┆ b ┆ 2022-06-08 00:00:26 ┆ 7 ┆ 16 │
└────────┴───────┴─────────────────────┴───────┴──────────────┘
You'll notice that the over
expression has not changed the ordering of the DataFrame (and thus has also maintained the ordering of the observations within each group).
Now, let's sort the result by group
and ts
, so that we can more easily compare the results here with the results above.
result.sort(['group','ts'])
shape: (12, 5)
┌────────┬───────┬─────────────────────┬───────┬──────────────┐
│ row_nr ┆ group ┆ ts ┆ value ┆ valueCounter │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ datetime[μs] ┆ i64 ┆ i64 │
╞════════╪═══════╪═════════════════════╪═══════╪══════════════╡
│ 0 ┆ a ┆ 2022-06-08 00:00:01 ┆ 0 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ a ┆ 2022-06-08 00:00:06 ┆ 2 ┆ 2 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ a ┆ 2022-06-08 00:00:11 ┆ 3 ┆ 3 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ a ┆ 2022-06-08 00:00:16 ┆ 6 ┆ 6 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ a ┆ 2022-06-08 00:00:21 ┆ 2 ┆ 8 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 10 ┆ a ┆ 2022-06-08 00:00:26 ┆ 4 ┆ 10 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ b ┆ 2022-06-08 00:00:01 ┆ 7 ┆ 7 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ b ┆ 2022-06-08 00:00:06 ┆ 9 ┆ 9 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ b ┆ 2022-06-08 00:00:11 ┆ 3 ┆ 12 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ b ┆ 2022-06-08 00:00:16 ┆ 4 ┆ 13 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 9 ┆ b ┆ 2022-06-08 00:00:21 ┆ 6 ┆ 15 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 11 ┆ b ┆ 2022-06-08 00:00:26 ┆ 7 ┆ 16 │
└────────┴───────┴─────────────────────┴───────┴──────────────┘
We see that the results in valueCounter
are the same as the results above.
We could have started our algorithm by sorting our DataFrame by both group
and ts
, but it is not strictly needed. We simply need to order the observations within each group the way we need them. The over
expression will maintain the relative ordering within each group as it runs the algorithm.
In general, I avoid sorting a DataFrame more than is strictly necessary. As DataFrames get large, it becomes costly to sort.