I have a pandas DataFrame that I need to group, storing the resulting groups in a new array. Every stored group must have a size within a specific range: if a group does not reach the minimum size, it should be merged into one of the previous groups (the one with the smallest size). For example, after grouping the data I will have groups G with `len(G) <= b`, `len(G) >= a`, or `a <= len(G) <= b`. So, I need to make the groups with `len(G) >= a` meet the condition `a <= len(G) <= b`.
The code below runs as it stands, so I would like to know whether there is a more convenient way to do this.
import numpy as np
import pandas as pd
rng = np.random.default_rng()  # Just for testing
df = pd.DataFrame(rng.integers(0, 10, size=(1000, 4)), columns=list('ABCD'))

# Split the frame into one sub-frame per distinct value of the fourth column.
ans = [group for _, group in df.groupby(df.columns[3])]

n = 20  # Chunk size; a merged leftover of up to 5 rows can bring a group to 25.
min_size = 5  # Leftover chunks with at most this many rows are merged, not kept.

new_array = []
for group in ans:
    if len(group) <= n:
        # Already within the chunk size: keep the group unchanged.
        new_array.append(group)
        continue

    # Shuffle the rows, then cut the oversized group into chunks of n rows.
    df_shuffled = group.sample(frac=1)
    for start in range(0, len(df_shuffled), n):
        chunk = df_shuffled[start:start + n]
        if len(chunk) <= min_size and new_array:
            # Fold the small leftover into the most recently stored group.
            # The first chunk of an oversized group always holds n rows, so
            # with n > min_size that stored group comes from the same source.
            # pd.concat stacks the rows; `+` would instead add the frames
            # element-wise after index alignment, producing NaN everywhere
            # because the two chunks share no row labels.
            new_array[-1] = pd.concat([new_array[-1], chunk])
        else:
            new_array.append(chunk)

# Kept for compatibility: the number of groups stored in new_array.
new_arrayi_index = len(new_array)

# Report every resulting group with its position and size.
count_index_ = 0
for count_index, group in enumerate(new_array):
    print("count", count_index, "Size", len(group))
    print(group)
    # NOTE(review): this accumulates the group positions, not the group
    # sizes -- confirm that a total row count was not what was intended.
    count_index_ += count_index
print(count_index_)