I have a dataframe (pandas):
import pandas as pd
df = pd.DataFrame({'A': ['x1', 'x2', 'x3', 'x4'],
'B': ['b', 'b', 'c', 'c'],
'C': ['d', 'd', 'e', 'e'],
'D': ['x', 'y', 'y', 'x'],})
I want to merge the values of all columns where the values in A are unique.
ouput = pd.DataFrame({'A': ['x1', 'x2', 'x3', 'x4'],
'BC': ['bd', 'bd', 'ce', 'ce'],
'D': ['x', 'y', 'y', 'x'],})
It would be best to have a solution that works independently of column names B, C (perhaps there are also more columns with this "redundant information"). The column name of A is known.
Given the case that my initial dataframe is:
df = pd.DataFrame({'A': ['x1', 'x2', 'x3', 'x4'],
'B': ['b', 'b', 'c', 'c'],
'C': ['d', 'd', 'd', 'e'],
'D': ['x', 'y', 'y', 'x'],})
the desired output is the initial df (no change):
df = pd.DataFrame({'A': ['x1', 'x2', 'x3', 'x4'],
'B': ['b', 'b', 'c', 'c'],
'C': ['d', 'd', 'd', 'e'],
'D': ['x', 'y', 'y', 'x'],})
Many thanks!
Full solution (thanks to Robby the Belgian):
import pandas as pd
df = pd.DataFrame({'A': ['x1', 'x2', 'x3', 'x4'],
'B': ['b', 'b', 'c', 'c'],
'C': ['d', 'd', 'e', 'e'],
'D': ['x', 'y', 'y', 'x']})
print(df)
def is_redundant(df, A, B):
#remove column a
A=A
B=B
if len(df.groupby(f'{A}')) == len(df.groupby([f'{A}', f'{B}'])):
return True
else:
return False
def drop_redundant(df, redundant_groups):
list=redundant_groups
for i in list:
if len(df.groupby(f'{i[0]}')) == len(df.groupby([f'{i[0]}', f'{i[1]}'])):
df[f'{i[0]}' + f'{i[1]}'] = df[[f'{i[0]}', f'{i[1]}']].sum(axis=1)
df.drop([f'{i[0]}', f'{i[1]}'], axis=1, inplace=True)
return(df)
else:
return(df)
cols = [c for c in df.columns if c != 'A']
redundant_groups = []
idx_left = 0
while idx_left < len(cols)-1:
new_group = []
idx_right = idx_left+1
while idx_right < len(cols):
if is_redundant(df, cols[idx_left], cols[idx_right]):
new_group.append(cols.pop(idx_right))
else:
idx_right += 1
if new_group:
redundant_groups.append(new_group + [cols[idx_left]])
idx_left += 1
print(redundant_groups)
drop_redundant(df, redundant_groups)
print(df)
Output:
A B C D
0 x1 b d x
1 x2 b d y
2 x3 c e y
3 x4 c e x
[['C', 'B']]
A D CB
0 x1 x db
1 x2 y db
2 x3 y ec
3 x4 x ec
[Finished in 0.837s]