2

I have a pandas dataframe like where the first four columns form a multiindex:

import pandas as pd
data = [[1, 'A', 1, 0, 10],
        [1, 'A', 0, 1, 10],
        [1, 'A', 1, 1, 10],
        [1, 'A', 0, 0, 10],
        [1, 'B', 1, 0, 10],
        [1, 'B', 0, 1, 10],
        [1, 'B', 1, 1, 10],
        [1, 'B', 0, 0, 10]]
cols = ['user_id','type','flag1','flag2','cnt']
df = pd.DataFrame(data,columns = cols)
df = df.set_index(['user_id','type','flag1','flag2'])
print df

user_id    type    flag1    flag2    cnt
________________________________________
1          A       1        0        10
1          A       0        1        10
1          A       1        1        10
1          A       0        0        10
1          B       1        0        10
1          B       0        1        10
1          B       1        1        10
1          B       0        0        10

I'd like to iterate over the index values to get the grouped total count for each unique index values like so:

user_id    type    flag1    flag2    cnt
________________________________________
1          ALL     ALL      ALL      80
1          ALL     ALL      0        40
1          ALL     ALL      1        40
1          ALL     1        ALL      40
1          ALL     0        ALL      40
1          A       ALL      ALL      40
1          B       ALL      ALL      40
1          A       ALL      0        20
1          A       ALL      1        20
1          B       ALL      0        20
1          B       ALL      1        20
1          A       1        ALL      20
1          A       0        ALL      20
1          B       1        ALL      20
1          B       0        ALL      20
1          A       1        0        10
1          A       0        1        10
1          A       1        1        10
1          A       0        0        10
1          B       1        0        10
1          B       0        1        10
1          B       1        1        10
1          B       0        0        10

I'm able to generate each group easily using query and groupby, but ideally I'd like to be able to iterate over any number of index columns to get the sum of the cnt column.

3 Answers3

1
#build all groupby key combinations
import itertools
keys = ['user_id', 'type', 'flag1', 'flag2']
key_combos = [c for i in range(len(keys)) for c in itertools.combinations(keys, i+1)]
#make sure only select the combos with 'user_id' in it
key_combos = [list(e) for e in key_combos if 'user_id' in e]
#groupby using all groupby keys and concatenate the results to a Dataframe
df2 = pd.concat([df.groupby(by=key).cnt.sum().to_frame().reset_index() for key in sorted(key_combos)])
#Fill na with ALL and re-order columns
df2.fillna('ALL')[['user_id','type','flag1','flag2','cnt']]

Out[521]: 
   user_id type flag1 flag2  cnt
0        1  ALL   ALL   ALL   80
0        1  ALL     0   ALL   40
1        1  ALL     1   ALL   40
0        1  ALL     0     0   20
1        1  ALL     0     1   20
2        1  ALL     1     0   20
3        1  ALL     1     1   20
0        1  ALL   ALL     0   40
1        1  ALL   ALL     1   40
0        1    A   ALL   ALL   40
1        1    B   ALL   ALL   40
0        1    A     0   ALL   20
1        1    A     1   ALL   20
2        1    B     0   ALL   20
3        1    B     1   ALL   20
0        1    A     0     0   10
1        1    A     0     1   10
2        1    A     1     0   10
3        1    A     1     1   10
4        1    B     0     0   10
5        1    B     0     1   10
6        1    B     1     0   10
7        1    B     1     1   10
0        1    A   ALL     0   20
1        1    A   ALL     1   20
2        1    B   ALL     0   20
3        1    B   ALL     1   20
Allen Qin
  • 19,507
  • 8
  • 51
  • 67
1

Similar to previous answers, here's a slightly more streamlined approach using itertools and groupby:

from itertools import chain, combinations
indices = ['user_id','type','flag1','flag2']
powerset = list(chain.from_iterable(combinations(indices, r) for r in range(1,len(indices)+1)))

master = (pd.concat([df.reset_index().groupby(p, as_index=False).sum() 
                     for p in powerset if p[0] == "user_id"])[cols]
            .replace([None,4,2], "ALL")
            .sort_values("cnt", ascending=False))

Output:

user_id type flag1 flag2  cnt
0        1  ALL   ALL   ALL   80
0        1    A   ALL   ALL   40
1        1    B   ALL   ALL   40
0        1  ALL     0   ALL   40
1        1  ALL     1   ALL   40
0        1  ALL   ALL     0   40
1        1  ALL   ALL     1   40
3        1  ALL     1     1   20
2        1  ALL     1     0   20
1        1  ALL     0     1   20
0        1  ALL     0     0   20
3        1    B     1     1   20
2        1    B     1     0   20
1        1    A     1     1   20
0        1    A     1     0   20
3        1    B     1     1   20
2        1    B     0     1   20
1        1    A     1     1   20
0        1    A     0     1   20
0        1    A     0     0   10
1        1    A     0     1   10
2        1    A     1     0   10
3        1    A     1     1   10
4        1    B     0     0   10
5        1    B     0     1   10
6        1    B     1     0   10
7        1    B     1     1   10

The powerset computation is taken directly from the itertools docs.

andrew_reece
  • 20,390
  • 3
  • 33
  • 58
0

I mainly used combinations and product from itertools.
combinations is for all combinations of values within each column.
product is for all combinations of values across all column.

import pandas as pd
from itertools import combinations, product
import numpy as np


def iterativeSum(df, cols, target_col):
    # All possible combinations within each column
    comb_each_col = []
    for col in cols:
        # Take 1 to n element in the unique set of values in each column
        each_col = [list(combinations(set(df[col]), i))
                    for i in range(1, len(set(df[col]))+1)]
        # Flat the list
        each_col = [list(x) for sublist in each_col for x in sublist]
        # Record the combination
        comb_each_col.append(each_col)
    # All possible combinations across all columns
    comb_all_col = list(product(*comb_each_col))
    result = pd.DataFrame()
    # Iterate over all combinations
    for value in comb_all_col:
        # Get condition which match the value in each column
        condition = np.array(
            [df[col].isin(v).values for col, v in zip(cols, value)]).all(axis=0)
        # Get the sum of rows which meet the condition
        condition_sum = df.loc[condition][target_col].sum()
        # Format values for output
        value2 = []
        for x in value:
            try:
                # String can be joined together directly
                value2.append(','.join(x))
            except:
                # Numbers can be joined after converted to string
                x = [str(y) for y in x]
                value2.append(','.join(x))
        # Put result into table
        result = pd.concat([result, pd.DataFrame([value2+[condition_sum]])])
    result.columns = cols + [target_col]
    return(result)

data = [[1, 'A', 1, 0, 10],
        [1, 'A', 0, 1, 10],
        [1, 'A', 1, 1, 10],
        [1, 'A', 0, 0, 10],
        [1, 'B', 1, 0, 10],
        [1, 'B', 0, 1, 10],
        [1, 'B', 1, 1, 10],
        [1, 'B', 0, 0, 10]]
cols = ['user_id', 'type', 'flag1', 'flag2', 'cnt']
df = pd.DataFrame(data, columns=cols)
# Columns for grouping
grouped_cols = ['type', 'flag1', 'flag2']
# Columns for summing
target_col = 'cnt'
print iterativeSum(df, grouped_cols, target_col)

Result:

  type flag1 flag2  cnt
0    A     0     0   10
0    A     0     1   10
0    A     0   0,1   20
0    A     1     0   10
0    A     1     1   10
0    A     1   0,1   20
0    A   0,1     0   20
0    A   0,1     1   20
0    A   0,1   0,1   40
0    B     0     0   10
0    B     0     1   10
0    B     0   0,1   20
0    B     1     0   10
0    B     1     1   10
0    B     1   0,1   20
0    B   0,1     0   20
0    B   0,1     1   20
0    B   0,1   0,1   40
0  A,B     0     0   20
0  A,B     0     1   20
0  A,B     0   0,1   40
0  A,B     1     0   20
0  A,B     1     1   20
0  A,B     1   0,1   40
0  A,B   0,1     0   40
0  A,B   0,1     1   40
0  A,B   0,1   0,1   80