1

This is my dataframe:

d = {'id': [1,1,2,2,3,3,3] ,
'a_code': ['abc', 'abclm', 'pqr', 'pqren', 'lmn', 'lmnre', 'xyznt'], 
'a_type':['CP','CO','CP','CO','CP','CP','CO'],
'z_code': ['abclm', 'wedvg', 'pqren', 'unfdc', 'lmnre','wqrtn','hgbvcx'],
'z_type': ['CO', 'CO', 'CO','CO','CP','CO','RT'],
'stepNo': [1,2,1,2,1,2,3]
}

df= pd.DataFrame(d)

Each id has rows which are continuous paths defined by stepNo. I want to print all steps in a single row so I can visualize the path. The stepNo varies between 2 to 24, so in somecases I could have 5x24 columns. Is it possible to do this?

Output:

id   stepNo  a_code  a_type   z_code   z_type     stepNo    a_code   a_type   z_code   z_type    stepNo    a_code   a_type   z_code   z_type

 1     1      abc     CP     abclm      CO         2       abclm     CO      wedvg     CO
 2     1      pqr     CP     pqren      CO         2       pqren     CO      unfdc     CO        
 3     1      lmn     CP     lmnre      CP         2       lmnre     CP      wqrtn     CO           3      xyznt     CO      hgbvcx     RT

UPDATE:

@NYC Coder solution fails for this sample, I would appreciate if someone could help me figure it out, All other answer timeout or arent legible for how the output is needed due to my dataframe having high number of dimensions.

nan = ""
d = {'NAME': [1,1,2,2,3,3,3,3,3,4,4,4,4,4,4],
 'col1': ['P100','P100','P100','P100','MS','MS','MS','MS','MS','MS','MS','MS','MS','MS','MS'],
 'col2': ['CNMZ',
  'CNMZ',
  'COMX',
  'COMX',
  '_NCTE',
  '_NCTE',
  '_NCTE',
  '_NCTE',
  '_NCTE',
  'T1MF',
  'T1MF',
  'T1MF',
  'T1MF',
  'T1MF',
  'T1MF'],
 'stepNo': [1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6],
 'col4': ['xyz',
  'abc',
  'pqr',
  'gvt',
  'mno',
  'tru',
  'ercm',
  'lotr',
  'ddlj',
  'refv',
  'ecv',
  'ecv',
  'ecv',
  'ecv',
  'ecv'],
 'col5': ['PHL',
  'PHL',
  'BHL',
  'ALT',
  'MRS',
  'MRS',
  'TUL',
  'MRS',
  'FAT',
  'PHL',
  'PHL',
  'JEN',
  'FTW',
  'AMB',
  'KGP'],
 'col6': ['CP',
  'CO',
  'CP',
  'CO',
  'CP',
  'CO',
  'CO',
  'CO',
  'RT',
  'CO',
  'CO',
  'CO',
  'CP',
  'CO',
  'CO'],
 'col7': ['PHL',
  'PHL',
  'ALT',
  'ALT',
  'MRS',
  'TUL',
  'MRS',
  'FAT',
  'FAH',
  'PHL',
  'JEN',
  'FTW',
  'AMB',
  'KGP',
  'KGP'],
 'col8': ['CO',
  'CO',
  'CO',
  'CO',
  'CO',
  'CO',
  'CO',
  'RT',
  'CP',
  'CO',
  'CO',
  'CP',
  'CO',
  'CO',
  'CO'],
 'col9': ['SID',
  'M/M',
  'SID',
  'U/D',
  'AL LO',
  'AL LO',
  'AL LO',
  'AL LO',
  'AL LO',
  'M/M',
  'DCS',
  'DCS',
  'DCS',
  'DCS',
  'DCS'],
 'col10': ['SID',
  'M/M',
  'SID',
  'U/D',
  'AL LO',
  '3 M',
  '3 M',
  'M/M',
  'AL LO',
  'M/M',
  'DCS',
  'DCS',
  'DCS',
  'DCS',
  'DCS'],
 'col11': [nan,
  'ATM',
  nan,
  'PACK',
  'AL LP',
  'DCS',
  'DCS',
  'DAM',
  'DAM',
  'DCS',
  'DCS',
  'DCS',
  'DCS',
  'DCS',
  'M/M'],
 'col12': [nan,
  'SID',
  nan,
  'PACK',
  'CAL LO',
  'DCS',
  'DCS',
  'M/M',
  'CAL LO',
  'DCS',
  'DCS',
  'DCS',
  'DCS',
  'DCS',
  'AL LO'],
 'col13': ['abc',
  '-02-1_',
  '-1',
  '-13_',
  nan,
  nan,
  nan,
  'T1_VT1.',
  nan,
  '-06',
  nan,
  nan,
  nan,
  nan,
  '-03_02-03'],
 'col14': [nan,
  nan,
  nan,
  nan,
  '102/',
  '102/',
  '102/',
  nan,
  '101/',
  nan,
  '3405',
  '3102/',
  '3111/',
  '3102/',
  nan]}
df = pd.DataFrame(d)
Murtaza Haji
  • 1,093
  • 1
  • 13
  • 32

4 Answers4

1

I think you want to use pivot_table and then sort_index

table=pd.pivot_table(df, index = ['id'],values = ['a_code','a_type','z_code','z_type'],
                    columns = ['stepNo'], fill_value = '', aggfunc = lambda x: x).swaplevel(0, 1, axis=1).sort_index(axis=1) 

stepNo      1                           2  ...             3                      
       a_code a_type z_code z_type a_code  ... z_type a_code a_type  z_code z_type
id                                         ...                                    
1         abc     CP  abclm     CO  abclm  ...     CO                             
2         pqr     CP  pqren     CO  pqren  ...     CO                             
3         lmn     CP  lmnre     CP  lmnre  ...     CO  xyznt     CO  hgbvcx     RT

or without switching the multiindex column levels:

table=pd.pivot_table(df, index = ['id'],values = ['a_code','a_type','z_code','z_type'],
                    columns = ['stepNo'], fill_value = '', aggfunc = lambda x: x).sort_index(axis=1)

       a_code               a_type         z_code                z_type        
stepNo      1      2      3      1   2   3      1      2       3      1   2   3
id                                                                             
1         abc  abclm            CP  CO      abclm  wedvg             CO  CO    
2         pqr  pqren            CP  CO      pqren  unfdc             CO  CO    
3         lmn  lmnre  xyznt     CP  CP  CO  lmnre  wqrtn  hgbvcx     CP  CO  RT
CAPSLOCK
  • 6,243
  • 3
  • 33
  • 56
  • Thank you for the solution, But I have over 16 columns in my dataframe, with steps being as high as 82, my dataframe is a tracing a path , so I want rows to be lined up one after another which will reduce extra scrolling in cases where I have only 5-6 steps. – Murtaza Haji May 08 '20 at 06:40
1

You can do this:

d = {'id': [1,1,2,2,3,3,3] ,
'a_code': ['abc', 'abclm', 'pqr', 'pqren', 'lmn', 'lmnre', 'xyznt'], 
'a_type':['CP','CO','CP','CO','CP','CP','CO'],
'z_code': ['abclm', 'wedvg', 'pqren', 'unfdc', 'lmnre','wqrtn','hgbvcx'],
'z_type': ['CO', 'CO', 'CO','CO','CP','CO','RT'],
'stepNo': [1,2,1,2,1,2,3]
}
df = pd.DataFrame(d)
dfs = []
for i in range(min(df['stepNo']), max(df['stepNo'])+1):
    dfs.append(df[df['stepNo']==i].reset_index())
dfx = pd.concat(dfs, axis=1)
dfx.drop(inplace=True, columns=['index'])
print(dfx)

   id a_code a_type z_code z_type  stepNo  id a_code a_type z_code z_type  stepNo   id a_code a_type  z_code z_type  stepNo
0   1    abc     CP  abclm     CO       1   1  abclm     CO  wedvg     CO       2  3.0  xyznt     CO  hgbvcx     RT     3.0
1   2    pqr     CP  pqren     CO       1   2  pqren     CO  unfdc     CO       2  NaN    NaN    NaN     NaN    NaN     NaN
2   3    lmn     CP  lmnre     CP       1   3  lmnre     CP  wqrtn     CO       2  NaN    NaN    NaN     NaN    NaN     NaN
NYC Coder
  • 7,424
  • 2
  • 11
  • 24
1

It's not the best answer, but here's an example that can be done with less code, although I wish I could have avoided theLIST format.

df2 = df.groupby(['id', 'stepNo']).agg(list)
df3 = df2.unstack(level=-1, fill_value='')

                a_code     a_type      z_code     z_type
  stepNo    1   2   3   1   2   3   1   2   3   1   2   3
 id                                             
 1  [abc]   [abclm]         [CP]    [CO]    [abclm] [wedvg]     [CO]    [CO]    
 2  [pqr]   [pqren]         [CP]    [CO]    [pqren] [unfdc]     [CO]    [CO]    
 3  [lmn]   [lmnre] [xyznt] [CP]    [CP]    [CO]    [lmnre] [wqrtn] [hgbvcx]    [CP]    [CO]    [RT]
r-beginners
  • 31,170
  • 3
  • 14
  • 32
0

If you don't want the multi index approach suggested by @Gio above, i think this should do the trick, bear in mind a new renaming of data headers for each step:

import pandas as pd
import numpy as np

d = {'id': [1,1,2,2,3,3,3] ,
'a_code': ['abc', 'abclm', 'pqr', 'pqren', 'lmn', 'lmnre', 'xyznt'], 
'a_type':['CP','CO','CP','CO','CP','CP','CO'],
'z_code': ['abclm', 'wedvg', 'pqren', 'unfdc', 'lmnre','wqrtn','hgbvcx'],
'z_type': ['CO', 'CO', 'CO','CO','CP','CO','RT'],
'stepNo': [1,2,1,2,1,2,3]
}

df= pd.DataFrame(d)


hstackedDf =pd.pivot_table(df, index=['id'], aggfunc=lambda x: (np.hstack(x.values.ravel()).astype(str)).tolist(), values=['stepNo']).stepNo.apply(pd.Series).fillna(0).reset_index()
#get length of steps to use in the process for flexible number of steps
noOfSteps = len(hstackedDf.columns)

#loop over steps
for i in range(1,noOfSteps):
    #rename to unique step number
    hstackedDf = hstackedDf.rename(columns={ hstackedDf.columns[i]: 'stepNo_' + str(i)})
    #convert to integer
    hstackedDf['stepNo_' + str(i)] = hstackedDf['stepNo_' + str(i)].astype(int)
    #merge rest of data for the current step
    hstackedDf = hstackedDf.merge(df, how='left', left_on=['id', 'stepNo_' + str(i)], right_on=['id', 'stepNo'])
    #drop stenpNo column
    hstackedDf = hstackedDf.drop(['stepNo'], axis=1)
    #rename data to spicific step number
    hstackedDf = hstackedDf.rename(columns={ 'a_code': 'a_code_' + str(i), 'a_type': 'a_type_' + str(i), 'z_code': 'z_code_' + str(i), 'z_type': 'z_type_' + str(i)})

#create list of steps lists
stepsColumns = list()
for i in range(1,(noOfSteps)):
    indList = [c for c in hstackedDf if ('_'+str(i)) in c]
    stepsColumns.append(indList)
#convert to one flat list
flat_list = list()
for sublist in stepsColumns:
    for item in sublist:
        flat_list.append(item)
#add ID column
flat_list.insert(0, 'id')  
#reorder output dataframe   
outputDF = hstackedDf[flat_list]

print(outputDF)

id  stepNo_1 a_code_1 a_type_1  ... a_code_3 a_type_3  z_code_3 z_type_3
1         1      abc       CP  ...      NaN      NaN       NaN      NaN
2         1      pqr       CP  ...      NaN      NaN       NaN      NaN
3         1      lmn       CP  ...    xyznt       CO    hgbvcx       RT
Mit
  • 679
  • 6
  • 17
  • This solution takes a long time to run. I ran it for close to 1 hr, then had to shut the kernel. This is the shape of my dataframe (2768734 rows × 16 columns). – Murtaza Haji May 08 '20 at 06:42