Pandas if statements and indexing

Question

I am trying to calculate hires and separations for CPS data in a Python Pandas data frame, however my logical statements are not working correctly.

Here is my code:

test_tabData        = test_data.groupby('PID').apply(tabulateHireSeps)

def tabulateHireSeps(df):
    # We need to create the job, hire and separation column within the data frame. 
    df['estJobs']   = 0
    df['tot_sep']   = 0
    df['tot_hir']   = 0 

    # These constants enable the logical statements below to be easier read
    # The naming acts a self-description
    employee     = (df.EmpStat == 1)
    non_employee = (df.EmpStat != 1)
    new_emp      = (df.newEmp  == 2)
    new_occ      = (df.newOcc  == 2)

    maxMonth    = max(df.Month)
    minMonth    = min(df.Month)
    if ((maxMonth == 12) & (minMonth == 01)):
        maxMonth = 01
        minMonth = 12

    lengthMonth = len(df.Month)
    if (lengthMonth > 1):
        # If an individual went from unemployed in the previous month 
        if ((df['Month'] == minMonth) & (non_employee)).any():
            # to employed in the current month, we calculate that as a hire
            df.ix[((df.Month == maxMonth) & (employee)), 'tot_hir'] = 1

        # If an individual went from employed in the previous month
        elif ((df['Month'] == minMonth) & (employee)).any():
            # to employed in the current month with a new employer 
            if ((df['Month'] == maxMonth) & (employee) & (new_emp)).any():
                # and have a new occupation
                if (new_occ):
                    # we calculate a hire in the current month
                    df.ix[(df['Month'] == maxMonth), 'tot_hir'] = 1
                    # we calculate a seperation in the previous month
                    df.ix[(df['Month'] == minMonth), 'tot_sep'] = 1
                else:
                    # we calculate a hire and seperation for the current month
                    df.ix[(df['Month'] == maxMonth), ['tot_hir', 'tot_sep']] = 1, 1

            else:
                # to unemployed in the current month we calcualte a seperation
                df.ix[((df['Month'] == maxMonth) & (non_employee)), 'tot_sep'] = 1

    else:
        df.ix[(df['Month'].isin(maxMonth) & (employee) & ((new_occ) | (new_emp))), \
            'tot_hir'] = 1  

    # We keep only those columns we need to move through the process
    df  = df[['Year', 'Month', 'OccID', 'ageGrp', 'Sex', 'Race', 'Hisp', 'educGrp', 
              'OutWgt', 'LongWgt', 'FinWgt', 'Sample', 'PID', 'COW', 'estJobs', 
              'tot_sep', 'tot_hir']]    

    return df

Here is my test data:

data_set = [[2015, 05, 9130, 4, 1, 1, 2, 1, 0, 51727030, 36527035, 3, 1, 1001, 1, 1, 1],
            [2015, 04, 9130, 4, 1, 1, 2, 1, 0, 51897025, 36164620, 2, 1, 1001, 2, 1, 7],
            [2015, 05, 9130, 4, 1, 1, 2, 1, 0, 51727030, 36527035, 3, 1, 1002, 2, 1, 7],
            [2015, 04, 9130, 4, 1, 1, 2, 1, 0, 51897025, 36164620, 2, 1, 1002, 1, 1, 1],
            [2015, 05, 9130, 4, 1, 1, 2, 1, 0, 51727030, 36527035, 3, 2, 1003, 1, 2, 1],
            [2015, 04, 9130, 4, 1, 1, 2, 1, 0, 51897025, 36164620, 2, 1, 1003, 1, 1, 1]]

test_data = pandas.DataFrame(data       = data_set, 
                            columns     = ['Year',
                                           'Month',
                                           'OccID',
                                           'ageGrp',
                                           'Sex',
                                           'Race',
                                           'Hisp',
                                           'educGrp', 
                                           'OutWgt',
                                           'LongWgt',
                                           'FinWgt',    
                                           'Sample',    
                                           'newEmp',    
                                           'PID',   
                                           'EmpStat',
                                           'newOcc',
                                           'COW'])

Here are my expected results:

expected_set = [[2015, 05, 9130, 4, 1, 1, 2, 1, 0, 51727030, 36527035, 3, 1001, 1, 0, 0, 1],
               [2015, 04, 9130, 4, 1, 1, 2, 1, 0, 51897025, 36164620, 2, 1001, 7, 0, 0, 0],
               [2015, 05, 9130, 4, 1, 1, 2, 1, 0, 51727030, 36527035, 3, 1002, 1, 0, 1, 0],
               [2015, 04, 9130, 4, 1, 1, 2, 1, 0, 51897025, 36164620, 2, 1002, 7, 0, 0, 0],
               [2015, 05, 9130, 4, 1, 1, 2, 1, 0, 51727030, 36527035, 3, 1003, 1, 0, 0, 1],
               [2015, 04, 9130, 4, 1, 1, 2, 1, 0, 51897025, 36164620, 2, 1003, 7, 0, 1, 0]]

expected_data = pandas.DataFrame(data       = expected_set, 
                                columns     = ['Year',
                                               'Month',
                                               'OccID',
                                               'ageGrp',
                                               'Sex',
                                               'Race',
                                               'Hisp',
                                               'educGrp',
                                               'OutWgt',
                                               'LongWgt',
                                               'FinWgt',
                                               'Sample',
                                               'PID',
                                               'COW',
                                               'estJobs',
                                               'tot_sep',
                                               'tot_hir'])

Here is what the function is producing:

  Year  Month  OccID  ageGrp  Sex  Race  Hisp  educGrp  OutWgt   LongWgt  \
  2015      5   9130       4    1     1     2        1       0  51727030   
   2015      4   9130       4    1     1     2        1       0  51897025   
   2015      5   9130       4    1     1     2        1       0  51727030   
   2015      4   9130       4    1     1     2        1       0  51897025   
   2015      5   9130       4    1     1     2        1       0  51727030   
   2015      4   9130       4    1     1     2        1       0  51897025   

     FinWgt  Sample   PID  COW  estJobs  tot_sep  tot_hir  
   36527035       3  1001    1        0        0        1  
   36164620       2  1001    7        0        0        0  
   36527035       3  1001    1        0        0        1  
   36164620       2  1001    7        0        0        0  
   36527035       3  1001    1        0        0        1  
   36164620       2  1001    7        0        0        0

You keep using `&`. [I do not think it means what you think it means.](http://stackoverflow.com/questions/22646463/difference-between-and-boolean-vs-bitwise-in-python-why-difference-i) — Andy Kubiak, Aug 19 '15 at 19:40
I'm playing spot the difference here, it's too difficult. I think you should just put some print statements in your code and see which block it's hitting (or step through with pdb). Are you using a recent pandas? I'm suprised it doesn't raise on `if new_occ:`. — Andy Hayden, Aug 19 '15 at 20:04

score 1 · Accepted Answer · answered Aug 19 '15 at 20:06

You are using .apply which calls tabulateHireSeps on each row of your dataframe. In your function you are resetting the dataframe.

Try this:

def tabulateHireSeps(df):
# We need to create the job, hire and separation column within the data frame. 
df['estJobs']   = 0
df['tot_sep']   = 0
df['tot_hir']   = 0 

# These constants enable the logical statements below to be easier read
# The naming acts a self-description
employee     = (df.EmpStat == 1)
non_employee = (df.EmpStat != 1)
new_emp      = (df.newEmp  == 2)
new_occ      = (df.newOcc  == 2)

maxMonth    = max(df.Month)
minMonth    = min(df.Month)
if ((maxMonth == 12) & (minMonth == 1)):
    maxMonth = 1
    minMonth = 12

lengthMonth = len(df.Month)
if (lengthMonth > 1):
    # If an individual went from unemployed in the previous month 
    if ((df['Month'] == minMonth) & (non_employee)).any():
        # to employed in the current month, we calculate that as a hire
        df.ix[((df.Month == maxMonth) & (employee)), 'tot_hir'] = 1

    # If an individual went from employed in the previous month
    elif ((df['Month'] == minMonth) & (employee)).any():
        # to employed in the current month with a new employer 
        if ((df['Month'] == maxMonth) & (employee) & (new_emp)).any():
            # and have a new occupation
            if (new_occ).any():
                # we calculate a hire in the current month
                df.ix[(df['Month'] == maxMonth), 'tot_hir'] = 1
                # we calculate a seperation in the previous month
                df.ix[(df['Month'] == minMonth), 'tot_sep'] = 1
            else:
                # we calculate a hire and seperation for the current month
                df.ix[(df['Month'] == maxMonth), ['tot_hir', 'tot_sep']] = 1, 1

        else:
            # to unemployed in the current month we calcualte a seperation
            df.ix[((df['Month'] == maxMonth) & (non_employee)), 'tot_sep'] = 1

else:
    df.ix[(df['Month'].isin(maxMonth) & (employee) & ((new_occ) | (new_emp))), \
        'tot_hir'] = 1  

## We keep only those columns we need to move through the process
#df  = df[['Year', 'Month', 'OccID', 'ageGrp', 'Sex', 'Race', 'Hisp', 'educGrp', 
#          'OutWgt', 'LongWgt', 'FinWgt', 'Sample', 'PID', 'COW', 'estJobs', 
#          'tot_sep', 'tot_hir']]    

return df

Thank you, this worked. Out of curiosity, why can't I use the following code at the end of the function: df = df[['Year', 'Month', 'OccID', 'ageGrp', 'Sex', 'Race', 'Hisp', 'educGrp', 'OutWgt', 'LongWgt', 'FinWgt', 'Sample', 'PID', 'COW', 'estJobs', 'tot_sep', 'tot_hir']] — j riot, Aug 19 '15 at 20:34

Pandas if statements and indexing

1 Answers1