0

I have two dataframes df and df_census_final. df data can be downloaded here and df_census_final can be downloaded here and created with:

import pandas as pd
df = pd.read_csv('population_data.csv')
df_census_final = pd.read_csv('census_data.csv')

The df_census_final is year on year estimate of population by individual age groups and is a look up dataframe to get a specific value, whilst df is the actual dataframe where the calculation will be applied and population by age groups 15-19.

I have two functions which are below where the first function function_check_age_19 is used in the bottom function split_population_18_19:

def function_check_age_19(df,df_census_final):
    """
    If the sex, year, and age are equal to the values in the function, then return the population
    multiplied by the estimated proportion.
    
    :param df: the dataframe that contains the population data
    :param df_census_final: a dataframe with the following columns:
    :return: the population multiplied by the estimated proportion.
    """
    if df['Sex'] == 'Male' and df['Year'] == 2019 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2019)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Female' and df['Year'] == 2019 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2019)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Male' and df['Year'] == 2020 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2020)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Female' and df['Year'] == 2020 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2020)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Male' and df['Year'] == 2021 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2021)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Female' and df['Year'] == 2021 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2021)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Male' and df['Year'] == 2022 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Female' and df['Year'] == 2022 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Male' and df['Year'] == 2023 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Female' and df['Year'] == 2023 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
        return df['population']*estimated_proportion
    else: 
        return df['population'] 

def split_population_18_19(df):
    """
    It takes a dataframe, creates two new columns, one for 18 year olds and one for 19 year olds, then
    adds them together and subtracts them from the total population
    
    :param df: The dataframe that you want to split the population for
    :return: A dataframe with the columns:
    """
    try:
        df['population_18'] = df.apply(lambda row: function_check_age_18(row), axis=1)
        df['population_19'] = df.apply(lambda row: function_check_age_19(row), axis=1)
        df.loc[df['Age'] != '15-19','population_18'] = 0
        df.loc[df['Age'] != '15-19','population_19'] = 0
        df['total_population_18'] = df['population_18'] + df['population_19']
        df['population'] =df['population'] - df['total_population_18']
        df['total_population_18'] = df['total_population_18'].shift(1)
        df['total_population_18'] = df['total_population_18'].fillna(0.0)
        df['population'] = df['total_population_18'] + df['population']
        df.drop(['month','population_18','population_19','total_population_18'],axis=1,inplace=True)
        return df
    except Exception as e:
        print(f"Exception on estimating 18 19 year old population with the error: {e}")

The first function has two dataframe which are passed to it, df and df_census_final but the second only has one. When using .apply in the second function as shown below:

df['population_19'] = df.apply(lambda row: function_check_age_19(row), axis=1)

I get a nonetype returned. i.e. (I am only trying to affect df element-wise) What is the best way to use the .apply function or anything else to pass both the dataframes into the function?

I am refactoring some hard-coded values as shown below:

def function_check_age_19(df):
    """
    This function is checking the population of the age group 15-19 for the years 
    2019, 2020, 2021, and 2022 then mapping the proportion for 19 year olds for each Sex
    """
    if df['Sex'] == 'Male' and df['Year'] == 2019 and df['Age'] == '15-19':
        return df['population']*(19.851399/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2019 and df['Age'] == '15-19':
        return df['population']*(20.088195/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2020 and df['Age'] == '15-19':
        return df['population']*(19.492101/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2020 and df['Age'] == '15-19':
        return df['population']*(19.745797/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2021 and df['Age'] == '15-19':
        return df['population']*(19.489112/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2021 and df['Age'] == '15-19':
        return df['population']*(19.621289/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2022 and df['Age'] == '15-19':
        return df['population']*(19.442557/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2022 and df['Age'] == '15-19':
        return df['population']*(19.534988/100)
    else: 
        return df['population']
Mazil_tov998
  • 396
  • 1
  • 13
  • 1
    Looks like it's over complicating it here. Wouldn't it make sense to do a .groupby() and then calculate what you need? – chitown88 Oct 12 '22 at 10:15
  • would help if you shared a sample of df, and what you want the output to look like – chitown88 Oct 12 '22 at 10:16
  • @chitown88 are you saying using .groupby with the lambda? Essentially the df_census_final is a lookup table for a population proportion. Will try create example df's now. But with function_check_age_19(row), worked when I only passed through one dataframe, – Mazil_tov998 Oct 12 '22 at 10:26
  • No. I'm saying groupby on the dataframe itself. Unless I'm not understanding the issue. Ya it will be helpful to see what the df looks like and what yu are wanting to do. – chitown88 Oct 12 '22 at 10:31
  • @chitown88 i have updated the question with datasources to assist – Mazil_tov998 Oct 12 '22 at 12:06
  • Please share a [small sample of your dataframe](https://stackoverflow.com/a/20159305/5901382) *in* the question itself. Posters might not be willing to go to external websites to download your data. – Abirbhav G. Oct 12 '22 at 12:20
  • `population_data.csv` which you defined as `df`, doesn;t have a column `'Year'`. So how are you using that in your function. Also, where's your `function_check_age_18`? You still aren't being very clear as to what you are wanting to do here. – chitown88 Oct 13 '22 at 09:57

1 Answers1

1

Ok I think I see what you are trying to do. How I would do it is I would simply merge the percentages by 'Sex' and 'Year'. Then just pull out GROUP 18 and 19 and multiply out the columns to get the populations of those.

import pandas as pd
import numpy as np

df = pd.read_csv('https://raw.githubusercontent.com/maz2198/data_1/main/population_data.csv')
df['Year'] = pd.to_datetime(df['month of estimation']).dt.year
df_15_19 = df[df['Age'] == '15-19']

df_census_final = pd.read_csv('https://raw.githubusercontent.com/maz2198/data_1/main/census_data.csv')

df_list = []
ages = [18, 19]
for age in ages:
    df_census_age = df_census_final[df_census_final['GROUP'] == age]
    df_age = df_15_19.merge(df_census_age, how='right', on = ['Year'])

    df_age['pop'] = np.where(df_age['Sex'] == 'Female', df_age['Female Pop %'], df_age['Male Pop %'])
    df_age['population_calc'] = df_age['population'] * (df_age['pop']/100)
    
    df_list.append(df_age)
    
final_df = pd.concat(df_list, axis=0)

Output:

print(final_df[['district', 'Sex', 'Year', 'GROUP', 'population_calc']])
                                          district  ... population_calc
0      MP - Ehlanzeni District Municipality (DC32)  ...    16113.420033
1      MP - Ehlanzeni District Municipality (DC32)  ...    15696.491268
2     EC - Alfred Nzo District Municipality (DC44)  ...     8754.005492
3     EC - Alfred Nzo District Municipality (DC44)  ...     8220.401384
4       EC - Amathole District Municipality (DC12)  ...     6713.988338
                                           ...  ...             ...
3947     WC - Overberg District Municipality (DC3)  ...     2189.536237
3948   WC - West Coast District Municipality (DC1)  ...     3601.165452
3949   WC - West Coast District Municipality (DC1)  ...     3579.662585
3950                        WC- Garden Route (DC4)  ...     5120.187331
3951                        WC- Garden Route (DC4)  ...     4867.683393

[7904 rows x 5 columns]

Visual:

import matplotlib.pyplot as plt
import seaborn as sns

df_filter = final_df[final_df['district'] == ' MP - Ehlanzeni District Municipality (DC32)']

#set seaborn plotting aesthetics
sns.set(style='white')

#create grouped bar chart
sns.catplot(x='Year', 
            y='population_calc', 
            col='GROUP',
            hue='Sex', 
            data=df_filter,
            kind='bar',
            palette=['#DFA1C1', '#3279BB']) 
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel('Population')
chitown88
  • 27,527
  • 4
  • 30
  • 59