pandas make dummys on small datasets

Question

Can someone help me with this function to make dummies:

def make_dummies(df):
    # Create dummies for all hours of the day
    hours = pd.get_dummies(df.index.hour, prefix='hour')

    # Create columns for hour, day of week, weekend, and month
    df['hour'] = df.index.strftime('%H')
    df['day_of_week'] = df.index.dayofweek
    df['weekend'] = np.where(df['day_of_week'].isin([5,6]), 1, 0)
    df['month'] = df.index.month

    # Create dummies for hours of the day
    hour_dummies = pd.get_dummies(df['hour'], prefix='hour')

    # Create dummies for all days of the week
    day_mapping = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday', 5: 'saturday', 6: 'sunday'}
    all_days = pd.Categorical(df['day_of_week'].map(day_mapping), categories=day_mapping.values())
    day_dummies = pd.get_dummies(all_days)

    # Create dummies for all months of the year
    month_mapping = {1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun', 7: 'jul',
                     8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec'}
    all_months = pd.Categorical(df['month'].map(month_mapping), categories=month_mapping.values())
    month_dummies = pd.get_dummies(all_months)

    # Merge all dummies with original DataFrame
    df = pd.concat([df, hours, hour_dummies, day_dummies, month_dummies], axis=1)

    # Drop redundant columns
    df = df.drop(['hour', 'day_of_week', 'month'], axis=1)

    return df

On a small dataset like this:

import pandas as pd
import numpy as np

data = {"temp":[53.13,52.93,52.56,51.58,47.57],
        "Date":["2023-04-07 15:00:00-05:00","2023-04-07 16:00:00-05:00","2023-04-07 17:00:00-05:00","2023-04-07 18:00:00-05:00","2023-04-07 19:00:00-05:00"]
}


df = pd.DataFrame(data).set_index("Date")

# Converting the index as date
df.index = pd.to_datetime(df.index)

df = make_dummies(df)

print(df)

This wont merge the data correctly. I apologize for the screenshot but the function is just stacking dummy variables beneath where what I was hoping for is ALL dummy variables would be added to the df and not stacked beneath. Hopefully this makes sense, was hoping to make a function that creates all dummy variables for each hour, month, and day type.

Typically we would want to use one hot encoder, right? https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html — J_H, Apr 07 '23 at 20:31
maybe ill try this out: https://stackoverflow.com/questions/58101126/using-scikit-learn-onehotencoder-with-a-pandas-dataframe#:~:text=So%20turned%20out%20that%20Scikit%2DLearns%20LabelBinarizer%20gave%20me%20better%20luck%20in%20converting%20the%20data%20to%20one%2D — bbartling, Apr 07 '23 at 20:34

score 1 · Answer 1 · answered Apr 07 '23 at 20:43

For what its worth here's the sci-kit learn version looks a bit daunting but also appears to work:

from sklearn.preprocessing import OneHotEncoder

def make_dummies(df):
    # Create a new DataFrame to hold the encoded data
    encoded_df = df.copy()

    # Create a OneHotEncoder object for hours of the day
    hour_encoder = OneHotEncoder(categories=[range(24)], sparse=False)

    # Encode the hour column
    hour_encoded = hour_encoder.fit_transform(encoded_df.index.hour.values.reshape(-1, 1))
    hour_columns = [f'hour_{i}' for i in range(24)]
    hour_df = pd.DataFrame(hour_encoded, columns=hour_columns, index=encoded_df.index)

    # Create a OneHotEncoder object for days of the week
    day_encoder = OneHotEncoder(categories=[range(7)], sparse=False)

    # Encode the day_of_week column
    day_encoded = day_encoder.fit_transform(encoded_df.index.dayofweek.values.reshape(-1, 1))
    day_columns = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    day_df = pd.DataFrame(day_encoded, columns=day_columns, index=encoded_df.index)

    # Create a OneHotEncoder object for months of the year
    month_encoder = OneHotEncoder(categories=[range(1, 13)], sparse=False)

    # Encode the month column
    month_encoded = month_encoder.fit_transform(encoded_df.index.month.values.reshape(-1, 1))
    month_columns = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    month_df = pd.DataFrame(month_encoded, columns=month_columns, index=encoded_df.index)

    # Merge all dummies with original DataFrame
    encoded_df = pd.concat([encoded_df, hour_df, day_df, month_df], axis=1)

    return encoded_df

score 0 · Accepted Answer · answered Apr 07 '23 at 20:36

You just missed some set_index to align indexes on pd.concat:

def make_dummies(df):
    # Create dummies for all hours of the day
    hours = pd.get_dummies(df.index.hour, prefix='hour').set_index(df.index)  # HERE
    
    # Create columns for hour, day of week, weekend, and month
    df['hour'] = df.index.strftime('%H')
    df['day_of_week'] = df.index.dayofweek
    df['weekend'] = np.where(df['day_of_week'].isin([5,6]), 1, 0)
    df['month'] = df.index.month

    # Create dummies for hours of the day
    hour_dummies = pd.get_dummies(df['hour'], prefix='hour')

    # Create dummies for all days of the week
    day_mapping = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday', 5: 'saturday', 6: 'sunday'}
    all_days = pd.Categorical(df['day_of_week'].map(day_mapping), categories=day_mapping.values())
    day_dummies = pd.get_dummies(all_days).set_index(df.index)  # HERE

    # Create dummies for all months of the year
    month_mapping = {1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun', 7: 'jul',
                     8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec'}
    all_months = pd.Categorical(df['month'].map(month_mapping), categories=month_mapping.values())
    month_dummies = pd.get_dummies(all_months).set_index(df.index)  # HERE

    # Merge all dummies with original DataFrame
    df = pd.concat([df, hours, hour_dummies, day_dummies, month_dummies], axis=1)

    # Drop redundant columns
    df = df.drop(['hour', 'day_of_week', 'month'], axis=1)

    return df

Note: I think hours and hour_dummies are redundant.

Output:

>>> make_dummies(df)
                            temp  weekend  hour_15  hour_16  hour_17  hour_18  hour_19  hour_15  hour_16  ...  apr  may  jun  jul  aug  sep  oct  nov  dec
Date                                                                                                      ...                                             
2023-04-07 15:00:00-05:00  53.13        0        1        0        0        0        0        1        0  ...    1    0    0    0    0    0    0    0    0
2023-04-07 16:00:00-05:00  52.93        0        0        1        0        0        0        0        1  ...    1    0    0    0    0    0    0    0    0
2023-04-07 17:00:00-05:00  52.56        0        0        0        1        0        0        0        0  ...    1    0    0    0    0    0    0    0    0
2023-04-07 18:00:00-05:00  51.58        0        0        0        0        1        0        0        0  ...    1    0    0    0    0    0    0    0    0
2023-04-07 19:00:00-05:00  47.57        0        0        0        0        0        1        0        0  ...    1    0    0    0    0    0    0    0    0

[5 rows x 31 columns]

thanks for the help...where do you see the `hours` in the code? IE., the redundancy can you expand on that more? — bbartling, Apr 07 '23 at 20:45
I see in my output 2 columns hour_15, hour_16, hour_17. Did I missed something? Check your image, you have the same problem, no? — Corralien, Apr 07 '23 at 20:46
Its working for me...I removed `hours = pd.get_dummies(df.index.hour, prefix='hour').set_index(df.index)` from the top of the function and its working fine for me — bbartling, Apr 07 '23 at 20:50

pandas make dummys on small datasets

2 Answers2