0

I have created a function to handle data processing such as filling null values but the result of the function is returning a series instead of giving me a dataframe. How do I solve this?

def preprocessing(df):
    """Attempt to clean ``df`` in three steps (null flags, max-fill, zero-fill).

    As written, each step rebinds ``df`` to a column subset, so the columns
    not selected by a step are lost before the next step runs.
    """
    # Columns that get special handling; anything else counts as "the rest".
    df_columns = ['column1', 'column2','column3','column4', 'column5', 'column6','column7', 'column8']
    
    # Step 1: turn column1/column2 into boolean "is not null" flags.
    # NOTE: df is rebound to ONLY these flag columns; all other columns are dropped here.
    features= [c for c in df.columns.values if c in df_columns[0:2]]
    df = df[features].notna()
    
    # Step 2: meant to fill column3/column4 with their per-column maximum.
    # NOTE: df now holds at most column1/column2, so `features` is empty
    # and df becomes a frame with no columns.
    features= [c for c in df.columns.values if c in df_columns[2:4]]
    # NOTE: the name `max` shadows the built-in max() inside this function.
    max = df[features].max()
    df = df[features].fillna(max)
    
    # Step 3: fill NaN with 0 in every column that is not listed in df_columns.
    features= [c for c in df.columns.values if c not in df_columns]
    df = df[features].fillna(0)
    
    return df

# Rebind df to whatever preprocessing() returns.
df = preprocessing(df) 

# Count the remaining nulls per column. Note that isnull().sum() itself
# returns a Series (one entry per column), not a DataFrame.
df.isnull().sum()
Shadow Walker
  • 979
  • 5
  • 27
  • 51
  • Does this answer your question? [Convert pandas Series to DataFrame](https://stackoverflow.com/questions/26097916/convert-pandas-series-to-dataframe) – Julien Dec 11 '20 at 07:24

2 Answers

1

I think you need to change this line:

df = df[features].notna()

to:

df[features] = df[features].notna()

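This way only the columns in the list are processed and the result is assigned back into the original DataFrame, instead of replacing `df` with the column subset. Apply the same change to every step in your code.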

It means:

def preprocessing(df):
    """Return a cleaned copy of ``df`` that keeps every original column.

    Three column groups are handled differently:

    * ``column1``/``column2`` are replaced by boolean "is not null" flags.
    * ``column3``/``column4`` have NaN filled with the column's own maximum
      (a column that is entirely NaN stays NaN).
    * Every column that is not one of ``column1`` .. ``column8`` has NaN
      filled with 0.

    ``column5`` .. ``column8`` are passed through untouched.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the same columns, in the same order, as ``df``.
    """
    df_columns = ['column1', 'column2', 'column3', 'column4',
                  'column5', 'column6', 'column7', 'column8']
    flag_columns = set(df_columns[0:2])
    max_fill_columns = set(df_columns[2:4])
    known_columns = set(df_columns)

    # Work on a copy: assigning into the caller's frame would silently change
    # it and can trigger SettingWithCopyWarning when ``df`` is itself a slice.
    result = df.copy()

    # Each step assigns back into ``result[features]`` instead of rebinding
    # ``result`` to the subset, so no column is ever dropped.
    features = [c for c in result.columns if c in flag_columns]
    if features:
        result[features] = result[features].notna()

    features = [c for c in result.columns if c in max_fill_columns]
    if features:
        column_max = result[features].max()
        result[features] = result[features].fillna(column_max)

    # Fill na with 0 in everything outside the known column list.
    features = [c for c in result.columns if c not in known_columns]
    if features:
        result[features] = result[features].fillna(0)

    return result

# Rebind df to the processed DataFrame returned by preprocessing().
df = preprocessing(df) 

# Check the result: per-column count of remaining nulls. The output of
# isnull().sum() is a Series with one entry per column.
df.isnull().sum()
jezrael
  • 822,522
  • 95
  • 1,334
  • 1,252
0

Returning as a data frame might work for your problem.

return pd.DataFrame(df)

For your example:

import pandas as pd

def preprocessing(df):
    """Return a cleaned copy of ``df`` as a DataFrame, keeping every column.

    * ``column1``/``column2`` are replaced by boolean "is not null" flags.
    * ``column3``/``column4`` have NaN filled with the column's own maximum
      (a column that is entirely NaN stays NaN).
    * Every column that is not one of ``column1`` .. ``column8`` has NaN
      filled with 0.

    ``column5`` .. ``column8`` are passed through untouched.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the same columns, in the same order, as ``df``.
    """
    df_columns = ['column1', 'column2', 'column3', 'column4',
                  'column5', 'column6', 'column7', 'column8']
    flag_columns = set(df_columns[0:2])
    max_fill_columns = set(df_columns[2:4])
    known_columns = set(df_columns)

    # Copy first so the caller's frame is left untouched.
    cleaned = df.copy()

    # Assign each processed subset back into ``cleaned`` rather than
    # rebinding ``cleaned`` to the subset, which would drop the other columns.
    features = [c for c in cleaned.columns if c in flag_columns]
    if features:
        cleaned[features] = cleaned[features].notna()

    features = [c for c in cleaned.columns if c in max_fill_columns]
    if features:
        max1 = cleaned[features].max()
        cleaned[features] = cleaned[features].fillna(max1)

    # Fill na with 0
    features = [c for c in cleaned.columns if c not in known_columns]
    if features:
        cleaned[features] = cleaned[features].fillna(0)

    # ``cleaned`` is already a DataFrame; the constructor call only makes the
    # return type explicit.
    return pd.DataFrame(cleaned)