1

I have a pandas dataframe with columns val_min and val_max below:

df 
   val_min  val_max
0   1001.0  1500.0
3   401.0   500.0
4   401.0   500.0
5   401.0   500.0
6   501.0   600.0

Now I am trying to create a new column using these two columns with pandas apply function and create a new column based on some certain functions defined in my function, see my attempt below:

def get_range(df, val_min, val_max, new_col):
    if df[val_min] <500 & df[val_max]<500:
        df[new_col] = "<500"
        
    elif df[val_min] >500 & df[val_max]<=1000:
        df[new_col] = "500 - 1000"
        
    elif df[val_min] >1000 & df[val_max]<=2000:
        df[new_col] = "1000 - 2000"
        
    elif df[val_min] >2000 & df[val_max]<=5000:
        df[new_col] = "2000 - 5000"
        
    elif df[val_min] >5000 & df[val_max]<=7500:
        df[new_col] = "5000 - 7500"
        
    elif df[val_min] >7500 & df[val_max]<=10000:
        df[new_col] = "7500 - 10000"
        
    elif df[val_min] >10000 & df[val_max]<=12000:
        df[new_col] = "10000 - 12000"
        
    elif df[val_min] >12000 & df[val_max]<=15000:
        df[new_col] = "12000 - 15000"
        
    elif df[val_min] >15000 & df[val_max]>15000:
        df[new_col] = ">15000"

df["new_col"] = df[["val_min", "val_max"]].apply(get_range, axis=1)
JA-pythonista
  • 1,225
  • 1
  • 21
  • 44

4 Answers4

1

You should change the function's structure to this:

def get_range(x):
    val_min=x.val_min
    val_max=x.val_max
    if (val_min <500)&(val_max<500):
        return "<500"
        
    elif (val_min >500)&(val_max<=1000):
        return  "500 - 1000"
    ...

And then apply this to the dataframe:

df["new_col"] = df.apply(get_range, axis=1)

So, for example:

import pandas as pd


def get_range(x):
    val_min=x.val_min
    val_max=x.val_max
    if (val_min <500 )&(val_max<500):
        return "<500"
        
    elif (val_min>500)&(val_max<=1000):
        return  "500 - 1000"
        

df = pd.DataFrame({'val_min':[600, 34], 'val_max':[800, 340]})

df["new_col"] = df.apply(get_range, axis=1)

Output:

   val_min  val_max     new_col
0      600      800  500 - 1000
1       34      340        <500
MrNobody33
  • 6,413
  • 7
  • 19
  • Thanks for this. Unfortunately., I get the following error: `TypeError: ufunc 'bitwise_and' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''` – JA-pythonista Jul 15 '20 at 12:50
  • 1
    You should change `val_min <500 & val_max<500` to `(val_min <500)&(val_max<500)` or to `(val_min<500) and (val_max<500)`. Just changed in my answer – MrNobody33 Jul 15 '20 at 12:54
  • 1
    @Pythonista, you can also factorize code by defining the variable that's is going to be returned and, return it at the end of the code, instead of returning it in each if statement. Just a suggestion :) – MrNobody33 Jul 15 '20 at 13:01
  • oh, that's a fantastic idea! I love it! Than you! :D – JA-pythonista Jul 15 '20 at 13:05
1

Consider using np.select

import numpy as np

cases = [(df.val_min < 500) & (df.val_max < 500),
           (df.val_min > 500) & (df.val_max <= 1000),
           (df.val_min > 1000) & (df.val_max <= 2000),
           (df.val_min > 2000) & (df.val_max <= 5000),
           (df.val_min > 5000) & (df.val_max <= 7500),
           (df.val_min > 7500) & (df.val_max <= 10000),
           (df.val_min > 10000) &( df.val_max <= 12000),
           (df.val_min > 12000) &( df.val_max <= 15000),
            (df.val_min > 15000) &( df.val_max > 15000)]


values = ['<500',
          '500 - 1000',
          '1000 - 2000',
          '2000 - 5000',
          '5000 - 7500',
          '7500 - 10000',
          '10000 - 12000',
          '12000 - 15000',
          '>15000']


df['new_col'] = np.select(cases, values, default='')
print(df)

Output

   val_min  val_max      new_col
0   1001.0   1500.0  1000 - 2000
1    401.0    500.0             
2    401.0    500.0             
3    401.0    500.0             
4    501.0    600.0   500 - 1000
Balaji Ambresh
  • 4,977
  • 2
  • 5
  • 17
0

could you try this ? it might work

def get_range(df, val_min, val_max, new_col):
    if val_min <500 & val_max <500:
        return "<500"

    elif val_min >500 & val_max <=1000:
        return "500 - 1000"
[...]

df["new_col"] = df[["val_min", "val_max"]].apply(get_range, axis=1)
Robin
  • 51
  • 5
0

you can use apply with lambda

df = pd.DataFrame({"val_min":list(range(10,1000,100)),"val_max":list(range(20,2000,200))})


def get_range(val_min, val_max):
    if val_min <500 & val_max<500:
        new_col = "<500"
        
    elif val_min >500 & val_max<=1000:
        new_col = "500 - 1000"
        
    elif val_min >1000 & val_max<=2000:
        new_col = "1000 - 2000"
        
    elif val_min >2000 & val_max<=5000:
        new_col = "2000 - 5000"
        
    elif val_min >5000 & val_max<=7500:
        new_col = "5000 - 7500"
        
    elif val_min >7500 & val_max<=10000:
        new_col = "7500 - 10000"
        
    elif val_min >10000 & val_max<=12000:
        new_col = "10000 - 12000"
        
    elif val_min >12000 & val_max<=15000:
        new_col = "12000 - 15000"
        
    elif val_min >15000 & val_max>15000:
        new_col = ">15000"
    return new_col

df["new_col"] = df[["val_min", "val_max"]].apply(lambda row :get_range(*row), axis=1)

print(df)

   val_min  val_max     new_col
0       10       20        <500
1      110      220        <500
2      210      420        <500
3      310      620  500 - 1000
4      410      820  500 - 1000
5      510     1020  500 - 1000
6      610     1220  500 - 1000
7      710     1420  500 - 1000
8      810     1620  500 - 1000
9      910     1820  500 - 1000
[Finished in 1.8s]
Akhilesh_IN
  • 1,217
  • 1
  • 13
  • 19