I have a DataFrame that contains float values. I need to produce a new DataFrame containing the rank of each value within its column. Example below:
import pandas as pd
import numpy as np
import numba as nb
# The explicit signature makes Numba compile eagerly for a 2-D float64 input
# and a 2-D int32 result; parallel=True lets the prange loop below be
# distributed over threads, one column per iteration.
@nb.njit('int32[:,:](float64[:,:])', parallel=True)
def fastRanks(df):
    """Rank every column of a 2-D float array in descending order.

    The largest value of a column receives rank 1. Tied values share the
    lowest rank of their group (the "min" ranking method), and the next
    distinct value gets its 1-based position in the sorted order.

    Parameters
    ----------
    df : 2-D float64 array of shape (n, m)
        Values to rank; each column is ranked independently.

    Returns
    -------
    2-D int32 array of shape (n, m)
        ``res[i, j]`` is the rank of ``df[i, j]`` within column ``j``.
    """
    n, m = df.shape
    res = np.empty((n, m), dtype=np.int32)
    if n == 0:
        # No rows: nothing to rank, and order[0] below would be out of range.
        return res
    for col in nb.prange(m):
        # Negate so that an ascending argsort yields a descending ranking.
        neg_col = -df[:, col]
        order = np.argsort(neg_col)
        # Compute the ranks with the min method: walk the sorted order and
        # only advance the rank when the value actually changes.
        prev_val = neg_col[order[0]]
        prev_rank = 1
        res[order[0], col] = prev_rank
        for row in range(1, n):
            cur_val = neg_col[order[row]]
            if cur_val != prev_val:
                prev_val = cur_val
                prev_rank = row + 1
            res[order[row], col] = prev_rank
    return res
df = pd.DataFrame(np.random.uniform(0,50,size=(100000, 5000)), columns=list(range(0,5000)))
%%time
ranking = pd.DataFrame(range(1, 100000 + 1), columns=['index'])
ranking = pd.concat([ranking, pd.DataFrame(fastRanks(df[range(0, 5000 )].to_numpy()))],
axis=1)
This ends up taking about 24 seconds to run.
Does anyone have any suggestions on how to speed this up?