Adding to the code from this answer, you can create a dataframe out_df
based on the characteristics of df
— although not the quartiles. The more data points there are (the higher the count), the more closely the distribution of each generated column will match the corresponding column of df.
import pandas as pd
import numpy as np
import scipy.stats
def my_distribution(min_val, max_val, mean, std):
    """Build a scaled beta distribution matching the given summary statistics.

    Parameters
    ----------
    min_val, max_val : float
        Lower and upper bounds of the distribution's support.
    mean, std : float
        Target mean and standard deviation of the distribution.

    Returns
    -------
    scipy.stats.rv_frozen
        A frozen beta distribution with the requested support, mean and
        standard deviation.

    Raises
    ------
    ValueError
        If the parameters cannot produce a valid beta distribution
        (degenerate support, mean outside the open interval
        (min_val, max_val), or a std too large for the support).
    """
    scale = max_val - min_val
    location = min_val
    # A degenerate or inverted support would cause a division by zero (or a
    # negative variance) below; report it as the documented ValueError.
    if scale <= 0:
        raise ValueError('Cannot create distribution for the given parameters.')
    # Mean and variance of the unscaled (standard, [0, 1]) beta distribution.
    unscaled_mean = (mean - min_val) / scale
    unscaled_var = (std / scale) ** 2
    # The unscaled mean must lie strictly inside (0, 1); at the boundaries
    # the expression for t is undefined (division by zero) or meaningless,
    # and the original code crashed with ZeroDivisionError instead of
    # raising the documented ValueError.
    if not 0 < unscaled_mean < 1:
        raise ValueError('Cannot create distribution for the given parameters.')
    # Solve the beta distribution's mean/variance equations for alpha and
    # beta: with t = mu / (1 - mu) we have alpha = t * beta, and the
    # variance formula then yields beta = t / (var * (t+1)^3) - 1 / (t+1),
    # which is the expanded polynomial expression below.
    t = unscaled_mean / (1 - unscaled_mean)
    beta = ((t / unscaled_var) - (t * t) - (2 * t) - 1) / ((t * t * t) + (3 * t * t) + (3 * t) + 1)
    alpha = beta * t
    # Not every (mean, std) pair is achievable on the given support.
    if alpha <= 0 or beta <= 0:
        raise ValueError('Cannot create distribution for the given parameters.')
    # Shift and stretch the standard beta onto [min_val, max_val].
    return scipy.stats.beta(alpha, beta, scale=scale, loc=location)
# Draw synthetic samples whose per-column count, min, max, mean and std
# match those reported by ``df.describe()`` (the quartiles are not matched).
desc = df.describe()
out_df = pd.DataFrame()
for column in desc.columns:
    stats = desc[column]
    fitted = my_distribution(stats["min"],
                             stats["max"],
                             stats["mean"],
                             stats["std"])
    # One random draw per original data point in this column.
    out_df[column] = fitted.rvs(int(stats["count"]))
# Difference between the synthetic and original summary statistics
# (notebook-style display expression; shrinks as the count grows).
out_df.describe().sub(desc)