I am putting together a table of p-values (var_sig in the code below), but I cannot get the numbers to display with 4 decimal places. I tried the map function, but it raised the error "Unknown format code 'f' for object of type 'str'". Then I tried wrapping pval in float(), but that also raised an error. How can I fix the code below?
# Example data: one row per (car, age) cell with columns 'n' and 'c'.
# 'n_log' is the natural log of 'n'; it is passed as the offset column in the
# glm_freq call at the bottom of the snippet.
insdf = pd.DataFrame(
    {
        'n': [500, 1200, 100, 400, 500, 300],
        'c': [42, 37, 1, 101, 73, 14],
        'car': ['small', 'medium', 'large', 'small', 'medium', 'large'],
        'age': [1, 1, 1, 2, 2, 2],
    }
).assign(n_log=lambda frame: np.log(frame['n']))

# Predictor columns handed to the GLM.
ratingvar = ['car', 'age']
def _fit_glm(hf, family, predictors, response, offset):
    """Train and return an H2O GLM with lambda_ = 0 and p-values enabled."""
    model = H2OGeneralizedLinearEstimator(
        family=family,
        lambda_=0,
        compute_p_values=True,
        offset_column=offset,
    )
    model.train(predictors, response, training_frame=hf)
    return model


def _final_nll(model):
    """Return the last 'negative_log_likelihood' value in the scoring history."""
    return model.scoring_history()['negative_log_likelihood'].iloc[-1]


def glm_freq(df, family, ratingvar, exposure, response, offset):
    """Fit a GLM on ``df`` and return a likelihood-ratio p-value per predictor.

    Each column in ``ratingvar`` is converted to a factor whose levels are
    ordered by descending total ``exposure``. A full model is fitted, then one
    reduced model per predictor (that predictor left out). The p-value of a
    predictor comes from a chi-square test on twice the difference in negative
    log-likelihood between the reduced and the full model.

    Args:
        df: pandas DataFrame holding all referenced columns.
        family: GLM family name forwarded to the H2O estimator.
        ratingvar: list of predictor column names.
        exposure: column summed per level to order each factor's levels.
        response: response column name.
        offset: offset column name forwarded to the H2O estimator.

    Returns:
        pandas DataFrame with columns ``'ratingvar'`` and ``'pval'``; ``'pval'``
        holds strings formatted to 4 decimal places.
    """
    h2o.no_progress()
    hf = h2o.H2OFrame(df)

    # first convert to categorical variables
    print('***Rebasing levels***')
    for r in ratingvar:
        hf[r] = hf[r].asfactor()
        # then re-basing on level with most exposure
        rebase = (
            pd.pivot_table(df, values=[exposure], index=[r], aggfunc='sum')
            .sort_values(exposure, ascending=False)
            .index.to_list()
        )
        rebase_list = [str(x) for x in rebase]
        # NOTE(review): H2O's docs describe set_levels as *replacing* level
        # names aligned with the existing domain; if a true re-ordering is
        # intended, H2OFrame.relevel may be the right call -- confirm.
        hf[r] = hf[r].set_levels(levels=rebase_list)

    print('***Fitting GLM***')
    glm_model = _fit_glm(hf, family, ratingvar, response, offset)

    print('***Printing model diagnostics***')
    print('Model AIC is: ', glm_model.aic())
    print('Model Residual Deviance is: ', glm_model.residual_deviance())
    # This is the *negative* log-likelihood, so smaller means a better fit.
    llf = _final_nll(glm_model)
    print('Log Likelihood is: ', llf)

    print('***Calculating Type III Statistics***')
    # Keep raw float p-values while looping. Formatting the column inside the
    # loop re-formats rows that are already strings and raises
    # "Unknown format code 'f' for object of type 'str'".
    rows = []
    for p in ratingvar:
        remainder_pred = [v for v in ratingvar if v != p]
        glm_model_tmp = _fit_glm(hf, family, remainder_pred, response, offset)
        llvar = _final_nll(glm_model_tmp)
        d_freedom = df[p].unique().shape[0] - 1
        print('Degree of freedom of ', str(p), ' is ', d_freedom)
        # Reduced-model NLL minus full-model NLL, doubled.
        chisqstat = 2 * (llvar - llf)
        print('Log Likelihood of model without ', str(p), ' is ', llvar)
        print('Chi-sq stat of ', str(p), ' is ', chisqstat)
        # Survival function == 1 - cdf, without the cancellation error for
        # very small p-values.
        pval = stats.chi2.sf(chisqstat, d_freedom)
        rows.append({'ratingvar': p, 'pval': float(pval)})

    # Build the table once (DataFrame.append was removed in pandas 2.0) and
    # format the p-values exactly once, after every row is present.
    var_sig = pd.DataFrame(rows, columns=['ratingvar', 'pval'])
    var_sig['pval'] = var_sig['pval'].map('{:.4f}'.format)
    return var_sig
# Run the analysis on the example data with a Poisson family.
glm_freq(
    df=insdf,
    family='poisson',
    ratingvar=ratingvar,
    exposure='n',
    response='c',
    offset='n_log',
)