I've been working on a python script to do Probit analysis to find the Lower Limit of Detection (LLoD) for assays run in our lab. I had a script that worked perfectly last week, but was messy and lacked any kind of input-checking to make sure the user's input was valid.
On running the script, the user was prompted for a few questions (the column headers for the relevant data in the .csv file containing the data to analyze, and whether or not to use the data in said columns as-is or in log_10 form). The script would then -perform the necessary data cleanup and calculations -print out a table of the relevant data along with the equation and R^2 value for the linear regression -display a graph of the data along with the linear regression and calculated LLoD -print out "The lower limit of detection at 95% CI is [whatever]".
Now, on running the script, the program stops after printing the data table and displaying the graph (the regression equation and R^2 value are not printed, nor is anything afterwards). Furthermore, python doesn't return to prompt for input with the standard >>>
, and I have to exit and reopen Python. Does anyone have any idea what's going on? Full code at the bottom of post, sample data can be found here. Note: This is the exact data I've been using, which was working last week.
(P.S. any misc. tips for cleaning up the code would be appreciated as well, so long as the function is unchanged. I come from a C background and adapting to Python is still a work in progress...)
Code: (FWIW I'm running 3.8)
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy.stats import norm
from numpy.polynomial import Polynomial
from tkinter import filedialog
from tkinter import *
# Initialize tkinter
root = Tk()
root.withdraw()
# Prompt user for data file and column headers, ask whether to use log(qty)
print("In the directory prompt, select the .csv file containing data for analysis")
path = filedialog.askopenfilename()
#data = pd.read_csv(path, usecols=[conc, detect])
data = pd.read_csv(path)
while True:
conc = input("Enter the column header for concentration/number of copies: ")
if conc in data.columns:
break
else:
print('Invalid input. Column \''+str(conc)+'\' does not exist. Try again')
continue
while True:
detect = input("Enter the column header for target detection: ")
if detect in data.columns:
break
else:
print('Invalid input. Column \''+str(detect)+'\' does not exist. Try again')
continue
while True:
logans = input("Analyze using log10(concentration/number of copies)? (y/n): ")
if logans == 'y':
break
elif logans == 'n':
break
else:
print('Invalid input. Please enter either y or n.')
continue
# Read the columns of data specified by the user and rename them for consistency
data = data.rename(columns={conc:"qty", detect:"result"})
# Create list of unique values for RNA quantity, initialize vectors of same length
# to store probabilies and probit scores for each
qtys = data['qty'].unique()
log_qtys = [0] * len(qtys)
prop = [0] * len(qtys)
probit = [0] * len(qtys)
# Function to get the hitrate/probability of detection for a given quantity
# Note: any values in df.result that cannot be parsed as a number will be converted to NaN
def hitrate(qty, df):
t_s = df[df.qty == qty].result
t_s = t_s.apply(pd.to_numeric, args=('coerce',)).isna()
return (len(t_s)-t_s.sum())/len(t_s)
# Iterate over quantities to calculate log10(quantity), the corresponding probability
# of detection, and its associated probit score
for idx, val in enumerate(qtys):
log_qtys[idx] = math.log10(val)
prop[idx] = hitrate(val, data)
probit[idx] = 5 + norm.ppf(prop[idx])
# Create a dataframe (with headers) composed of the quantaties and their associated
# probabilities and probit scores, then drop rows with probability of 0 or 1
hitTable = pd.DataFrame(np.vstack([qtys,log_qtys,prop,probit]).T, columns=['qty','log_qty','probability','probit'])
hitTable.probit.replace([np.inf,-np.inf],np.nan, inplace=True)
hitTable.dropna(inplace=True)
def regPlot(x, y, log):
# Update parameters, set y95 to probit score corresponding to 95% CI
params = {'mathtext.default': 'regular'}
plt.rcParams.update(params)
y95 = 6.6448536269514722
# Define lambda function for a line, run regression, and find the coefficient of determination
regFun = lambda m, x, b : (m*x) + b
regression = scipy.stats.linregress(x,y)
r_2 = regression.rvalue*regression.rvalue
# Solve y=mx+b for x at 95% CI
log_llod = (y95 - regression.intercept) / regression.slope
xmax = log_llod * 1.2
# Start plotting all the things!
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_ylabel('Probit score\n$(\sigma + 5)$')
if log == 'y':
ax.set_xlabel('$log_{10}$(input quantity)')
elif log == 'n':
ax.set_xlabel('input quantity')
else:
raise ValueError('Error when calling regPlot(x,y,log) - User input invalid.')
x_r = [0, xmax]
y_r = [regression.intercept, regFun(regression.slope,x_r[1],regression.intercept)]
ax.plot(x_r, y_r, '--k') # linear regression
ax.plot(log_llod, y95, color='red', marker='o', markersize=8) # LLOD point
ax.plot([0,xmax], [y95,y95], color='red', linestyle=':') # horiz. red line
ax.plot([log_llod,log_llod], [regFun(regression.slope,x_r[0],regression.intercept),7.1], color='red', linestyle=':') # vert. red line
ax.plot(x, y, 'bx') # actual (qty, probit) data points
ax.grid() # grid
plt.show()
print('\n Linear regression using least-squares method yields:\n')
print('\t\ty = '+str("%.3f"%regression.slope)+'x + '+str("%.3f"%regression.intercept)+'\n')
print('\twith a corresponding R-squared value of', str("%.5f"%r_2)+"\n")
return regression.slope, regression.intercept, r_2, regression.stderr, regression.intercept_stderr, log_llod
print('\n', hitTable, '\n')
if logans == 'y':
m, b, r_2, stderr, int_stderr, log_llod = regPlot(hitTable.log_qty, hitTable.probit, logans)
llod_95 = 10**log_llod
if r_2 < 0.9:
print('WARNING: low r-squared value for linear regression. Try re-analyzing without using log10.')
elif logans == 'n':
m, b, r_2, stderr, int_stderr, log_llod = regPlot(hitTable.qty, hitTable.probit, logans)
llod_95 = log_llod
if r_2 < 0.9:
print('WARNING: low r-squared value for linear regression. Try re-analyzing using log10.')
else:
raise ValueError('Error when attempting to evaluate llod_95 - User input invalid.')
print("\nThe lower limit of detection (LLoD) at 95% CI is " + str("%.4f"%llod_95) + ".\n")