Back again. The script I'm working on for lower limit of detection (LLoD) for laboratory assays ran fine yesterday, but now that I'm trying to direct the output to a .txt, I'm running into all kinds of errors. Have tried the sys.stdout = open("test.txt", "w")
with sys.stdout.close()
at the end, to no avail (further, this throws a ValueError: I/O operation on closed file.
error if I run the script again, even if I removed the sys.stdout
stuff). My most recent attempt was to use with open(outName, 'w') as f: print(hitTable, '\n', file=f)
for the first block to be printed to a .txt (note: before this block, I have the following:
ts = time.time()
dt = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d')
tm = datetime.datetime.fromtimestamp(ts).strftime('%H:%M')
outName = dt + " Probit Analysis " + tm +".txt"
as I expect that this script will be run multiple times a day, and don't want the output file overwritten for multiple runs)
All the examples I found were very basic, like
with open('file.txt', 'w') as f:
print('some text', file=f)
but nothing explaining what to do when you want something printed to the file, then a bunch of calculations are done, then some more stuff is to be written, etc., so later I have lines like
if r_2 < 0.9:
with open(outName, 'a') as f:
print('WARNING: low r-squared value for linear regression. Try re-analyzing without using log10.', file=f)
but I'm not sure if that's the right way to handle it.
My desired output would look like
qty log_qty probability probit
0 1.0 0.000000 0.07 3.524209
1 2.0 0.301030 0.15 3.963567
2 3.0 0.477121 0.24 4.293697
3 4.0 0.602060 0.32 4.532301
4 5.0 0.698970 0.40 4.746653
5 10.0 1.000000 0.60 5.253347
6 20.0 1.301030 0.87 6.126391
7 30.0 1.477121 0.90 6.281552
8 40.0 1.602060 0.95 6.644854
9 50.0 1.698970 0.98 7.053749
Linear regression using least-squares method yields:
y = 2.062x + 3.353
with a corresponding R-squared value of 0.99066
The lower limit of detection (LLoD) at 95% CI is 39.4563.
possibly adding a header at the top along the lines of "Probit analysis YYYY-MM-DD HH:MM:SS", but that's not important atm.
Code follows, sample data available here
import os
import sys
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import datetime
from scipy.stats import norm
from numpy.polynomial import Polynomial
from tkinter import filedialog
from tkinter import *
# Initialize tkinter
root = Tk()
root.withdraw()
# Function to check whether user-input column exists in data
def checkInput(prompt,colCheck):
while True:
qVar = input(f"Enter the column header for {prompt}: ")
if qVar in colCheck.columns:
break
print(f"Invalid input. Column '{qVar}' does not exist. Try again.")
return qVar
# Function to get the hitrate/probability of detection for a given quantity
# Note: any values in df.result that cannot be parsed as a number will be converted to NaN
def hitrate(qty, df):
t_s = df[df.qty == qty].result
t_s = t_s.apply(pd.to_numeric, args=('coerce',)).isna()
return (len(t_s)-t_s.sum())/len(t_s)
# Main function for generating plot and calculating LLoD-95
def regPlot(x, y, log):
ts = time.time()
dt = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d')
tm = datetime.datetime.fromtimestamp(ts).strftime('%H:%M')
params = {'mathtext.default': 'regular'}
plt.rcParams.update(params)
y95 = 6.6448536269514722 # probit score corresponding to 95% CI
regFun = lambda m, x, b : (m*x) + b # equation for line
regression = scipy.stats.linregress(x,y)
r_2 = regression.rvalue*regression.rvalue #calculate R^2
# Solve linear equation for x at 95% CI
log_llod = (y95 - regression.intercept) / regression.slope
xmax = log_llod * 1.2
# Start plotting all the things!
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title('Limit of Detection')
ax.set_ylabel('Probit score\n$(\sigma + 5)$')
while True:
if log == 'y':
ax.set_xlabel('$log_{10}$(input quantity)')
break
elif log == 'n':
ax.set_xlabel('input quantity')
break
raise ValueError('Error when calling regPlot(x,y,log) - User input invalid.')
x_r = [0, xmax]
y_r = [regression.intercept, regFun(regression.slope,x_r[1],regression.intercept)]
ax.plot(x_r, y_r, '--k') # linear regression
ax.plot(log_llod, y95, color='red', marker='o', markersize=8) # LLOD point
ax.plot([0,xmax], [y95,y95], color='red', linestyle=':') # horiz. red line
ax.plot([log_llod,log_llod], [regFun(regression.slope,x_r[0],regression.intercept),7.1], color='red', linestyle=':') # vert. red line
ax.plot(x, y, 'bx') # actual (qty, probit) data points
ax.grid() # grid
ax.set_xlim(left=0)
plt.ion()
plt.show()
plt.savefig(f"{dt} Probit Analysis at {tm}.png")
plt.pause(0.001)
return regression.slope, regression.intercept, r_2, regression.stderr, regression.intercept_stderr, log_llod
def printResult(m,b,r2):
print('\n Linear regression using least-squares method yields:\n')
print('\t\ty = '+str("%.3f"%regression.slope)+'x + '+str("%.3f"%regression.intercept)+'\n')
print('\twith a corresponding R-squared value of', str("%.5f"%r_2)+"\n")
# Prompt user for data file and column headers, ask whether to use log(qty)
print("In the directory prompt, select the .csv file containing data for analysis")
path = filedialog.askopenfilename()
#data = pd.read_csv(path, usecols=[conc, detect])
data = pd.read_csv(path)
conc = checkInput('concentration/copy number', data)
detect = checkInput('target detection', data)
while True:
logans = input("Analyze using log10(concentration/number of copies)? (y/n): ")
if logans == 'y' or logans == 'n':
break
print('Invalid input. Please enter either y or n.')
# Get timestamps for file names
ts = time.time()
dt = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d')
tm = datetime.datetime.fromtimestamp(ts).strftime('%H:%M')
outName = dt + " Probit Analysis " + tm +".txt"
# Read the columns of data specified by the user and rename them for consistency
data = data.rename(columns={conc:"qty", detect:"result"})
# Create list of unique qty values and initialize corresponding
# log_10, probability, and probit vectors
qtys = data['qty'].unique()
log_qtys = [0] * len(qtys)
prop = [0] * len(qtys)
probit = [0] * len(qtys)
# Calculate log10(qty), detection probability, and associated probit score for each unique qty
for idx, val in enumerate(qtys):
log_qtys[idx] = math.log10(val)
prop[idx] = hitrate(val, data)
probit[idx] = 5 + norm.ppf(prop[idx])
# Create a dataframe of quantaties, log(qtys), probabilities, and probit scores.
hitTable = pd.DataFrame(np.vstack([qtys,log_qtys,prop,probit]).T, columns=['qty','log_qty','probability','probit'])
# Convert infinite probit values to NaN and drop those rows.
hitTable.probit.replace([np.inf,-np.inf],np.nan, inplace=True)
hitTable.dropna(inplace=True)
with open(outName, 'w') as f:
print(hitTable, '\n', file=f)
while True:
if logans == 'y':
m, b, r_2, stderr, int_stderr, log_llod = regPlot(hitTable.log_qty, hitTable.probit, logans)
printResult(m,b,r_2)
llod_95 = 10**log_llod
if r_2 < 0.9:
with open(outName, 'a') as f:
print('WARNING: low r-squared value for linear regression. Try re-analyzing without using log10.', file=f)
break
elif logans == 'n':
m, b, r_2, stderr, int_stderr, log_llod = regPlot(hitTable.qty, hitTable.probit, logans)
printResult(m,b,r_2)
llod_95 = log_llod
if r_2 < 0.9:
with open(outName, 'a') as f:
print('WARNING: low r-squared value for linear regression. Try re-analyzing using log10.', file=f)
break
raise ValueError('Error when attempting to evaluate llod_95 - User input invalid.')
with open(outName, 'a') as f:
print("\nThe lower limit of detection (LLoD) at 95% CI is " + str("%.4f"%llod_95) + ".\n", file=f)
#sys.stdout.close()
EDIT: If possible, it'd be helpful to have the output written to a txt as well as printed in the CLI, so I don't have to double check the output every time during testing to make sure things are showing correctly. Also edited for formatting.