I built a machine learning model to classify documents using NaiveBayesMultinomial. I am using the Weka Java API to train and test the model. To evaluate the model's performance I want to generate a ROC curve, but I do not understand how to calculate the TPR (true positive rate) and FPR (false positive rate) for different threshold values. I have attached my source code and a sample dataset. I would be very grateful if anyone could help me calculate TPR and FPR at different threshold values so that I can plot the ROC curve. Thanks in advance for your help. My Java code:
package smote;
import java.io.File;
import java.util.Random;
import weka.classifiers.Classifier;
import weka.classifiers.bayes.NaiveBayesMultinomial;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;
/**
 * Evaluates a {@link NaiveBayesMultinomial} document classifier with repeated,
 * stratified 10-fold cross-validation and prints precision, recall and
 * F-measure for every run plus their averages over all runs.
 *
 * <p>The class value with index 1 is treated as the positive class. Recall as
 * computed here equals the true positive rate (TPR) at the classifier's default
 * decision; the matching false positive rate would be {@code fp / (fp + tn)}.
 */
public class calRoc {

    /** Number of times the complete cross-validation is repeated. */
    private static final int NUM_RUNS = 10;

    /** Number of cross-validation folds in each run. */
    private static final int NUM_FOLDS = 10;

    /** Index of the class value counted as "negative". */
    private static final double NEGATIVE = 0.0;

    /** Index of the class value counted as "positive". */
    private static final double POSITIVE = 1.0;

    /**
     * Loads the ARFF file, converts its text into a word vector and runs the
     * repeated cross-validation, printing the metrics to standard output.
     *
     * @param agrs command-line arguments (unused)
     * @throws Exception if the data cannot be read, filtered or classified
     */
    public static void main(String[] agrs) throws Exception {
        Instances data = loadVectorizedData("...../DocsFIle.arff");
        // The class attribute is expected at index 0 of the filtered data.
        data.setClassIndex(0);

        double totalPrecision = 0;
        double totalRecall = 0;
        double totalFmeasure = 0;

        for (int run = 0; run < NUM_RUNS; run++) {
            Classifier classifier = new NaiveBayesMultinomial();
            Random random = new Random(1);
            // randomize() shuffles in place, so each run starts from the
            // ordering left behind by the previous run.
            data.randomize(random);
            data.stratify(NUM_FOLDS);

            int tp = 0;
            int fp = 0;
            int fn = 0;
            int tn = 0;
            for (int fold = 0; fold < NUM_FOLDS; fold++) {
                Instances trains = data.trainCV(NUM_FOLDS, fold, random);
                Instances tests = data.testCV(NUM_FOLDS, fold);
                classifier.buildClassifier(trains);
                for (int j = 0; j < tests.numInstances(); j++) {
                    Instance instance = tests.instance(j);
                    double actual = instance.classValue();
                    double predicted = classifier.classifyInstance(instance);
                    if (predicted == POSITIVE && actual == POSITIVE) {
                        tp++;
                    } else if (predicted == POSITIVE && actual == NEGATIVE) {
                        fp++;
                    } else if (predicted == NEGATIVE && actual == POSITIVE) {
                        fn++;
                    } else if (predicted == NEGATIVE && actual == NEGATIVE) {
                        tn++;
                    }
                }
            }

            // Each metric stays 0 when its denominator is 0.
            double precision = 0;
            if (tp + fp > 0) {
                precision = (double) tp / (tp + fp);
            }
            double recall = 0;
            if (tp + fn > 0) {
                recall = (double) tp / (tp + fn);
            }
            double fmeasure = 0;
            if (precision + recall > 0) {
                fmeasure = 2 * precision * recall / (precision + recall);
            }

            System.out.println("The " + (run + 1) + "-th run");
            System.out.println("Precision: " + precision);
            System.out.println("Recall: " + recall);
            System.out.println("Fmeasure: " + fmeasure);

            totalPrecision += precision;
            totalRecall += recall;
            totalFmeasure += fmeasure;
        }

        System.out.println("avgPrecision: " + totalPrecision / NUM_RUNS);
        System.out.println("avgRecall: " + totalRecall / NUM_RUNS);
        System.out.println("avgFmeasure: " + totalFmeasure / NUM_RUNS);
    }

    /**
     * Reads the raw ARFF data and turns its string attribute into word-vector
     * attributes with a {@link StringToWordVector} filter.
     *
     * @param arffPath path of the ARFF file to read
     * @return the filtered data set; its class index is not set yet
     * @throws Exception if reading the file or applying the filter fails
     */
    private static Instances loadVectorizedData(String arffPath) throws Exception {
        Instances rawData = DataSource.read(arffPath);

        StringToWordVector filter = new StringToWordVector(10000);
        String[] options = {
            "-W", "10000", "-L", "-M", "2",
            "-stemmer", "weka.core.stemmers.IteratedLovinsStemmer",
            "-stopwords-handler", "weka.core.stopwords.Rainbow",
            "-tokenizer", "weka.core.tokenizers.AlphabeticTokenizer"
        };
        filter.setOptions(options);
        filter.setIDFTransform(true);
        filter.setStopwords(new File("/Research/DoctoralReseacher/IEICE/Dataset/stopwords.txt"));
        // setInputFormat must be the last call before the filter is applied,
        // otherwise configuration made after it may not be taken into account.
        filter.setInputFormat(rawData);

        return Filter.useFilter(rawData, filter);
    }
}
Sample dataset with a few instances:
@relation 'CamelBug'
@attribute Feature string
@attribute class-att {0,1}
@data
'XQuery creates an empty out message that makes it impossible to chain
more processors behind it ',1
'org apache camel Message hasAttachments is buggy ',0
'unmarshal new JaxbDataFormat com foo bar returning JAXBElement ',0
'Can t get the soap header when the camel cxf endpoint working in the
PAYLOAD data fromat ',0
'camel jetty Exchange failures should not be returned as ',1
'Delayer not working as expected ',1
'ParallelProcessing and executor flags are ignored in Multicast
processor ',1