For non-numeric attributes, Weka allows you to retrieve the unique values via Attribute.numValues()
(how many are there) and Attribute.value(int)
(the -th value).
package weka;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils;
public class LOOByValue {
/**
* 1st arg: ARFF file to load
* 2nd arg: 0-based index in ARFF to use for class
* 3rd arg: 0-based index in ARFF to use for LOO
*
* @param args the command-line arguments
* @throws Exception if loading/processing of data fails
*/
public static void main(String[] args) throws Exception {
// load data
Instances full = ConverterUtils.DataSource.read(args[0]);
full.setClassIndex(Integer.parseInt(args[1]));
int looCol = Integer.parseInt(args[2]);
Attribute looAtt = full.attribute(looCol);
if (looAtt.isNumeric())
throw new IllegalStateException("Attribute cannot be numeric!");
// iterate unique values to create train/test splits
for (int i = 0; i < looAtt.numValues(); i++) {
String value = looAtt.value(i);
System.out.println("\n" + (i+1) + "/" + full.attribute(looCol).numValues() + ": " + value);
Instances train = new Instances(full, full.numInstances());
Instances test = new Instances(full, full.numInstances());
for (int n = 0; n < full.numInstances(); n++) {
Instance inst = full.instance(n);
if (inst.stringValue(looCol).equals(value))
test.add((Instance) inst.copy());
else
train.add((Instance) inst.copy());
}
train.compactify();
test.compactify();
// TODO do something with the data
System.out.println("train size: " + train.numInstances());
System.out.println("test size: " + test.numInstances());
}
}
}
With Weka's anneal UCI dataset and the surface-quality
for leave-one-out, you can generate something like this:
1/5: ?
train size: 654
test size: 244
2/5: D
train size: 843
test size: 55
3/5: E
train size: 588
test size: 310
4/5: F
train size: 838
test size: 60
5/5: G
train size: 669
test size: 229