
I am using the Auto MPG training set from http://archive.ics.uci.edu/ml/datasets/Auto+MPG

My code is:

'use strict';
var brain, fs, normalizeData, trainNetwork, _;

_ = require('lodash');

brain = require('brain');

fs = require('fs');

trainNetwork = function(trainNetworkCb) {
  var net;
  net = new brain.NeuralNetwork();
  return fs.readFile('./data/autodata.csv', function(err, fileData) {
    var fileString, lines, trainingData;
    if (err) {
      return trainNetworkCb(err);
    }
    fileString = fileData.toString();
    lines = fileString.split('\n');
    // First half of the rows is used for training.
    trainingData = lines.splice(0, lines.length / 2);
    trainingData = _.map(trainingData, function(dataPoint) {
      var normalizedData, obj;
      normalizedData = normalizeData(dataPoint);
      obj = {
        input: normalizedData,
        output: {
          continuous: normalizedData.continuous
        }
      };
      // 'continuous' (the MPG value) is the output, so strip it from the input.
      delete obj.input.continuous;
      return obj;
    });
    net.train(trainingData, {
      log: true,
      logPeriod: 100,
      errorThresh: 0.00005
    });
    return trainNetworkCb(null, net);
  });
};

trainNetwork(function(err, net) {
  if (err) {
    throw err;
  }
  return fs.readFile('./data/autodata.csv', function(err, fileData) {
    var fileString, lines, testData;
    if (err) {
      throw err;
    }
    fileString = fileData.toString();
    lines = fileString.split('\n');
    // Second half of the rows is used for testing.
    testData = lines.splice(lines.length / 2);
    testData = _.filter(testData, function(point) {
      return point !== '';
    });
    testData = _.map(testData, function(dataPoint) {
      var normalizedData, obj;
      normalizedData = normalizeData(dataPoint);
      obj = {
        output: {
          continuous: normalizedData.continuous
        },
        input: normalizedData
      };
      delete obj.input.continuous;
      return obj;
    });
    return _.each(testData, function(dataPoint) {
      var output;
      output = net.run(dataPoint.input);
      console.log(output);
      console.log(dataPoint);
      return console.log('');
    });
  });
});

normalizeData = function(dataRow) {
  var cylinders, dataSet, model_years, origins, row;
  dataSet = dataRow.split(',');
  dataSet = _.map(dataSet, function(point) {
    return Number(point);
  });
  row = {};
  // One-hot encode the nominal cylinder values.
  cylinders = [5, 3, 6, 4, 8];
  _.each(cylinders, function(cylinder) {
    row["cylinder" + cylinder] = cylinder === dataSet[0] ? 1 : 0;
  });
  // Scale the continuous fields down to roughly [0, 1].
  row.displacement = dataSet[1] / 500;
  row.horsepower = dataSet[2] / 500;
  row.weight = dataSet[3] / 10000;
  row.acceleration = dataSet[4] / 100;
  model_years = [82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70];
  _.each(model_years, function(model_year) {
    row["model_year" + model_year] = model_year === dataSet[5] ? 1 : 0;
  });
  origins = [2, 3, 1];
  _.each(origins, function(origin) {
    row["origin" + origin] = origin === dataSet[6] ? 1 : 0;
  });
  // The MPG value (the prediction target), scaled.
  row.continuous = dataSet[7] / 100;
  return row;
};

I believe I am normalizing everything correctly. I am using half the data for training and the other half for testing. The data is not ordered, as far as I can tell, so which half is used for which shouldn't matter.

When testing, however, my errors are pretty large: predictions are usually off by about 10 MPG (roughly 30% error). What am I doing incorrectly?

Thanks

Shamoon

1 Answer


The dataset you linked is ordered by model year, so the first half (your training rows) and the second half (your test rows) come from different eras; perhaps drastic changes in technology made the later engines more efficient. A neural network only learns the input-to-output relationship present in its training data, so it can't be expected to predict well on rows drawn from a different era. I would try training the network with all but the last row, and then testing on that row. Can you link me the csv file you're using? The normalizeData function doesn't produce what you want with the linked file (http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data).
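
If you'd rather keep the 50/50 split, one option is to shuffle the rows before splitting, so neither half is biased toward a range of model years. A minimal sketch, assuming the same lodash setup as in the question (the variable names here are illustrative):

// Shuffle rows before splitting so the train/test halves
// aren't separated by model year.
lines = _.filter(fileString.split('\n'), function(line) {
  return line !== '';
});
lines = _.shuffle(lines); // lodash's Fisher-Yates shuffle
trainingLines = lines.slice(0, Math.floor(lines.length / 2));
testLines = lines.slice(Math.floor(lines.length / 2));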

edit:

It seems like regardless of what errorThresh you specify, brain won't run more than 20,000 iterations during training. There are several ways to get around this. You can specify the learningRate of your neural network. Upping the learningRate to 0.6 (the default is 0.3) helped me get more accurate results:

net.train(trainingData, {
  log: true,
  logPeriod: 100,
  errorThresh: 0.00005,
  learningRate: 0.6
});

Higher learningRate means more aggressive weight adjustment, which helps when you aren't running as many iterations as you want.
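
Schematically, each weight is nudged against the error gradient by a step proportional to the learning rate. This is the textbook gradient-descent update, not a quote of brain's internals:

// Illustrative gradient-descent step: a larger learningRate takes a
// bigger step toward lower error on each pass.
function updateWeight(weight, learningRate, errorGradient) {
  return weight - learningRate * errorGradient;
}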

Alternatively, you can specify the total number of iterations in the options object (if not specified, it defaults to 20,000 - see here).

net.train(trainingData, {
  log: true,
  logPeriod: 100,
  errorThresh: 0.00005,
  iterations: 100000
});

Brain stops training when i < iterations && error > errorThresh evaluates to false. So feel free to crank up the iterations count to ensure that the expression becomes false because the error has dropped below your specified errorThresh, not because the iteration cap was hit (source).
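
In other words, the training loop behaves roughly like this (a paraphrase of the stopping logic, not brain's actual source):

// Rough paraphrase of brain's stopping condition; trainOnePass is a
// hypothetical helper standing in for one full pass over the data.
function train(trainingData, iterations, errorThresh) {
  var i = 0;
  var error = 1;
  while (i < iterations && error > errorThresh) {
    error = trainOnePass(trainingData);
    i++;
  }
}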

  • My data is from https://gist.github.com/shamoons/5ab50109dd43a84d1d09, with the fields being: cylinders (nominal: 5,3,6,4,8), displacement (continuous), horsepower (continuous), weight (continuous), acceleration (continuous), model_year (nominal: 82,81,80,79,78,77,76,75,74,73,72,71,70), origin (nominal: 2,3,1), class (continuous). I tried what you said: trained on all but the last 5 rows and used those 5 for testing. It certainly was better, but still not great. – Shamoon Dec 22 '14 at 17:13
  • Thanks, that helps. Updated the main answer with some more guidance - there are several ways you can fix this issue, each a pretty trivial change. – Keenan Lidral-Porter Dec 23 '14 at 05:12
  • Thanks so much. Generally, how does one find and tune these various parameters? Is there a heuristic for it? – Shamoon Dec 23 '14 at 12:37
  • Honestly? Just look at the library source code. Once I tried lowering the `errorThresh` and noticed that it still only ran 20,000 times, I looked to see what exactly the `net.train` method was doing. From there you can see all the different options and how they affect various parts. Normally this would be in the documentation, but it looks like brain.js isn't that well documented. – Keenan Lidral-Porter Dec 23 '14 at 15:00