I combed through the source code of their benchmark and found that you only get the speedup when you run a lot of operations in a row, so I do think it's an overhead issue. I put together the following super simple benchmark comparing gpu.js to numeric.js. Here it is if anyone is interested:
var gpu = new GPU();
var size = 512;
var scale = 10;
var iterations = 100;
// Scaling up the matrices decreases the effect of precision errors
var A = numeric.mul(numeric.random([size, size]), scale);
var B = numeric.mul(numeric.random([size, size]), scale);
// I know eval is dangerous but I couldn't get the size in any other way
function multGen(size) {
  return eval("(function(A, B) { var sum = 0; for (var i = 0; i < " + size + "; i++) { sum += A[this.thread.y][i] * B[i][this.thread.x]; } return sum; })");
}
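// Untested sketch of an eval-free alternative: new Function builds the same
// source text at run time, and I'm assuming gpu.js only needs the function's
// source via toString(), which new Function also provides:
// function multGen(size) {
//   return new Function('A', 'B',
//     'var sum = 0;' +
//     'for (var i = 0; i < ' + size + '; i++) {' +
//     '  sum += A[this.thread.y][i] * B[i][this.thread.x];' +
//     '}' +
//     'return sum;');
// }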
var mat_mult = gpu.createKernel(multGen(size)).dimensions([size, size]);
var before = new Date().getTime();
// Note: the first call is inside the timed region, so any one-time setup
// gpu.js does on the first run counts against the GPU side here
var parallel = mat_mult(A, B);
// Need to do many computations to get the advantages of the GPU
for (var i = 0; i < iterations; i++) {
  parallel = mat_mult(A, B);
}
var parTime = (new Date().getTime()) - before;
console.log('Parallel Time: ', parTime);
before = new Date().getTime();
var procedural = numeric.dot(A, B);
// Run the same number of iterations so the comparison is fair
for (var i = 0; i < iterations; i++) {
  procedural = numeric.dot(A, B);
}
var procTime = (new Date().getTime()) - before;
console.log('Procedural Time: ', procTime);
console.log((procTime / parTime) + ' times faster');
// This is for RMSD normalization; flattening and doing min and max that way exceeded the call stack
var max = Math.max(
  Math.max(...A.map(function(row) { return Math.max(...row); })),
  Math.max(...B.map(function(row) { return Math.max(...row); }))
);
var min = Math.min(
  Math.min(...A.map(function(row) { return Math.min(...row); })),
  Math.min(...B.map(function(row) { return Math.min(...row); }))
);
// The matrices will differ due to precision issues, so the normalized RMSD gives an idea of the difference
var nrmsd = Math.sqrt(numeric.sum(numeric.pow(numeric.sub(parallel, procedural), 2)) / size) / (max - min);
console.log('Normalized RMSD: ', nrmsd);
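As an aside, the call-stack overflow from flattening happens because spreading a huge flat array into Math.max/Math.min exceeds the argument limit; a plain double loop avoids it entirely. A rough sketch (the helper name is my own):

function matMinMax(M) {
  // Scan every element once instead of spreading rows into Math.max/Math.min
  var min = Infinity, max = -Infinity;
  for (var i = 0; i < M.length; i++) {
    for (var j = 0; j < M[i].length; j++) {
      if (M[i][j] < min) min = M[i][j];
      if (M[i][j] > max) max = M[i][j];
    }
  }
  return { min: min, max: max };
}

With that, the max and min above become two matMinMax calls instead of nested spreads.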
Running it gave me the following output:
Parallel Time: 20490
Procedural Time: 28736
1.402440214738897 times faster
Normalized RMSD: 0.009671934749138042
These results are pretty good. The eval-based kernel generation may penalize the parallel version somewhat, but it still comes out faster every run. I wouldn't use a setup like this in production, but it works fine for a benchmark.
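One caveat on the timing itself: Date().getTime() only has millisecond resolution. In the browser, performance.now() gives sub-millisecond timestamps, so the same loop could be timed with a finer clock, e.g.:

var t0 = performance.now();
for (var i = 0; i < iterations; i++) {
  parallel = mat_mult(A, B);
}
console.log('Parallel Time: ', performance.now() - t0);

At these run lengths (tens of seconds) it won't change the conclusion, but it matters for smaller matrix sizes.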