I combed through the source code of their benchmark and found that you only get the speedup when you run a lot of operations in a row, so I do think it's an overhead issue. I put together the following super simple benchmark comparing gpu.js to numeric.js. Here it is if anyone is interested:
var gpu = new GPU();
var size = 512;
var scale = 10;
var iterations = 100;
// Scaling up the matrices decreases the effect of precision errors
var A = numeric.mul(numeric.random([size, size]), scale);
var B = numeric.mul(numeric.random([size, size]), scale);
// I know eval is dangerous but I couldn't get the size in any other way
function multGen(size) {
  return eval("(function(A, B) { var sum = 0; for (var i = 0; i < " + size + "; i++) { sum += A[this.thread.y][i] * B[i][this.thread.x]; } return sum; })");
}
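// Untested sketch of an eval-free alternative: new Function builds the same
// source text at run time, and I'm assuming gpu.js only needs the function's
// source via toString(), which new Function also provides:
// function multGen(size) {
//   return new Function('A', 'B',
//     'var sum = 0;' +
//     'for (var i = 0; i < ' + size + '; i++) {' +
//     '  sum += A[this.thread.y][i] * B[i][this.thread.x];' +
//     '}' +
//     'return sum;');
// }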
var mat_mult = gpu.createKernel(multGen(size)).dimensions([size, size]);
var before = new Date().getTime();
// Note: the first call is inside the timed region, so any one-time setup
// gpu.js does on the first run counts against the GPU side here
var parallel = mat_mult(A, B);
// Need to do many computations to get the advantages of the GPU
for (var i = 0; i < iterations; i++) {
  parallel = mat_mult(A, B);
}
var parTime = (new Date().getTime()) - before;
console.log('Parallel Time: ', parTime);
before = new Date().getTime();
var procedural = numeric.dot(A, B);
// Run the same number of iterations so the comparison is fair
for (var i = 0; i < iterations; i++) {
  procedural = numeric.dot(A, B);
}
var procTime = (new Date().getTime()) - before;
console.log('Procedural Time: ', procTime);
console.log((procTime / parTime) + ' times faster');
// This is for RMSD normalization; flattening and doing min and max that way exceeded the call stack
var max = Math.max(
  Math.max(...A.map(function(row) { return Math.max(...row); })),
  Math.max(...B.map(function(row) { return Math.max(...row); }))
);
var min = Math.min(
  Math.min(...A.map(function(row) { return Math.min(...row); })),
  Math.min(...B.map(function(row) { return Math.min(...row); }))
);
// The matrices will differ due to precision issues, so the normalized RMSD gives an idea of the difference
var nrmsd = Math.sqrt(numeric.sum(numeric.pow(numeric.sub(parallel, procedural), 2)) / size) / (max - min);
console.log('Normalized RMSD: ', nrmsd);
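As an aside, the call-stack overflow from flattening happens because spreading a huge flat array into Math.max/Math.min exceeds the argument limit; a plain double loop avoids it entirely. A rough sketch (the helper name is my own):

function matMinMax(M) {
  // Scan every element once instead of spreading rows into Math.max/Math.min
  var min = Infinity, max = -Infinity;
  for (var i = 0; i < M.length; i++) {
    for (var j = 0; j < M[i].length; j++) {
      if (M[i][j] < min) min = M[i][j];
      if (M[i][j] > max) max = M[i][j];
    }
  }
  return { min: min, max: max };
}

With that, the max and min above become two matMinMax calls instead of nested spreads.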
Running it gave me the following output:
Parallel Time: 20490
Procedural Time: 28736
1.402440214738897 times faster
Normalized RMSD: 0.009671934749138042
These results are pretty good. The eval-based kernel generation may penalize the parallel version somewhat, but it still comes out faster every run. I wouldn't use a setup like this in production, but it works fine for a benchmark.
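One caveat on the timing itself: Date().getTime() only has millisecond resolution. In the browser, performance.now() gives sub-millisecond timestamps, so the same loop could be timed with a finer clock, e.g.:

var t0 = performance.now();
for (var i = 0; i < iterations; i++) {
  parallel = mat_mult(A, B);
}
console.log('Parallel Time: ', performance.now() - t0);

At these run lengths (tens of seconds) it won't change the conclusion, but it matters for smaller matrix sizes.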