I'm implementing the Jacobi iteration method in OpenCL. When my kernel takes a matrix with dimension larger than 200, the floating-point values (cl_float on the host, i.e. float in the kernel) overflow.
On the host (CPU) side there are bignum and multiprecision libraries that could help with this; on the GPU side, however, doing bignum or multiprecision arithmetic is not easy.
How would you solve this problem? Any clues?
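One thing I'm already checking on the host, in case the "overflow" is actually divergence: Jacobi is only guaranteed to converge when the matrix is strictly diagonally dominant, and a diverging iterate grows past FLT_MAX, which looks exactly like an overflow. A minimal sketch (isDiagonallyDominant is just my own helper name, not from any library):

#include <math.h>
#include <stdbool.h>

/* Host-side check: Jacobi converges for strictly diagonally dominant
   matrices; if this returns false, divergence is a likely cause of the
   apparent float overflow. */
bool isDiagonallyDominant(const float * matA, unsigned row, unsigned col) {
    for (unsigned i = 0; i < row; ++i) {
        float offDiag = 0.0f;
        for (unsigned j = 0; j < col; ++j)
            if (j != i)
                offDiag += fabsf(matA[i * col + j]);
        if (fabsf(matA[i * col + i]) <= offDiag)
            return false;   /* row i is not strictly dominant */
    }
    return true;
}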
My code for reference:
// Dot product of one matrix row segment with a vector segment.
// vec points into local memory because the kernel reads the preserved
// copy of the solution (outXOld), not the vector being updated.
inline float dotRow(__global const float * mat, __local const float * vec, uint col) {
    float ret = 0.0f;
    for (uint i = 0; i < col; ++i)
        ret += mat[i] * vec[i];
    return ret;
}//dotRow(mat, vec, col)

__kernel void jacobi(
    const uint itemsPerThread,
    const uint maxIteration,
    __global bool * cont,
    __global float * outX,
    __local float * outXOld,
    __global const float * matA, __global const float * vecB,
    const uint row, const uint col)
{
    // NOTE: barrier() only synchronizes within one work-group, so this
    // kernel must be launched as a single work-group (then global id ==
    // local id, which also keeps the outXOld indexing in bounds).
    const uint itemOffset = get_global_id(0) * itemsPerThread;
    for (uint iter = 0; iter < maxIteration; ++iter) {
        // preserve the current outX
        for (uint i = itemOffset; i < itemOffset + itemsPerThread; ++i)
            outXOld[i] = outX[i];
        // reset the convergence flag before the barrier, so no work item
        // can overwrite another's "*cont = true" later in this iteration
        *cont = false;
        barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
        bool contPrivate = false;
        // calculate one iteration:
        //   x_i = (b_i - sum_{j<i} A_ij*x_j - sum_{j>i} A_ij*x_j) / A_ii
        // reading from outXOld (the preserved copy), so concurrent writes
        // to outX by other work items cannot race with these reads
        for (uint i = itemOffset; i < itemOffset + itemsPerThread; ++i) {
            if (i >= row)
                break;
            float s1 = dotRow(matA + (i * col), outXOld, i);
            float s2 = dotRow(matA + (i * col) + (i + 1), outXOld + (i + 1), col - (i + 1));
            outX[i] = (vecB[i] - s1 - s2) / matA[i * col + i];
            // tolerance note: 1e-10f is below float's ULP once |x| > ~1e-3
            if (fabs(outX[i] - outXOld[i]) > 1e-10f)
                contPrivate = true;
        }//for
        if (contPrivate)
            *cont = true;
        barrier(CLK_GLOBAL_MEM_FENCE);
        if (*cont == false)
            break;
    }//for
}
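If full multiprecision turns out to be overkill, a cheaper option would be compensated (Kahan) summation inside the dot product, which bounds the accumulated rounding error without widening the type. A sketch under that assumption (dotRowKahan is a hypothetical drop-in replacement for dotRow; the kernel must be built without -cl-fast-relaxed-math, or the compiler may optimize the compensation away):

inline float dotRowKahan(__global const float * mat, __local const float * vec, uint col) {
    float sum = 0.0f;
    float c = 0.0f;                     // running compensation for lost low-order bits
    for (uint i = 0; i < col; ++i) {
        float y = mat[i] * vec[i] - c;  // fold in the error carried from the last step
        float t = sum + y;              // low-order bits of y are lost here...
        c = (t - sum) - y;              // ...and recovered into c
        sum = t;
    }
    return sum;
}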
Thanks to @aland's hint about emulating more precision using floats. Here's a reference on this (in case it's useful for anyone): Emulate "double" using 2 "float"s
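For reference, here is my understanding of the core of that technique in OpenCL C: a minimal sketch of the standard two-sum / fma-based "double-float" primitives, again assuming the build avoids -cl-fast-relaxed-math so the error terms aren't optimized away.

// An unevaluated sum hi + lo carrying roughly twice float's precision.
typedef float2 df;   // .x = hi, .y = lo

// Knuth's two-sum: s + e == a + b exactly.
inline df twoSum(float a, float b) {
    float s = a + b;
    float v = s - a;
    float e = (a - (s - v)) + (b - v);
    return (df)(s, e);
}

// Fast two-sum, valid when |a| >= |b|.
inline df quickTwoSum(float a, float b) {
    float s = a + b;
    return (df)(s, b - (s - a));
}

// Exact product via fma: p + e == a * b exactly.
inline df twoProd(float a, float b) {
    float p = a * b;
    return (df)(p, fma(a, b, -p));
}

inline df dfAdd(df a, df b) {
    df s = twoSum(a.x, b.x);
    return quickTwoSum(s.x, s.y + a.y + b.y);
}

inline df dfMul(df a, df b) {
    df p = twoProd(a.x, b.x);
    return quickTwoSum(p.x, p.y + a.x * b.y + a.y * b.x);
}

Accumulating the dot product in a df instead of a plain float should then keep roughly 48 bits of mantissa through the summation.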