I'm getting a CUDA_ERROR_ILLEGAL_ADDRESS
exception when trying to run a kernel used for calculating Buddhabrot fractal orbits.
extern "C"
__global__ void exec(int iterations, int size,
float* inputR, float* inputI, // Real/Imaginary input
int* output // Output image in one dimension
) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
float cR = inputR[i];
float cI = inputI[i];
float x = 0;
float y = 0;
float outX[1000];
float outY[1000];
for (int j = 0; j < iterations; j++) {
outX[j] = x;
outY[j] = y;
float xNew = (x * x) - (y * y) + cR;
float yNew = (2 * x * y) + cI;
if (xNew * xNew + yNew * yNew > 4) {
for (int k = 1; k < j; k++) {
int curX = (outX[k] + 2 ) * size / 4;
int curY = (outY[k] + 2 ) * size / 4;
int idx = curX + size * curY;
output[idx]++; // <- exception here
}
return;
}
x = xNew;
y = yNew;
}
}
I've tried multiple things now and the error doesn't even seem to be stemming from the array contrary to what I first thought. For example,
output[0] = 0;
will work just fine. However, when I tried to debug idx
(Remember I first thought the error was related to the array), i found out that I can neither assign idx like so
output[0] = idx;
nor use it in a printf statement
if (i == 0) {
printf("%d\n", idx);
}
I've tried the same with curX
and curY
which also refuse to work, however cR
for example will work without any error. There seems to be a problem with the variables assigned inside the innermost loop (I also can't assign k
), so I tried declaring idx
outside of all loops at the start of the function, but to no avail. Still the same error.
Stack trace:
Exception in thread "main" jcuda.CudaException: CUDA_ERROR_ILLEGAL_ADDRESS
at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:330)
at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1938)
at fractal.Buddhabrot.<init>(Buddhabrot.java:96)
at controller.Controller.<init>(Controller.java:10)
at Main.main(Main.java:8)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
Constants:
block size 512*1*1
grid size 64 *1*1
iterations 1000
size 256
inputR, inputI length 64*512
output length 256*256
MCVE:
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.*;
import java.io.File;
import java.util.Random;
import static jcuda.driver.JCudaDriver.*;
public class Stackoverflow {
public static final int SIZE = 256;
public static final long NUM_POINTS = 128 * 128 * 128;
public static final int ITERATIONS = 10000;
public static final int BLOCK_SIZE = 512;
public static final int SIM_THREADS = BLOCK_SIZE * 64;
public static final Random random = new Random();
public static void main(String[] args) {
File ptxFile = new File("Buddha.ptx");
setExceptionsEnabled(true);
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
CUmodule module = new CUmodule();
cuModuleLoad(module, ptxFile.getAbsolutePath());
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "exec");
cuCtxSetLimit(CUlimit.CU_LIMIT_PRINTF_FIFO_SIZE, 4096);
float[] inR = new float[SIM_THREADS];
float[] inI = new float[SIM_THREADS];
int[] out = new int[SIZE * SIZE];
CUdeviceptr deviceInputR = new CUdeviceptr();
cuMemAlloc(deviceInputR, inR.length * Sizeof.FLOAT);
CUdeviceptr deviceInputI = new CUdeviceptr();
cuMemAlloc(deviceInputI, inI.length * Sizeof.FLOAT);
CUdeviceptr deviceOutput = new CUdeviceptr();
cuMemAlloc(deviceOutput, out.length * Sizeof.INT);
for (long i = 0; i < NUM_POINTS; i += SIM_THREADS) {
for (int j = 0; j < SIM_THREADS; j++) {
inR[j] = random.nextFloat() * 4f - 2f;
inI[j] = random.nextFloat() * 4f - 2f;
}
System.out.println("GPU START");
cuMemcpyHtoD(deviceInputR, Pointer.to(inR), inR.length * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputI, Pointer.to(inI), inI.length * Sizeof.FLOAT);
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[]{ITERATIONS}),
Pointer.to(new int[]{SIZE}),
Pointer.to(deviceInputR),
Pointer.to(deviceInputI),
Pointer.to(deviceOutput)
);
int gridSize = (int) Math.ceil(((double) SIM_THREADS) / BLOCK_SIZE);
cuLaunchKernel(function,
gridSize, 1, 1,
BLOCK_SIZE, 1, 1,
0, null,
kernelParameters, null
);
cuCtxSynchronize();
System.out.println("GPU END");
}
cuMemcpyDtoH(Pointer.to(out), deviceOutput, out.length * Sizeof.INT);
}
}