0

I'm getting a CUDA_ERROR_ILLEGAL_ADDRESS exception when trying to run a kernel used for calculating Buddhabrot fractal orbits.

extern "C"

// Buddhabrot orbit kernel: each thread iterates z -> z^2 + c for one sample
// point c = (inputR[i], inputI[i]). If the orbit escapes (|z|^2 > 4), the
// thread replays its stored orbit and increments the histogram cells the
// orbit visited.
//
// BUG (the cause of CUDA_ERROR_ILLEGAL_ADDRESS — see the answer below):
// outX/outY hold only 1000 entries, but the loop runs `iterations` times.
// With the MCVE's ITERATIONS = 10000 the writes outX[j]/outY[j] go out of
// bounds once j >= 1000.
// NOTE(review): there is also no `i < numPoints` guard, so the launch grid
// must exactly cover the input arrays (here 64*512 threads for 64*512
// inputs, which it does).
__global__ void exec(int iterations, int size,
                float* inputR,  float* inputI, // Real/Imaginary input
                int* output                    // Output image in one dimension
                ) {

    int i = blockIdx.x * blockDim.x + threadIdx.x; // flat global thread id

    float cR = inputR[i]; // this thread's sample point c
    float cI = inputI[i];

    float x = 0; // z starts at the origin
    float y = 0;

    float outX[1000]; // orbit history buffer — fixed capacity of 1000!
    float outY[1000];

    for (int j = 0; j < iterations; j++) {
        outX[j] = x; // out-of-bounds write when j >= 1000 (iterations > 1000)
        outY[j] = y;

        float xNew = (x * x) - (y * y) + cR; // z^2 + c, real part
        float yNew = (2 * x * y) + cI;       // z^2 + c, imaginary part

        if (xNew * xNew + yNew * yNew > 4) { // escaped: |z|^2 > 4
            // Replay the stored orbit (skipping the initial (0,0) at k = 0)
            // and bin each point of the square [-2,2]x[-2,2] into the
            // size x size histogram.
            for (int k = 1; k < j; k++) {
                int curX = (outX[k] + 2 ) * size / 4;
                int curY = (outY[k] + 2 ) * size / 4;

                int idx = curX + size * curY;

                // NOTE(review): non-atomic increment — concurrent threads can
                // race here and lose counts (atomicAdd would be exact). Also,
                // a stored point with coordinate exactly 2.0 would map to
                // curX/curY == size, indexing past the histogram — TODO confirm
                // whether that edge value can occur.
                output[idx]++; // <- exception here
            }
            return;
        }

        x = xNew;
        y = yNew;
    }
}

I've tried multiple things now and the error doesn't even seem to be stemming from the array, contrary to what I first thought. For example,

output[0] = 0;

will work just fine. However, when I tried to debug idx (remember, I first thought the error was related to the array), I found out that I can neither assign idx like so

output[0] = idx;

nor use it in a printf statement

if (i == 0) {
    printf("%d\n", idx);
}

I've tried the same with curX and curY which also refuse to work, however cR for example will work without any error. There seems to be a problem with the variables assigned inside the innermost loop (I also can't assign k), so I tried declaring idx outside of all loops at the start of the function, but to no avail. Still the same error.

Stack trace:

Exception in thread "main" jcuda.CudaException: CUDA_ERROR_ILLEGAL_ADDRESS
        at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:330)
        at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1938)
        at fractal.Buddhabrot.<init>(Buddhabrot.java:96)
        at controller.Controller.<init>(Controller.java:10)
        at Main.main(Main.java:8)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:497)
        at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)

Constants:

block size            512*1*1
grid size             64 *1*1
iterations            1000
size                  256
inputR, inputI length 64*512
output length         256*256

MCVE:

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.*;

import java.io.File;
import java.util.Random;

import static jcuda.driver.JCudaDriver.*;

// MCVE (JCuda host code) that reproduces the CUDA_ERROR_ILLEGAL_ADDRESS.
// Root cause (see the accepted answer): ITERATIONS = 10000 here, while the
// kernel's local orbit buffers outX/outY hold only 1000 entries, so the
// kernel writes out of bounds.
public class Stackoverflow {
    public static final int SIZE = 256;                    // histogram is SIZE x SIZE
    public static final long NUM_POINTS = 128 * 128 * 128; // total random samples
    public static final int ITERATIONS = 10000;            // BUG: exceeds kernel's 1000-entry buffers

    public static final int BLOCK_SIZE = 512;
    public static final int SIM_THREADS = BLOCK_SIZE * 64; // points handled per kernel launch

    public static final Random random = new Random();

    public static void main(String[] args) {
        File ptxFile = new File("Buddha.ptx"); // precompiled kernel module

        setExceptionsEnabled(true); // make JCuda throw CudaException on error codes
        cuInit(0);

        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);

        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);

        CUmodule module = new CUmodule();
        cuModuleLoad(module, ptxFile.getAbsolutePath());

        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "exec");

        cuCtxSetLimit(CUlimit.CU_LIMIT_PRINTF_FIFO_SIZE, 4096);

        float[] inR = new float[SIM_THREADS]; // per-launch sample coordinates
        float[] inI = new float[SIM_THREADS];

        int[] out = new int[SIZE * SIZE]; // host-side copy of the histogram

        CUdeviceptr deviceInputR = new CUdeviceptr();
        cuMemAlloc(deviceInputR, inR.length * Sizeof.FLOAT);
        CUdeviceptr deviceInputI = new CUdeviceptr();
        cuMemAlloc(deviceInputI, inI.length * Sizeof.FLOAT);

        CUdeviceptr deviceOutput = new CUdeviceptr();
        // NOTE(review): cuMemAlloc does not zero memory — the device histogram
        // starts with garbage; a cuMemsetD32 here would be needed for correct
        // counts (unrelated to the illegal-address crash).
        cuMemAlloc(deviceOutput, out.length * Sizeof.INT);

        for (long i = 0; i < NUM_POINTS; i += SIM_THREADS) {
            // Fresh random sample points in [-2, 2) x [-2, 2) for this launch.
            for (int j = 0; j < SIM_THREADS; j++) {
                inR[j] = random.nextFloat() * 4f - 2f;
                inI[j] = random.nextFloat() * 4f - 2f;
            }

            System.out.println("GPU START");

            cuMemcpyHtoD(deviceInputR, Pointer.to(inR), inR.length * Sizeof.FLOAT);
            cuMemcpyHtoD(deviceInputI, Pointer.to(inI), inI.length * Sizeof.FLOAT);

            // Matches the kernel signature:
            // exec(int iterations, int size, float* inputR, float* inputI, int* output)
            Pointer kernelParameters = Pointer.to(
                    Pointer.to(new int[]{ITERATIONS}),
                    Pointer.to(new int[]{SIZE}),
                    Pointer.to(deviceInputR),
                    Pointer.to(deviceInputI),
                    Pointer.to(deviceOutput)
            );

            int gridSize = (int) Math.ceil(((double) SIM_THREADS) / BLOCK_SIZE); // ceil-div

            cuLaunchKernel(function,
                    gridSize, 1, 1,
                    BLOCK_SIZE, 1, 1,
                    0, null,
                    kernelParameters, null
            );

            // Kernel launches are asynchronous; the in-kernel illegal address
            // surfaces here as a CudaException on synchronize (the line named
            // in the stack trace).
            cuCtxSynchronize();

            System.out.println("GPU END");
        }

        cuMemcpyDtoH(Pointer.to(out), deviceOutput, out.length * Sizeof.INT);
        // NOTE(review): device allocations and the context are never released
        // (cuMemFree / cuCtxDestroy); tolerable for an MCVE that exits here.
    }
}
Marv
  • 3,517
  • 2
  • 22
  • 47
  • Probably need to see a complete repro definition for this, not just the kernel code. Include a full code so someone else could compile/run easily, and need to know your exact compile command, as well as the platform you are running on (OS, CUDA version) and the specific GPU you are using. My initial attempt to build a reproducer around what you have shown does not result in any errors when I run it with synthetic data. My example is [here](http://pastebin.com/3vxeVMzm). Alternatively, if you can modify my example until it produces the failure, post that. – Robert Crovella May 27 '16 at 17:48
  • @RobertCrovella I've added a MCVE to the original question. This is CUDA 7.5.18 on Windows 10 using a NVIDIA GeForce GTX 970. – Marv May 27 '16 at 18:56

1 Answer

3

In your "constants" section you had indicated this:

iterations            1000

but in your java code (after you provided the MCVE) you have this:

public static final int ITERATIONS = 10000;

That can clearly cause this section of your kernel code to break:

float outX[1000];
float outY[1000];

for (int j = 0; j < iterations; j++) {
    outX[j] = x;
    outY[j] = y;

since 10000 for iterations is indexing out of bounds. (The extent of this loop is actually data dependent, but for some data input patterns, the loop will traverse past 1000, as written).

When I change this:

public static final int ITERATIONS = 10000;

to this:

public static final int ITERATIONS = 1000;

your code runs correctly for me:

$ cuda-memcheck java -cp ".:jcuda-0.7.5b.jar" so1
========= CUDA-MEMCHECK
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
========= ERROR SUMMARY: 0 errors
$
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Before posting I was so very sure I've fixed this issue. I've noticed it as well and thought I tested it. Thank you. What seems to happen is that accessing `outX[k]` or `outY[k]` throws the error when `k >= 1000`, however when `curX`, `curY` or `idx` are not accessed in or after the loop, the compiler optimizes and does not calculate them. – Marv May 28 '16 at 07:46
  • 1
    That's correct. Your debugging technique has some value, but it also suffers from the fact that as you change the code, the compiler may optimize out things that no longer are required. This can make the process confusing. Instead, you may wish to use tools like `cuda-memcheck` and the methodology discussed [here](http://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218). That is what I used. – Robert Crovella May 28 '16 at 12:52
  • Good stuff, thank you. `cuda-memcheck` is also described [here](http://www.jcuda.org/debugging/Debugging.html) for `JCuda` for future reference. – Marv May 28 '16 at 12:57