1

So basically I took my C++ code (which is working correctly) and rewrote it in CUDA (I have no experience with CUDA). One part of the code (the solve() method) is not working correctly and I really don't know why.

So my question is: what exactly does the "unspecified launch failure" error during cudaMemcpy mean, and why is it happening in my code?

My second question is: why do the variables backup_ans and ans differ when they compute the same thing?

#include "stdio.h"
#include <algorithm>

__device__ unsigned int primes[1024];
__device__ long long n = 1ll<<32; // #unsigned_integers




// Map a 64-bit value to one of 1024 buckets. Bit 0 is dropped first, so
// consecutive odd inputs fall into consecutive buckets.
__device__ int hashh(long long x) {
      long long bucket = (x >> 1) % 1024;
      return (int)bucket;
}

// Modular exponentiation by squaring: returns (x^e) % n.
// NOTE(review): the intermediate products x*x and result*x must fit in
// 64 bits, so this is only safe while n < 2^32 — which holds for the
// 32-bit inputs used throughout this file.
__device__ unsigned long long mulmod(unsigned long long x,unsigned long long e,unsigned long long n) {
    unsigned long long result = 1;
    for(; e != 0; e >>= 1) {
        if(e & 1ull) {
            result = (result * x) % n;
        }
        x = (x * x) % n;
    }
    return result;
}

// One Miller-Rabin round: returns 1 if n is a strong probable prime to
// base a, 0 if a witnesses that n is composite.
// Precondition: n is ODD and n > 2 (n-1 must be even).
__device__ int is_SPRP(unsigned long long a,unsigned long long n) {
  // factor n-1 = t * 2^d with t odd
  unsigned long long t = n-1;
  int d = 0;
  for(; (t & 1ull) == 0; t >>= 1) ++d;
  // x = a^t mod n
  unsigned long long x = mulmod(a,t,n);
  if(x == 1) return 1;
  // pass if a^(t*2^i) == n-1 for some i in [0, d)
  int i = 0;
  while(i < d) {
      if(x == n-1) return 1;
      x = (x*x) % n;
      ++i;
  }
  return 0;
}
// Probabilistic primality test: runs one Miller-Rabin round on x with a
// base picked from the precomputed 'primes' table via a multiplicative
// hash of x. Nonzero means "probably prime", 0 means certainly composite.
__device__ int prime(long long x) {
        long long bucket = (((long long)0xAFF7B4*x)>>7)%1024;
        unsigned long long base = (unsigned long long)primes[bucket];
        return is_SPRP(base,(unsigned long long)x);
}

// Collect every odd unsigned 32-bit COMPOSITE integer that is not divisible
// by 3, 5 or 7, is rejected by prime() (one Miller-Rabin round), and whose
// hashh() value is 0. Survivors are appended to out[]; *c holds the running
// element count (advanced with atomicAdd).
// 335545 splits the scanned range [121, 2^32) evenly over the 400*32 threads
// this kernel is launched with -- the grid size is baked into the constant.
__global__ void find(unsigned int *out,unsigned int *c) {
    // per-thread staging buffer (automatic storage, so private to the thread);
    // results are flushed in bulk to amortize the atomic counter updates
    unsigned int buff[4096];
    int local_c = 0;
    // this thread scans odd numbers in [b, min(e, n))
    long long b = 121+(threadIdx.x+blockIdx.x*blockDim.x)*335545;
    long long e = b+335545;
    if(b%2==0) ++b;
    for(long long i=b;i<e && i<n;i+=2) {
        // skip multiples of small primes and anything prime() accepts;
        // prime() is probabilistic, so some composites are skipped here too
        if(i%3==0 || i%5==0 || i%7==0 || prime(i)) continue;
        if(hashh(i)==0) {
            buff[local_c++]=(unsigned int)i;
            if(local_c==4096) {
                // reserve a slot range in out[], then flush the buffer
                // NOTE(review): there is no bound check against out's
                // capacity (1<<23 as allocated in main) -- confirm the
                // total result count cannot exceed it
                int start = atomicAdd(c,local_c);
                // the inner 'i' intentionally shadows the outer loop variable
                for(int i=0;i<local_c;++i) out[i+start]=buff[i];
                local_c=0;
            }
        }
    }
    // flush whatever remains in the staging buffer
    int start = atomicAdd(c,local_c);
    for(int i=0;i<local_c;++i) out[i+start]=buff[i];
}

// For each base a in {2,...,32}, decide whether EVERY element of
// input[0..*count) fails is_SPRP to base a. Bit (a-2) of the result is set
// iff no element passes for that base. Each thread tests its slice of input
// against all 31 bases, stores its partial bitmask in backup[] (one word per
// thread, for host-side cross-checking), and the block-then-grid AND
// reduction of all partial masks is accumulated into *ans via atomicAnd.
// Preconditions (set up by main): *ans is initialized to 0xFFFFFFFF and
// blockDim.x <= 32 (the shared array s has exactly 32 slots).
__global__ void solve(unsigned int *input, unsigned int *count,unsigned int *backup, unsigned int *ans) {
      __shared__ unsigned int s[32];
    // split the *count elements into equal slices, one per thread
    unsigned int dif = (*count)/(blockDim.x*gridDim.x) +1;
    unsigned int b = (threadIdx.x+blockIdx.x*blockDim.x)*dif;
    // clamp the slice end to *count; a thread whose slice lies entirely past
    // the end runs an empty loop and reports every base as passing, which is
    // neutral for the AND reduction
    unsigned int e = b+dif>(*count)?(*count):b+dif;
    unsigned int mysol = 0;
    for(long long i = 2; i<33; ++i) {
          int sol = 1;
          // each thread doing its part
          for(unsigned int j = b; j<e ; ++j) {
              // if some element is an SPRP to base i, this base is ruled out
              if(is_SPRP((unsigned long long)i,(unsigned long long)input[j])!=0) {
              sol=0;
              break;
              }
          }
          // if every element of the slice failed, record base i in the mask
          if(sol==1) mysol|=1<<(i-2);
    }
    s[threadIdx.x] = mysol;
    // save the per-thread result so the host can re-reduce and cross-check
    backup[threadIdx.x+blockDim.x*blockIdx.x] = mysol;
    __syncthreads();
    // thread 0 AND-reduces the block's masks and folds them into *ans
    if(threadIdx.x==0) {
          unsigned int global_sol = ~0;
          for(int i=0;i<blockDim.x;++i) global_sol&=s[i];
          atomicAnd(ans,global_sol);
    }
}


int main(void) {
// launch configuration for solve()
const int blocks = 400;
const int threads = 32;

// per-bucket Miller-Rabin bases, uploaded into the __device__ 'primes' table
unsigned int prms[] = { 17, 11, 6, 60, 7, 13, 11, 34, 13, 2, 3, 37, 13, 11, 38, 2, 7, 105, 2, 7, 42, 11, 7, 3, 6, 15, 53, 44, 6, 6, 5, 15, 54, 7, 35, 10, 10, 15, 10, 10, 17, 17, 11, 10, 15, 43, 7, 5, 5, 3, 7, 43, 34, 2, 34, 2, 68, 53, 39, 10, 7, 6, 11, 2, 5, 2, 7, 2, 6, 5, 15, 40, 3, 5, 5, 2, 2, 10, 47, 13, 7, 43, 6, 7, 5, 6, 6, 13, 6, 35, 6, 15, 6, 13, 40, 10, 11, 2, 7, 2, 2, 3, 13, 3, 11, 15, 10, 5, 11, 14, 7, 11, 47, 5, 2, 2, 6, 2, 5, 55, 6, 5, 7, 2, 6, 58, 35, 11, 5, 12, 17, 6, 10, 12, 6, 6, 2, 53, 2, 2, 13, 5, 14, 7, 15, 6, 13, 62, 10, 6, 3, 7, 7, 3, 14, 5, 14, 73, 15, 11, 11, 6, 5, 17, 10, 5, 3, 37, 51, 10, 7, 5, 38, 12, 5, 11, 5, 7, 6, 5, 6, 40, 43, 57, 10, 13, 7, 15, 2, 10, 34, 7, 39, 10, 5, 3, 6, 13, 11, 5, 10, 43, 10, 5, 3, 14, 5, 2, 5, 41, 5, 39, 46, 2, 10, 2, 5, 12, 3, 2, 2, 5, 15, 43, 17, 41, 2, 13, 15, 38, 11, 11, 3, 34, 5, 6, 3, 7, 2, 37, 5, 6, 10, 17, 35, 2, 15, 6, 7, 5, 3, 13, 13, 12, 34, 2, 12, 10, 15, 13, 2, 2, 34, 6, 6, 5, 2, 7, 13, 3, 6, 11, 39, 42, 7, 2, 6, 39, 47, 3, 17, 5, 13, 7, 2, 47, 3, 7, 6, 11, 17, 37, 48, 7, 37, 11, 7, 10, 3, 14, 39, 14, 15, 43, 17, 2, 12, 7, 13, 5, 3, 6, 34, 37, 3, 17, 13, 2, 5, 10, 10, 44, 37, 2, 2, 10, 10, 7, 3, 7, 2, 7, 5, 43, 43, 11, 15, 51, 13, 17, 10, 11, 2, 5, 34, 17, 2, 2, 42, 6, 6, 5, 47, 15, 2, 12, 7, 3, 10, 15, 3, 7, 12, 12, 15, 43, 14, 7, 58, 13, 10, 6, 6, 38, 34, 5, 5, 13, 38, 6, 11, 10, 6, 7, 2, 55, 2, 13, 5, 11, 44, 15, 17, 2, 40, 2, 15, 13, 6, 2, 3, 3, 3, 3, 6, 39, 5, 11, 17, 37, 5, 7, 6, 10, 6, 12, 7, 5, 14, 10, 12, 71, 10, 35, 6, 11, 3, 2, 38, 3, 2, 34, 10, 17, 42, 2, 12, 6, 6, 11, 40, 12, 10, 6, 10, 2, 3, 3, 56, 11, 7, 42, 2, 38, 12, 2, 2, 13, 40, 12, 6, 5, 5, 59, 15, 38, 5, 5, 5, 7, 2, 10, 7, 2, 17, 10, 11, 6, 6, 6, 2, 10, 6, 54, 2, 82, 3, 34, 14, 15, 44, 5, 46, 2, 13, 5, 12, 13, 11, 10, 39, 5, 40, 3, 60, 3, 42, 11, 3, 46, 17, 3, 2, 37, 6, 42, 12, 14, 3, 12, 66, 13, 34, 7, 3, 13, 3, 11, 2, 13, 12, 38, 34, 5, 40, 10, 14, 6, 14, 11, 38, 58, 2, 48, 5, 15, 5, 73, 3, 37, 5, 11, 10, 5, 5, 13, 2, 10, 13, 
34, 17, 3, 7, 47, 2, 2, 10, 15, 3, 3, 13, 6, 34, 13, 10, 13, 3, 6, 41, 10, 6, 2, 6, 2, 6, 2, 6, 6, 37, 10, 44, 35, 13, 51, 2, 7, 53, 5, 40, 5, 2, 37, 11, 15, 11, 13, 2, 5, 2, 6, 10, 17, 15, 43, 39, 17, 2, 12, 10, 15, 17, 7, 13, 3, 7, 15, 37, 5, 15, 7, 6, 10, 51, 2, 2, 40, 61, 2, 13, 13, 11, 2, 5, 34, 5, 5, 7, 2, 2, 2, 11, 3, 6, 13, 6, 17, 11, 10, 7, 46, 15, 7, 14, 35, 11, 7, 10, 6, 11, 40, 11, 2, 39, 7, 6, 66, 5, 3, 6, 5, 11, 10, 2, 10, 7, 13, 2, 45, 34, 6, 35, 2, 11, 5, 59, 75, 10, 17, 14, 17, 17, 17, 2, 11, 7, 10, 6, 11, 6, 56, 34, 35, 11, 14, 12, 41, 40, 17, 40, 3, 11, 7, 37, 14, 7, 13, 7, 5, 2, 10, 6, 39, 2, 7, 37, 35, 10, 5, 15, 2, 7, 38, 34, 11, 17, 5, 6, 10, 3, 6, 7, 7, 43, 14, 2, 43, 3, 2, 47, 7, 35, 7, 3, 53, 2, 10, 10, 10, 60, 10, 6, 2, 6, 10, 5, 7, 57, 53, 13, 3, 35, 38, 15, 42, 3, 3, 12, 2, 10, 3, 38, 54, 13, 10, 11, 7, 13, 7, 2, 12, 39, 10, 54, 2, 12, 38, 10, 12, 12, 5, 15, 6, 10, 13, 5, 15, 10, 13, 6, 41, 40, 14, 12, 10, 11, 40, 5, 11, 10, 2, 5, 2, 13, 6, 2, 13, 5, 2, 10, 15, 5, 5, 10, 34, 13, 2, 5, 14, 5, 6, 5, 13, 3, 43, 6, 13, 11, 50, 3, 6, 6, 12, 15, 11, 37, 7, 69, 11, 14, 14, 7, 43, 5, 35, 11, 35, 11, 11, 34, 34, 39, 14, 11, 2, 10, 53, 6, 11, 2, 11, 60, 39, 11, 6, 15, 40, 17, 47, 34, 50, 7, 59, 47, 5, 13, 39, 5, 6, 53, 10, 14, 5, 51, 5, 7, 5, 6, 77, 7, 12, 7, 42, 2, 5, 2, 6, 60, 10, 13, 10, 6, 47, 6, 15, 17, 10, 11, 10, 12, 7, 7, 10, 17, 34, 5, 10, 7, 7, 2, 6, 10, 38, 2, 15, 6, 13, 7, 13, 2, 3, 13, 5, 3, 17, 2, 5, 15, 11, 39, 7, 39, 10, 10, 2, 6, 13, 3, 5, 17, 6, 14, 10, 37, 44, 3, 34, 5, 11, 7, 12, 2, 5, 3, 12, 3, 2, 3, 133, 12, 2, 2, 2, 3, 34, 14, 41, 2, 37, 11, 2, 6, 11, 6, 7, 15, 11, 35, 13, 6, 5, 2, 14, 7, 2 };

printf("primes_copy: %s\n",cudaGetErrorString(cudaMemcpyToSymbol(primes,prms,1024*4)));

/*-----*/

// allocate the result buffer (room for 1<<23 elements) and counter
unsigned int *dev_input,*dev_count;
printf("alloc_input: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_input,sizeof(unsigned int)*(1<<23))));
printf("alloc_count: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_count,sizeof(unsigned int))));
printf("memset_count: %s\n",cudaGetErrorString(cudaMemset(dev_count,0,sizeof(unsigned int))));
find<<<400,32>>>(dev_input,dev_count);
// a kernel launch returns no status directly: configuration errors surface
// via cudaGetLastError(), in-kernel faults via the next synchronizing call
printf("find_launch: %s\n",cudaGetErrorString(cudaGetLastError()));
printf("find_sync: %s\n",cudaGetErrorString(cudaDeviceSynchronize()));

unsigned int count;
printf("copy_count: %s\n",cudaGetErrorString(cudaMemcpy(&count,dev_count,sizeof(unsigned int),cudaMemcpyDeviceToHost)));

// sort found elements just to make debugging easier, it is not necessary
// BUG FIX: size the host buffer by 'count' -- the previous fixed
// 1000000-element allocation overflowed whenever find() produced more
// results than that (dev_input can hold up to 1<<23 of them)
unsigned int *backup_numbers = new unsigned int[count];
printf("copy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup_numbers,dev_input,sizeof(unsigned int)*count,cudaMemcpyDeviceToHost)));
std::sort(backup_numbers,backup_numbers+count);
printf("copy_S_backup: %s\n",cudaGetErrorString(cudaMemcpy(dev_input,backup_numbers,sizeof(unsigned int)*count,cudaMemcpyHostToDevice)));
delete[] backup_numbers;

printf("\nsize: %u\n",count);

// per-thread result buffer and the globally AND-reduced answer
unsigned int *dev_backup, *dev_ans;
printf("malloc_backup: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_backup,sizeof(unsigned int)*blocks*threads)));
printf("malloc_ans: %s\n",cudaGetErrorString(cudaMalloc((void**)&dev_ans,sizeof(unsigned int))));
// solve() AND-reduces into *ans, so it must start as all ones
printf("memset_ans: %s\n",cudaGetErrorString(cudaMemset(dev_ans,0xFF,sizeof(unsigned int))));

solve<<<blocks,threads>>>(dev_input,dev_count,dev_backup,dev_ans);
printf("solve_launch: %s\n",cudaGetErrorString(cudaGetLastError()));
printf("solve_sync: %s\n",cudaGetErrorString(cudaDeviceSynchronize()));

unsigned int ans,*backup;
printf("memcpy_ans: %s\n",cudaGetErrorString(cudaMemcpy(&ans,dev_ans,sizeof(unsigned int),cudaMemcpyDeviceToHost)));
backup = new unsigned int[blocks*threads];
printf("memcpy_backup: %s\n",cudaGetErrorString(cudaMemcpy(backup,dev_backup,sizeof(unsigned int)*blocks*threads,cudaMemcpyDeviceToHost)));
unsigned int backup_ans = ~0;

// recompute the global result on the host from the per-thread results;
// backup_ans and ans must agree if the kernel ran correctly
for(int i=0;i<threads*blocks;++i) backup_ans&=backup[i];
printf("ans: %u\nbackup_ans %u\n",ans,backup_ans);
printf("%u\n",backup[48]);

delete[] backup;
cudaFree(dev_ans);
cudaFree(dev_backup);
cudaFree(dev_count);
cudaFree(dev_input);
}

All the code except the solve() method works as intended. The solve() method just computes garbage (because backup_ans and ans differ), and it also gives me the "unspecified launch failure" error on the last two cudaMemcpy calls. When I run solve<<<1,1>>>(...) I get

ans: 134816642 backup_ans 432501552

but when i run solve<<<400,32>>>(...) it gives me

ans: 134816642 backup_ans 0 (correct answer should be 0)

In all situations it should compute backup_ans=ans=0

Any advice what i am doing wrong would be helpful.

Code for generating primes.bin

#include <cstdlib>
#include <stdio.h>
using namespace std;

const unsigned long long n = 1ll<<32;
const int buffer_size = 2000000;

typedef unsigned char uch;
typedef unsigned int uint;
typedef unsigned long long ull;

uch *primes;

// Bitset lookup: nonzero iff x is prime according to the sieve.
// Only 2 and odd numbers are represented; each byte of 'primes' packs the
// flags for 8 consecutive odd numbers, where a SET bit means composite.
int prime(long long x) {
    if(x == 2) return 1;
    if((x & 1) == 0) return 0;
    long long byte_pos = x / 16;
    long long bit_pos = (x & 15) >> 1;
    return (~primes[byte_pos]) & (1 << bit_pos);
}
// Sieve of Eratosthenes over the odd-number bitset 'primes': sets the bit of
// every odd composite below n (a set bit marks a composite).
void eratosten_sieve(void) {
  for(long long i = 3; i * i < n; ++i) {
      if(!prime(i)) continue;
      // mark odd multiples of i starting at i*i (even ones are not stored,
      // hence the stride of 2*i)
      for(long long j = i * i; j < n; j += 2 * i) {
          primes[j / 16] |= (1 << ((j & 15) >> 1));
      }
  }
}

int main(void) {
// one bit per odd number below n; zero-initialize (all "prime" initially)
primes = new uch[(n/16)+1];
for(long long i=0;i<(long long)(n/16)+1;++i) primes[i]=0;
printf("generating\n");
eratosten_sieve(); 
// number of bytes the bitset occupies
long long l = (long long)(n/16) + 1;
printf("writing\n");
FILE *f = fopen("primes.bin","wb");
// BUG FIX: the original never checked fopen/fwrite, so a failure silently
// produced a missing or truncated primes.bin
if(f == NULL) {
    perror("fopen primes.bin");
    delete[] primes;
    return 1;
}
if(fwrite(primes,1,(size_t)l,f) != (size_t)l) {
    fprintf(stderr,"short write to primes.bin\n");
    fclose(f);
    delete[] primes;
    return 1;
}
fclose(f);
printf("done\n");
delete[] primes;
return 0;
}

PS: i am compiling it by nvcc -arch compute_11

CUDA Driver Version / Runtime Version          5.5 / 5.5
CUDA Capability Major/Minor version number:    1.1
Total amount of global memory:                 1023 MBytes (1073020928 bytes)
(14) Multiprocessors, (  8) CUDA Cores/MP:     112 CUDA Cores
GPU Clock rate:                                1500 MHz (1.50 GHz)
Memory Clock rate:                             900 Mhz
Memory Bus Width:                              256-bit
Maximum Texture Dimension Size (x,y,z)         1D=(8192), 2D=(65536, 32768), 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers  1D=(8192), 512 layers
Maximum Layered 2D Texture Size, (num) layers  2D=(8192, 8192), 512 layers
Total amount of constant memory:               65536 bytes
Total amount of shared memory per block:       16384 bytes
Total number of registers available per block: 8192
Warp size:                                     32
Maximum number of threads per multiprocessor:  768
Maximum number of threads per block:           512
Max dimension size of a thread block (x,y,z): (512, 512, 64)
Max dimension size of a grid size    (x,y,z): (65535, 65535, 1)
Maximum memory pitch:                          2147483647 bytes
Texture alignment:                             256 bytes
Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
Run time limit on kernels:                     Yes
Integrated GPU sharing Host Memory:            No
Support host page-locked memory mapping:       Yes
Alignment requirement for Surfaces:            Yes
Device has ECC support:                        Disabled
Device supports Unified Addressing (UVA):      No
Device PCI Bus ID / PCI location ID:           1 / 0
Compute Mode:
  < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.5, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce 9800 GT
Result = PASS
user3390078
  • 75
  • 1
  • 1
  • 4
  • The unspecified launch failure on the `cudaMemcpy` call after the kernel call is coming from the kernel call. Your kernel is doing some kind of illegal access or some other illegal operation. (You can do proper cuda error checking on the kernel call). You might want to run your code with `cuda-memcheck` to get some additional insight into why the kernel is failing. – Robert Crovella Mar 06 '14 at 22:19
  • I tried running the code you have posted (after generating primes.bin) and I got no errors on cc2.0 device. Output looks like [this](http://pastebin.com/zL6JcF7e). Puzzling. – Robert Crovella Mar 06 '14 at 22:29
  • How much memory is on that 9800 GT? Are you also hosting a display on it? What is the operating system? – Robert Crovella Mar 06 '14 at 22:50
  • 1
    Thanks for the answer, your output is OK which is nice. I updated deviceQuery and i am using kubuntu 12.04. I did the memcheck and propper error checking using cudaPeekAtLastError() after the kernell call and got "out of memory" and some other stuff. http://pastebin.com/0xjwvHu5 . – user3390078 Mar 06 '14 at 23:22

1 Answers1

1

OK, you are out of memory. It took me a while to figure out because I was not thinking about the large static allocation:

__device__ unsigned char primes[(1<<28)+1];

Normally when folks are out of memory, they discover it on a cudaMalloc operation. In your case, your GPU has 1GB of memory, and I am guessing you are also hosting a display on it (you didn't answer that question). Take a look at how much free memory there is in the nvidia-smi -a output, it will look something like this:

FB Memory Usage
    Total                       : 1535 MiB
    Used                        : 3 MiB
    Free                        : 1532 MiB

Your numbers will be smaller - the Free line is what we care about.

Your dynamic allocations (ie. from cudaMalloc) are allocating about 350MB. But the kernel launch brings the static allocation into play, and then your total footprint rises to over 700MB (2^28 is over 250MB). If you have a display running on that GPU, it will consume some of the 1GB of memory, leaving you with not enough to run a kernel that requires 700MB.

If you want to run on that GPU, see if you can pare your problem size down somehow.

And it's always good to do proper cuda error checking, but apart from this issue, your code seems to run with no errors for me on devices with more memory.

Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • 1
    Memory Usage Total : 1023 MB Used : 186 MB Free : 837 MB This is enough in my opinion. Anyway i stopped using that 256MB bitset and reduced it to 4kB. But now i am getting "the launch timed out and was terminated". I have updated the code, you could try if it is working for you. (the first terminate is result of first cudaDeviceSynchronize() – user3390078 Mar 07 '14 at 00:09
  • launch timed out and was terminated is due to running the display on the same GPU that you are running CUDA on (you still haven't confirmed this, but I assume it is the case). I won't be able to reproduce that conveniently. If you want to learn some ways to work around that, read [this](http://nvidia.custhelp.com/app/answers/detail/a_id/3029/~/using-cuda-and-x). Anyway your original problem I believe was due to running out of memory, confirmed by both the kernel fail error code (2) and the fact that it goes away when you reduce the memory footprint. – Robert Crovella Mar 07 '14 at 04:26