Bad data coming from cudaMemcpy2D

Question

If this sort of question has already been asked, I apologize; please link me to the thread!

Anyhow I am new to CUDA (I'm coming from OpenCL) and wanted to try generating an image with it. The relevant CUDA code is:

// Kernel: each thread writes one RGBA pixel (bytes 0, 255, 0, 255) into a
// pitched image buffer.
//   pixels - base pointer of the image buffer
//   pitch  - distance in bytes between the starts of consecutive rows
//   width  - image width in pixels (4 bytes per pixel)
//   height - image height in rows
// Threads whose coordinates fall outside width x height return without writing.
// NOTE(review): blockDim.x is used as the block extent for BOTH axes below, so
// the y coordinate is only right when the block is square.
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
  unsigned block_size = blockDim.x;
  // Pixel coordinates of this block's first thread.
  uint2 location = {blockIdx.x*block_size, blockIdx.y*block_size};
  // This thread's offset inside its block.
  ulong2 pixel_location = {threadIdx.x, threadIdx.y};
  // Coordinates of the pixel this thread is responsible for.
  ulong2 real_location = {location.x + pixel_location.x, location.y + pixel_location.y};
  // The grid may overhang the image; threads past the edge do nothing.
  if (real_location.x >= width || real_location.y >= height)
    return;
  // Rows are pitch bytes apart, so the row start is computed with byte arithmetic.
  uint8_t *row = (uint8_t *)((char *)pixels + real_location.y * pitch);
  // Four consecutive bytes per pixel.
  row[real_location.x * 4+0] = 0;
  row[real_location.x * 4+1] = 255;
  row[real_location.x * 4+2] = 0;
  row[real_location.x * 4+3] = 255;
}

// Holds the status of the most recent call wrapped in CUDA_ERR.
cudaError_t err = cudaSuccess;

// CUDA_ERR(e): evaluates e, stores the result in the global err and, if it is
// not cudaSuccess, prints a message containing cudaGetErrorString(err) to
// stderr and exits with status -1.
// NOTE(review): the message text always says "allocate device vector A",
// whichever call failed. The macro also expands to a bare if statement.
#define CUDA_ERR(e) \
  if ((err = e) != cudaSuccess) { \
    fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err)); \
    exit(-1); \
  }


// Allocates a pitched 1000x1000 RGBA device image, launches the kernel on it,
// copies the result into a packed host buffer and writes it to out.png.
int main(void) {
  ulong2 dims = {1000, 1000};
  // NOTE(review): the accepted answer below points out that 500 x 500 threads
  // per block exceeds the 1024-threads-per-block limit, so the launch fails.
  unsigned long block_size = 500;
  dim3 threads_per_block(block_size, block_size);
  // Round the block count up on each axis when the image size is not a
  // multiple of the block size.
  dim3 remainders(dims.x % threads_per_block.x, dims.y % threads_per_block.y);
  dim3 blocks(dims.x / threads_per_block.x + (remainders.x == 0 ? 0 : 1), dims.y / threads_per_block.y + (remainders.y == 0 ? 0 : 1));

  size_t pitch;
  uint8_t *pixels, *h_pixels = NULL;
  // Each row is requested as dims.x * 4 bytes; the row stride comes back in pitch.
  CUDA_ERR(cudaMallocPitch(&pixels, &pitch, dims.x * 4 * sizeof(uint8_t), dims.y));
  // The launch result is not checked here.
  mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);

  // Packed host image, zeroed before the copy.
  h_pixels = (uint8_t *)malloc(dims.x * 4 * sizeof(uint8_t) * dims.y);
  memset(h_pixels, 0, dims.x * 4 * sizeof(uint8_t) * dims.y);
  // NOTE(review): per the accepted answer below, the 5th argument (width) must
  // be given in bytes; dims.x is the width in pixels.
  CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));

  save_png("out.png", h_pixels, dims.x, dims.y);

  CUDA_ERR(cudaFree(pixels));
  free(h_pixels);

  CUDA_ERR(cudaDeviceReset());
  puts("Success");
  return 0;
}

The save_png function is a usual utility function I created for taking a block of data and saving it to a png:

void save_png(const char *filename, uint8_t *buffer, unsigned long width, unsigned long height) {
  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
  if (!png_ptr) {
    std::cerr << "Failed to create png write struct" << std::endl;
    return;
  }
  png_infop info_ptr = png_create_info_struct(png_ptr);
  if (!info_ptr) {
    std::cerr << "Failed to create info_ptr" << std::endl;
    png_destroy_write_struct(&png_ptr, NULL);
    return;
  }
  FILE *fp = fopen(filename, "wb");
  if (!fp) {
    std::cerr << "Failed to open " << filename << " for writing" << std::endl;
    png_destroy_write_struct(&png_ptr, &info_ptr);
    return;
  }
  if (setjmp(png_jmpbuf(png_ptr))) {
    png_destroy_write_struct(&png_ptr, &info_ptr);
    std::cerr << "Error from libpng!" << std::endl;
    return;
  }
  png_init_io(png_ptr, fp);
  png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGBA, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
  png_write_info(png_ptr, info_ptr);
  png_byte *row_pnts[height];
  size_t i;
  for (i = 0; i < height; i++) {
    row_pnts[i] = buffer + width * 4 * i;
  }
  png_write_image(png_ptr, row_pnts);
  png_write_end(png_ptr, info_ptr);
  png_destroy_write_struct(&png_ptr, &info_ptr);
  fclose(fp);
}

Anyways the image that's generated is a weird whiteish strip that's speckled with random colored pixels which can be seen here.

Is there something glaring I did wrong? I tried to follow the introduction documentation on the CUDA site. Otherwise can anyone help me out to fix this? Here I'm simply trying to fill the pixels buffer with green pixels.

I am using a MBP retina with an NVIDIA GeForce GT 650M discrete graphics card. I can run and paste the output to print_devices from the cuda sample code if need be.

EDIT: Note no errors or warnings during compilation with the following makefile:

# Compile the CUDA source to an object file, then link it against libpng.
all:
    nvcc -c mandlebrot.cu -o mandlebrot.cu.o
    nvcc mandlebrot.cu.o -o mandlebrot -lpng

and no errors at runtime.

score 1 · Accepted Answer · edited May 23 '17 at 12:13

It's better if you provide a complete code that someone can copy, paste, compile, and run, without adding anything or changing anything. Stripping off the include headers isn't helpful, in my opinion, and making your test code dependent on a png library that others may not have is also not productive, if you want help.

Your error checking on kernel launches is broken. You may want to review proper cuda error checking. If you had proper error checking, or ran your code with cuda-memcheck, you would discover an error 9 on the kernel launch. This is an invalid configuration. If you print out your blocks and threads_per_block variables, you'll see something like this:

blocks: 2, 2
threads: 500, 500

You are in fact setting threads per block to 500,500 here:

unsigned long block_size = 500;
dim3 threads_per_block(block_size, block_size);

That is illegal, as you are requesting 500x500 threads per block (i.e. 250000 threads) which exceeds the maximum limit of 1024 threads per block.

So your kernel is not running at all and you're getting garbage.

You can fix this error pretty simply by changing your block_size definition:

unsigned long block_size = 16;

After that there is still an issue, as you've misinterpreted the parameters for cudaMemcpy2D:

CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));

The documentation states for the 5th parameter:

width - Width of matrix transfer (columns in bytes)

but you've passed the width in elements (groups of 4 bytes) rather than bytes.

This will fix that:

CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x*4, dims.y, cudaMemcpyDeviceToHost));

With the above changes, I was able to get good results with a test version of your code:

#include <stdio.h>
#include <stdint.h>

/*
 * Kernel: paints one RGBA pixel per thread as opaque green (0, 255, 0, 255)
 * in a pitched image buffer.
 *
 *   pixels - base pointer of the image buffer
 *   pitch  - distance in bytes between the starts of consecutive rows
 *   width  - image width in pixels (4 bytes per pixel)
 *   height - image height in rows
 *
 * Launch layout: a 2D grid of 2D blocks whose combined extent covers at least
 * width x height threads. Blocks need not be square, and threads that fall
 * outside the image return without writing. Uses no shared memory.
 */
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
  // Each axis uses its OWN block extent (blockDim.x for x, blockDim.y for y),
  // so rectangular blocks address the correct rows. The index is widened
  // before the multiply so it cannot wrap in 32-bit arithmetic.
  const unsigned long x = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned long y = (unsigned long)blockIdx.y * blockDim.y + threadIdx.y;

  // The grid usually overhangs the image; ignore the surplus threads.
  if (x >= width || y >= height)
    return;

  // Rows are pitch BYTES apart; pixels is a byte pointer, so plain pointer
  // arithmetic gives the row start.
  uint8_t *px = pixels + y * pitch + x * 4;
  px[0] = 0;    // R
  px[1] = 255;  // G
  px[2] = 0;    // B
  px[3] = 255;  // A
}

/* Status of the most recent call wrapped in CUDA_ERR. */
cudaError_t err = cudaSuccess;

/*
 * CUDA_ERR(e): evaluates the CUDA runtime call e exactly once and stores its
 * status in the global err. If the status is not cudaSuccess, it prints the
 * source location, the failing expression and the runtime's error string to
 * stderr, then terminates the process with exit(-1).
 *
 * The do { ... } while (0) wrapper makes the macro behave as a single
 * statement, so `if (c) CUDA_ERR(x); else ...` parses as intended. The
 * argument is parenthesised so that any expression can be passed.
 */
#define CUDA_ERR(e) \
  do { \
    if ((err = (e)) != cudaSuccess) { \
      fprintf(stderr, "CUDA call failed at %s:%d: %s (error code %s)!\n", __FILE__, __LINE__, #e, cudaGetErrorString(err)); \
      exit(-1); \
    } \
  } while (0)

/*
 * Test driver: fills a 1000x1000 RGBA image with opaque green on the GPU,
 * copies it back into a tightly packed host buffer and checks every byte.
 *
 * Returns 0 and prints "Success" when every pixel matches. Returns 1 after
 * reporting the first mismatching byte, or when the host allocation fails.
 * Any failing CUDA call terminates the process through CUDA_ERR.
 */
int main(void) {
  const ulong2 dims = {1000, 1000};
  // Bytes in one tightly packed host row (4 bytes per pixel).
  const size_t row_bytes = dims.x * 4 * sizeof(uint8_t);

  // 16 x 16 = 256 threads per block, within the 1024-threads-per-block limit.
  dim3 threads_per_block(16, 16);
  // Ceiling division: the grid must cover the image even when its size is not
  // a multiple of the block size. The kernel bounds-checks the overhang.
  dim3 blocks((dims.x + threads_per_block.x - 1) / threads_per_block.x,
              (dims.y + threads_per_block.y - 1) / threads_per_block.y);

  size_t pitch;
  uint8_t *pixels = NULL, *h_pixels = NULL;
  CUDA_ERR(cudaMallocPitch(&pixels, &pitch, row_bytes, dims.y));

  printf("blocks: %u, %u\n", blocks.x, blocks.y);
  printf("threads: %u, %u\n", threads_per_block.x, threads_per_block.y);
  mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
  // A kernel launch returns no status itself: an invalid configuration shows
  // up in cudaGetLastError(), and faults during execution show up at the next
  // synchronising call.
  CUDA_ERR(cudaGetLastError());
  CUDA_ERR(cudaDeviceSynchronize());

  h_pixels = (uint8_t *)malloc(row_bytes * dims.y);
  if (h_pixels == NULL) {
    fprintf(stderr, "Failed to allocate %lu bytes of host memory\n", (unsigned long)(row_bytes * dims.y));
    CUDA_ERR(cudaFree(pixels));
    return 1;
  }
  // Zero the buffer so that an incomplete copy is detected by the check below.
  memset(h_pixels, 0, row_bytes * dims.y);
  // The 5th argument is the width of each row to transfer, in BYTES.
  CUDA_ERR(cudaMemcpy2D(h_pixels, row_bytes, pixels, pitch, row_bytes, dims.y, cudaMemcpyDeviceToHost));

//  save_png("out.png", h_pixels, dims.x, dims.y);

  // Every pixel must be opaque green. Stop at the first byte that differs.
  static const uint8_t expected[4] = {0, 255, 0, 255};
  int status = 0;
  for (size_t row = 0; row < dims.y && status == 0; row++) {
    for (size_t col = 0; col < dims.x && status == 0; col++) {
      const uint8_t *px = h_pixels + row * row_bytes + col * 4;
      for (int c = 0; c < 4; c++) {
        if (px[c] != expected[c]) {
          // Report the same byte that was compared.
          printf("mismatch %d at %lu,%lu: was: %u should be: %u\n", c, (unsigned long)row, (unsigned long)col, (unsigned)px[c], (unsigned)expected[c]);
          status = 1;
          break;
        }
      }
    }
  }

  // Release both buffers on the failure path as well as the success path.
  CUDA_ERR(cudaFree(pixels));
  free(h_pixels);

  CUDA_ERR(cudaDeviceReset());
  if (status != 0)
    return status;
  puts("Success");
  return 0;
}

Note the above code is a complete code you can copy, paste, compile and run.

Aha right! Thank you - I totally forgot about the `cudaPeekAtLastError` and thanks for the tip to just paste the entire code in a block. I was not aware of `cuda-memcheck`, thanks for pointing that out. I will fix the issues that you pointed out and mark this as the answer if everything is fixed! — DanZimm, Sep 23 '14 at 19:47
I've pasted a complete code in my answer now that fixes everything I found. — Robert Crovella, Sep 23 '14 at 20:10
Thanks a bunch! Didn't realize I was improperly using `cudaMemcpy2D`. Thanks for taking such effort to help me out! — DanZimm, Sep 23 '14 at 20:48

Bad data coming from cudaMemcpy2D

1 Answers1

Linked