
I wrote a vector addition kernel and ran it on a single GPU and on multiple GPUs. However, for the same vector length, the multi-GPU case is much slower than the single-GPU case.

The structure of my code is one context, one kernel, and multiple command queues (one per device). How can I modify it to make the multi-GPU case faster?

The code is below:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <CL/cl.h>
#include <math.h>

//#define VECTOR_SIZE 640000
//#define LOCAL_SIZE 64

#define CHECK_ERROR(err) \
  if (err != CL_SUCCESS) { \
    printf("[%s:%d] OpenCL error %d\n", __FILE__, __LINE__, err); \
    exit(EXIT_FAILURE); \
  }

double get_time() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return (double)tv.tv_sec + (double)1e-6 * tv.tv_usec;
}

char *get_source_code(const char *file_name, size_t *len) {
  char *source_code;
  size_t length;
  FILE *file = fopen(file_name, "r");
  if (file == NULL) {
    printf("[%s:%d] Failed to open %s\n", __FILE__, __LINE__, file_name);
    exit(EXIT_FAILURE);
  }

  fseek(file, 0, SEEK_END);
  length = (size_t)ftell(file);
  rewind(file);

  source_code = (char *)malloc(length + 1);
  fread(source_code, length, 1, file);
  source_code[length] = '\0';

  fclose(file);

  *len = length;
  return source_code;
}

int main() {
  // OpenCL variables
  cl_platform_id platform;
  //cl_device_id device;
  cl_device_id *devices;
  cl_device_id device_temp;

  cl_context context;
  //cl_command_queue queue;
  cl_command_queue *queues;

  cl_mem bufferA, bufferB, bufferC;
  cl_program program;
  char *kernel_source;
  size_t kernel_source_size;
  
  cl_kernel kernel;
  //cl_kernel *kernels;

  cl_int err;

  //
  
  
  size_t VECTOR_SIZE = 64000000;
  int num_devices = 4;
  size_t LOCAL_SIZE = 64;
  
  // Time variables
  double start;
  double end;

  // Get platform
  err = clGetPlatformIDs(1, &platform, NULL);
  CHECK_ERROR(err);

  // Get GPU device
  
  devices = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
  //err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
  CHECK_ERROR(err);

  // Create context
  context = clCreateContext(NULL,num_devices, devices , NULL, NULL , &err);
  //context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Get kernel code
  kernel_source = get_source_code("kernel.cl", &kernel_source_size);

  // Create program
  program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source,
    &kernel_source_size, &err);
  CHECK_ERROR(err);

  // Build program
  err = clBuildProgram(program, num_devices, devices, "", NULL, NULL);
  
  if(err == CL_BUILD_PROGRAM_FAILURE) {
    size_t log_size;
    char *log;

    // Get program build
    //err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
    //  0, NULL, &log_size);
    err = clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,0,NULL,&log_size);
    CHECK_ERROR(err);
    
    // Get build log
    log = (char*)malloc(log_size + 1);
    //err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
    //  log_size, log, NULL);
    err = clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,log_size,log,NULL);
    CHECK_ERROR(err);

    log[log_size] = '\0';
    printf("Compiler error : \n%s\n", log);
    free(log);
    exit(EXIT_FAILURE);
  }
  CHECK_ERROR(err);
  // Create Vector A, B, C
  float *A = (float*)malloc(sizeof(float) * VECTOR_SIZE);
  float *B = (float*)malloc(sizeof(float) * VECTOR_SIZE);
  float *C = (float*)malloc(sizeof(float) * VECTOR_SIZE);

  // Initialize Vector A, B
  //cl_ushort idx;
  /*for(idx = 0; idx < VECTOR_SIZE; idx++) {
    A[idx] = rand() % 100;
    B[idx] = rand() % 100;
  }*/
  printf("start\n");
  start = get_time();
  for(int i = 0; i <VECTOR_SIZE; i++){
      A[i] = sinf(i)*sinf(i);
      B[i] = cosf(i)*cosf(i);
  }
  end = get_time();
  printf("Initialization time : %f seconds elapsed\n", end-start);
  
  
  // Create kernel
  /*kernels = (cl_kernel *) malloc(sizeof(cl_kernel)*num_devices);
  for(int i=0; i<num_devices; i++){
      kernels[i] = clCreateKernel(program,"vec_add", &err);
      CHECK_ERROR(err);
  }*/
  kernel = clCreateKernel(program, "vec_add", &err);
  CHECK_ERROR(err);

  // Create Buffer
  bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
  CHECK_ERROR(err);

  bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
  CHECK_ERROR(err);

  bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
  CHECK_ERROR(err);
  
  printf("error hi\n");
  // Create command-queue
  queues = (cl_command_queue *) malloc(sizeof(cl_command_queue)*num_devices);
  for(int i=0; i<num_devices; i++){
      // Both branches of the original if/else were identical, so one call suffices
      queues[i] = clCreateCommandQueue(context,devices[i],CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,&err);
      CHECK_ERROR(err);
  }
  printf("error bye\n");
 
  //queue = clCreateCommandQueue(context, device, 0, &err);
  //CHECK_ERROR(err);

  // Write Buffer
  for (int i = 0; i<num_devices; i++){
      err = clEnqueueWriteBuffer(queues[i],bufferA,CL_FALSE,0,sizeof(float)*VECTOR_SIZE,A,0,NULL,NULL);
      CHECK_ERROR(err);
      err = clEnqueueWriteBuffer(queues[i],bufferB,CL_FALSE,0,sizeof(float)*VECTOR_SIZE,B,0,NULL,NULL);
      CHECK_ERROR(err);
  }
  //err = clEnqueueWriteBuffer(queue, bufferA, CL_FALSE, 0, sizeof(float) * VECTOR_SIZE, A, 0, NULL, NULL);
  //CHECK_ERROR(err);
  //err = clEnqueueWriteBuffer(queue, bufferB, CL_FALSE, 0, sizeof(float) * VECTOR_SIZE, B, 0, NULL, NULL);
  //CHECK_ERROR(err);

  for(int i=0; i<num_devices; i++){
      err=clFinish(queues[i]);
      CHECK_ERROR(err);
  }


  // Set Kernel arguments
  start = get_time();
  /*for(int i=0; i<num_devices; i++){
      err=clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &bufferA);
      CHECK_ERROR(err);
      err=clSetKernelArg(kernels[i], 1, sizeof(cl_mem), &bufferB);
      CHECK_ERROR(err);
      err=clSetKernelArg(kernels[i], 2, sizeof(cl_mem), &bufferC);
      CHECK_ERROR(err);
      err=clSetKernelArg(kernels[i], 3, sizeof(unsigned int), &VECTOR_SIZE);
      CHECK_ERROR(err);
  }*/
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferC);
  CHECK_ERROR(err);
  cl_uint n = (cl_uint)VECTOR_SIZE; // the kernel expects an unsigned int, not a size_t
  err = clSetKernelArg(kernel, 3, sizeof(cl_uint), &n);
  CHECK_ERROR(err);

  end = get_time();

  printf("Send Vector A, B to GPU : %f seconds elapsed\n", end - start);

  for(int i=0; i<num_devices; i++){
      err=clFinish(queues[i]);
      CHECK_ERROR(err);
  }

  cl_event ooo_events[num_devices];
  start = get_time();
  // Execute Kernel
  size_t global_size = VECTOR_SIZE;
  size_t local_size = LOCAL_SIZE;
  for(int i=0; i<num_devices; i++){
      //start=get_time();
      
      err= clEnqueueNDRangeKernel(queues[i],kernel,1,NULL,&global_size,&local_size,0,NULL,NULL);
      CHECK_ERROR(err);
      //err = clEnqueueNDRangeKernel(queues[i],kernels[i],1,NULL,&global_size, &local_size,0,NULL,NULL);
      //CHECK_ERROR(err);
      //end=get_time();
      //printf("Calculate C : %f seconds elapsed\n", end-start);
  }
  //err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,&global_size, &local_size, 0, NULL, NULL);
  //CHECK_ERROR(err);
  for(int i=0; i<num_devices; i++){
      err=clFinish(queues[i]);
      CHECK_ERROR(err);
  }

  end = get_time();

  printf("Calculate C : %f seconds elapsed\n", end - start);

  // Read Buffer
  start = get_time();
  for(int i=0; i<num_devices; i++){
      err = clEnqueueReadBuffer(queues[i],bufferC,CL_TRUE,0,sizeof(float)*VECTOR_SIZE,C,0,NULL,NULL);
      CHECK_ERROR(err);
  }
  //err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, sizeof(float) * VECTOR_SIZE, C, 0, NULL, NULL);
  //CHECK_ERROR(err);

  end = get_time();
  printf("Receive C from GPU : %f seconds elapsed\n", end - start);

  // Evaluate Vector C
  start = get_time();
  double sum = 0;
  for(int i = 0; i < VECTOR_SIZE; i++) {
    sum += C[i];
  }
  end = get_time();
  printf("Verification time : %f seconds elapsed\n", end-start);
  printf("%lf, %ld \n", sum,VECTOR_SIZE);
  if (abs(VECTOR_SIZE - sum) < 1) {
    printf("Verification success!\n");
  }
  printf("Sum : %f\n", sum);

  // Release OpenCL object
  clReleaseMemObject(bufferA);
  clReleaseMemObject(bufferB);
  clReleaseMemObject(bufferC);
  free(A);
  free(B);
  free(C);
  clReleaseKernel(kernel);
  //clReleaseKernel(kernels[0]);
  //clReleaseKernel(kernels[1]);
  clReleaseProgram(program);
  
  for(int i=0; i<num_devices; i++){
      clReleaseCommandQueue(queues[i]);
  }
  //clReleaseCommandQueue(queue);
  clReleaseContext(context);

  return 0;
}
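
The kernel source file itself is not shown in the post. Based on the argument setup (two read-only float buffers, one write-only float buffer, and an unsigned int length), kernel.cl presumably looks something like this sketch:

__kernel void vec_add(__global const float *A, __global const float *B,
                      __global float *C, unsigned int n) {
    // Each work-item adds one pair of elements
    unsigned int i = get_global_id(0);
    if (i < n)
        C[i] = A[i] + B[i];
}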

  • Haven't touched GPUs for a while, but are you doing enough work to mask the overheads? – NNN Nov 16 '20 at 12:42
  • I didn't do anything special to mask overheads. Can you tell me about that? Actually, this is the first time I am running code on multiple GPUs. – Song Nov 16 '20 at 12:49
  • Basically, there is a computational cost to any parallel computing framework, which gets counted in the wall-clock time spent running the program. Your program needs to do enough work for you to see a performance improvement. – NNN Nov 16 '20 at 12:58
  • Thank you for your comment. Can you give me any tips, references, or samples for understanding that? – Song Nov 16 '20 at 13:18
  • Check out: https://computing.llnl.gov/tutorials/parallel_comp/#:~:text=Parallel%20Overhead%20The%20amount%20of,system%20-%20having%20many%20processing%20elements – NNN Nov 16 '20 at 13:39
  • I think double buffering or a non-blocking API is the answer for improving performance. – Song Nov 16 '20 at 13:46
  • However, how can I do that in OpenCL? – Song Nov 16 '20 at 13:46

1 Answer


Using multiple GPUs is only beneficial in terms of performance if the amount of computational work each GPU performs takes more time than the communication, scheduling, and synchronization overhead. This is true for a single GPU as well.

In your case, each GPU performs a simple vector addition, but that rarely takes more time than transferring the data to the GPU, waiting for the kernel to actually get scheduled for execution, etc.
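
As a side note, the posted code enqueues the full global range on every queue, so each GPU ends up computing the entire vector rather than a share of it. A minimal sketch of a per-device split (illustrative only; it assumes VECTOR_SIZE divides evenly by num_devices and an OpenCL 1.1+ runtime, since it uses a non-zero global work offset):

size_t chunk = VECTOR_SIZE / num_devices;  // slice handled by each device
for (int i = 0; i < num_devices; i++) {
    size_t offset = (size_t)i * chunk;
    cl_event wA, wB, deps[2];

    // Write only this device's slice of the inputs
    err = clEnqueueWriteBuffer(queues[i], bufferA, CL_FALSE, offset * sizeof(float),
                               chunk * sizeof(float), A + offset, 0, NULL, &wA);
    CHECK_ERROR(err);
    err = clEnqueueWriteBuffer(queues[i], bufferB, CL_FALSE, offset * sizeof(float),
                               chunk * sizeof(float), B + offset, 0, NULL, &wB);
    CHECK_ERROR(err);

    // Chain the kernel after both writes (required here because the queues are
    // out-of-order); the global work offset shifts get_global_id(0) into this
    // device's slice
    deps[0] = wA; deps[1] = wB;
    err = clEnqueueNDRangeKernel(queues[i], kernel, 1, &offset,
                                 &chunk, &local_size, 2, deps, NULL);
    CHECK_ERROR(err);
}

In practice you may also want one sub-buffer per device (clCreateSubBuffer) so that each device owns a distinct memory object, plus a matching offset read of C afterwards.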

Your code is measuring not only the kernel execution time but also this scheduling overhead.

I would advise you to use proper GPU profiling tools (depending on your GPU vendor) instead of manual CPU timings to properly examine what is going on. You can also try measuring kernel execution time via events.
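
For example, a minimal sketch of per-kernel timing with events (this assumes the queues are created with CL_QUEUE_PROFILING_ENABLE):

// Queues must be created with profiling enabled, e.g.:
// queues[i] = clCreateCommandQueue(context, devices[i], CL_QUEUE_PROFILING_ENABLE, &err);
cl_event ev;
err = clEnqueueNDRangeKernel(queues[i], kernel, 1, NULL,
                             &global_size, &local_size, 0, NULL, &ev);
CHECK_ERROR(err);
err = clWaitForEvents(1, &ev);
CHECK_ERROR(err);

// CL_PROFILING_COMMAND_START/END are device-side timestamps in nanoseconds,
// so their difference excludes host-side scheduling overhead
cl_ulong t_start, t_end;
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t_start, NULL);
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t_end, NULL);
printf("Device %d kernel time: %f ms\n", i, (t_end - t_start) * 1e-6);
clReleaseEvent(ev);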

Elad Maimoni