
We have a problem with our CUDA application when we run it under the CUDA Visual Profiler on Linux. When we create a new session and the toolkit generates the timeline, the kernel call doesn't appear if the matrix size is very large. Our application horizontally flips a matrix of random numbers, whose size we pass on the command line. We also have to use the routines of the CUDA profiler API (`cuda_profiler_api.h`) to measure the CPU and GPU times, but we don't know how (a sketch of one common approach follows the code below).

Could someone help us?

Thank you guys

#include <cuda.h>
#include <stdio.h>
#include <cuda_profiler_api.h>

#include <thrust/system_error.h>
#include <thrust/system/cuda_error.h>
#include <sstream>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/* START PROGRAM */

void inizializzaMatrice (int*,int,int);             // Initialize Matrix
void stampaMatrice (int*,int,int);                  // Print Matrix
void flipMatriceCPU (int*,int,int);                 // CPU Flipping
void confrontaMatrici (int*,int*,int,int);          // Compare matrices
__global__ void flipMatriceGPU (int*,int*,int,int); // GPU Flipping

int main(int argn, char * argv[]){
    dim3 nBlocchi,nThreadPerBlocco;
    int M,N,flag;
    int *in_host, *out_host,*out_DeToHo;
    int *in_device, *out_device;
    long int size;
    printf("\n\n******************** RIFLESSIONE ORIZZONTALE DI UNA MATRICE ********************\n\n");
    if(argn<6 || atoi(argv[2])%2==0 ){
        if(argn<6)
            printf("Numero di parametri insufficiente!!!\n");

        else if(atoi(argv[2])%2==0)
            printf("Errore nell'utilizzo di %s. Il numero di colonne <N> deve essere dispari\n",argv[0]);

        printf("Uso corretto: %s <M> <N> <NumThreadPerBlocco.x> <NumThreadPerBlocco.y> <flag per la Stampa>\n", argv[0]);
        printf("Uso dei valori di default ... ...\n\n\n"); 
        nThreadPerBlocco.x=2; 
        nThreadPerBlocco.y=3;
        M=4; N=9; flag=1;
    }
    else {
        M=atoi(argv[1]); 
        N=atoi(argv[2]);
        nThreadPerBlocco.x=atoi(argv[3]);
        nThreadPerBlocco.y=atoi(argv[4]);
        flag=atoi(argv[5]); 
    }


    nBlocchi.x=M/nThreadPerBlocco.x+((M%nThreadPerBlocco.x)==0?0:1); 
    nBlocchi.y=N/nThreadPerBlocco.y+((N%nThreadPerBlocco.y)==0?0:1); 

    size=M*N*sizeof(int);

// Print kernel launch configuration
    printf("Number of elements = %d * %d\n",M, N);
    printf("Threads per block = %d * %d\n", nThreadPerBlocco.x,nThreadPerBlocco.y); 
    printf("Number of blocks = %d * %d\n\n\n",nBlocchi.x,nBlocchi.y);

// Allocate data on the host
    in_host=(int*)malloc(size);
    out_host=(int*)malloc(size);
    out_DeToHo=(int*)malloc(size);

// Allocate data on the device

    gpuErrchk( cudaMalloc((void**)&in_device,size) );
    gpuErrchk( cudaMalloc((void**)&out_device,size) );

// Initialize data on the host

    inizializzaMatrice(in_host,M,N);

//cudaProfilerStart();

// Copy data from host to device

    gpuErrchk(cudaMemcpy(in_device, in_host, size, cudaMemcpyHostToDevice));

// Kernel launch

    flipMatriceGPU<<<nBlocchi, nThreadPerBlocco, size>>>(in_device, out_device, N,M);

// Copy results from device to host
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    gpuErrchk( cudaMemcpy(out_DeToHo, out_device, size, cudaMemcpyDeviceToHost) );


// CPU matrix flip
    memcpy(out_host,in_host,size);
    flipMatriceCPU(out_host,M,N);

//cudaProfilerStop();

// Print matrices

    if (flag==1){

        printf("Matrice di input:\n");
        //stampaMatrice(in_host, M, N);

        printf("Matrice di output host CPU:\n");
        //stampaMatrice(out_host, M, N);

        printf("Matrice di output device GPU:\n");
        stampaMatrice(out_DeToHo, M, N);

    }

    confrontaMatrici(out_host,out_DeToHo,M,N);
    printf("\n\n********************************************************************************\n\n");
    free(in_host);
    free(out_host);
    free(out_DeToHo);
    cudaFree(in_device);
    cudaFree(out_device);

    exit(0);
}






void inizializzaMatrice(int* matrice, int M, int N) {
    int i, j;
    for (i = 0; i < M; i++)
        for (j = 0; j < N; j++)
            matrice[i*N+j] = i*N+j;
}

void stampaMatrice(int*matrice, int M, int N) {
    int i,j; 
    for(i=0;i<M;i++) {
        for(j=0;j<N;j++)
            printf("%d\t", matrice[i*N+j]);
        printf("\n"); 
    }
}

void flipMatriceCPU(int *matrice, int row, int col){
    int i, j,tmp;
    for ( i = 0; i < row; i++ ) {
        for (  j = 0; j < col/2; j++ ) {
            tmp=matrice[col*i+j];
            matrice[col*i+j] = matrice[col*i+col-j-1];
            matrice[col*i+col-j-1] = tmp;

        }
    }
}

void confrontaMatrici(int* m1, int* m2, int M, int N) {
    int i, j;
    for (i = 0; i < M; i++)
        for (j = 0; j < N; j++)
            if (m1[i*N+j] != m2[i*N+j]) {
                printf("Host and device results differ.\n");
                return;
            }
    printf("Host and device results match.\n");
}

__global__ void flipMatriceGPU(int *in, int *out, int col, int row) {
    extern __shared__ int s_data[];
    int indexRow=threadIdx.x + blockIdx.x*blockDim.x; 
    int indexCol=threadIdx.y + blockIdx.y*blockDim.y; 
    int index=indexRow*col+indexCol;

    if(indexCol<col && indexRow<row){
        int index_data=blockDim.y-1-threadIdx.y+indexRow*col;

        s_data[index_data]=in[index];
        __syncthreads();

        int outOffset= blockDim.y*(gridDim.y-1-blockIdx.y);
        int outIndex= outOffset + threadIdx.y -(gridDim.y*blockDim.y - col) + indexRow*col;

        if(blockIdx.y==gridDim.y-1){
            outIndex+=gridDim.y*blockDim.y - col;
            out[outIndex]= s_data[(gridDim.y*blockDim.y - col)+(threadIdx.y+indexRow*col)];
        }

        else    
            out[outIndex]= s_data[threadIdx.y+indexRow*col];

    }
}
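
For reference, a minimal sketch of one common way to take those times, assuming CUDA events for the GPU side and `clock()` from `<time.h>` for the CPU side; `cudaProfilerStart`/`cudaProfilerStop` from `cuda_profiler_api.h` only delimit the region the profiler records and do not themselves return timings. `smemBytes` is a placeholder for whatever dynamic shared memory size is used at launch:

    // Timing sketch (would go inside main, after the allocations):
    cudaEvent_t start, stop;
    float gpuMs = 0.0f;
    gpuErrchk( cudaEventCreate(&start) );
    gpuErrchk( cudaEventCreate(&stop) );

    cudaProfilerStart();                       // begin the region the profiler records
    gpuErrchk( cudaEventRecord(start, 0) );
    flipMatriceGPU<<<nBlocchi, nThreadPerBlocco, smemBytes>>>(in_device, out_device, N, M);
    gpuErrchk( cudaEventRecord(stop, 0) );
    gpuErrchk( cudaEventSynchronize(stop) );
    gpuErrchk( cudaEventElapsedTime(&gpuMs, start, stop) ); // elapsed GPU time in ms
    cudaProfilerStop();                        // end the profiled region

    clock_t t0 = clock();                      // CPU-side timer
    flipMatriceCPU(out_host, M, N);
    double cpuMs = 1000.0 * (double)(clock() - t0) / CLOCKS_PER_SEC;

    printf("GPU: %f ms, CPU: %f ms\n", gpuMs, cpuMs);
    gpuErrchk( cudaEventDestroy(start) );
    gpuErrchk( cudaEventDestroy(stop) );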


  • Are you using [proper CUDA check](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) in your code? It might be that your kernel isn't actually executed if you are requesting too many resources. Also, the minimum sized code reproducing your issue that someone else could copy, paste, compile and run would be necessary if you hope to receive some concrete help. – Vitality Jul 07 '14 at 13:10
  • Thanks for the answer; we added a proper CUDA check to our code. The image shows two different outputs: in the first, the CUDA application, called "flip", returns the horizontally flipped version of a 90x131 matrix whose size we passed on the command line; in the second, we passed a matrix with a bigger size and the check code returns two errors. See the image here: [image link](https://www.dropbox.com/s/kwpcjumel8gsxr5/Schermata%202014-07-07%20alle%2015.51.48.png) – FlaGlo Jul 07 '14 at 14:02
  • The image says _GPUassert: invalid argument flip.cu 90_. Have you checked line number `90` of the flip.cu file? It would in any case be impossible to diagnose your problem without a repro code. – Vitality Jul 07 '14 at 14:26
  • Yeah, you're right, but we can't post our code here because our reputation is less than 10, so could you look at this link: [flip.cu](https://www.dropbox.com/s/r50l0rf8wh7pbcz/flip.cu)? Thanks a lot – FlaGlo Jul 07 '14 at 14:42

1 Answer


In the `flipMatriceGPU` `__global__` function, you are dynamically allocating a shared memory array of `size` bytes, i.e. the size of the whole matrix. For a 100 x 131 matrix, `size` becomes 100 * 131 * 4 = 52400 bytes, which exceeds the 48KB of shared memory available per block, so the kernel launch fails with the _invalid argument_ error shown above.
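
One way around the limit is to size the dynamic allocation per block (`blockDim.x * blockDim.y * sizeof(int)`) instead of passing the whole matrix size, although the kernel's `s_data` indexing would then have to be rewritten in tile-local (`threadIdx`-based) terms, since it currently offsets into shared memory by `indexRow*col`. As a point of comparison, a minimal sketch (hypothetical kernel name, not the original) that queries the actual per-block limit and performs the flip without shared memory at all, since a horizontal flip needs no staging:

    // Query the per-block shared memory limit (device 0 assumed):
    cudaDeviceProp prop;
    gpuErrchk( cudaGetDeviceProperties(&prop, 0) );
    printf("Shared memory per block: %zu bytes\n", prop.sharedMemPerBlock);

    // Hypothetical shared-memory-free flip kernel, for comparison:
    __global__ void flipSemplice(const int *in, int *out, int col, int row) {
        int r = threadIdx.x + blockIdx.x * blockDim.x;
        int c = threadIdx.y + blockIdx.y * blockDim.y;
        if (r < row && c < col)
            out[r * col + (col - 1 - c)] = in[r * col + c]; // mirror the column
    }

    // Launched with no dynamic shared memory:
    // flipSemplice<<<nBlocchi, nThreadPerBlocco>>>(in_device, out_device, N, M);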

  • It's also recommended not to directly include header files from thrust/system in your code. Your code will not compile on CUDA 6 due to this. – Robert Crovella Jul 07 '14 at 15:17
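
For what it's worth, the error-checking macro in the question does not actually need the thrust headers; a self-contained sketch of the same check:

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    // Same gpuErrchk as in the question, with the thrust/system includes
    // dropped and file passed as const char* so string literals are accepted.
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess)
        {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }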