0

I have a problem. I have 2 different implementations of a piece of code, but both have the same problem: the value assignment of a value inside an array, inside a struct, inside an array doesn't work in the code (I hope you still follow me).

Here are the two pieces of code:

Version1:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define ITER 4000

typedef struct Map{
    int length;
    double *A;
    int *x;
    int *dx;
    int *y;
    int *dy;
    int *delta;
    int *phi;
}Map;

typedef struct Coefs{
    int length;
    double *x;
    double *dx;
    double *y;
    double *dy;
    double *delta;
    double *phi;
}Coefs;

void cudaMallocMap(Map **m, int p){
    *m = (Map*) malloc(sizeof(Map));
    (**m).length = p;
    if(p>0){
        cudaMalloc((void**)&((**m).A), p*sizeof(double));
        cudaMalloc((void**)&((**m).x), p*sizeof(int));
        cudaMalloc((void**)&((**m).dx), p*sizeof(int));
        cudaMalloc((void**)&((**m).y), p*sizeof(int));
        cudaMalloc((void**)&((**m).dy), p*sizeof(int));
        cudaMalloc((void**)&((**m).delta), p*sizeof(int));
        cudaMalloc((void**)&((**m).phi), p*sizeof(int));
    }
}

void cudaFreeMap(Map **m){
    if((**m).length > 0){
        cudaFree((**m).A);
        cudaFree((**m).x);
        cudaFree((**m).dx);
        cudaFree((**m).y);
        cudaFree((**m).dy);
        cudaFree((**m).delta);
        cudaFree((**m).phi);
    }
    free(*m);
}

void cudaMallocCoefs(Coefs **c, int iter, int p){
    if(iter>0){
        int i;
        *c = (Coefs*) malloc(p*sizeof(Coefs));
        (*c)[0].length = iter;
        for(i=0;i<p;i++){
            cudaMalloc((void**)&((*c)[i].x), iter*sizeof(double));
            cudaMalloc((void**)&((*c)[i].dx), iter*sizeof(double));
            cudaMalloc((void**)&((*c)[i].y), iter*sizeof(double));
            cudaMalloc((void**)&((*c)[i].dy), iter*sizeof(double));
            cudaMalloc((void**)&((*c)[i].delta), iter*sizeof(double));
            cudaMalloc((void**)&((*c)[i].phi), iter*sizeof(double));
        }
    }
}

void cudaFreeCoefs(Coefs **c, int p){
    int i;
    for(i=0;i<p;i++){
        if((**c).length > 0){
            cudaFree((*c)[i].x);
            cudaFree((*c)[i].dx);
            cudaFree((*c)[i].y);
            cudaFree((*c)[i].dy);
            cudaFree((*c)[i].delta);
            cudaFree((*c)[i].phi);
        }
    }
    free(*c);
}

__global__ void testVals(Map *m, Coefs *c){
    m->length = 42;
    m->A[0] = 1.5;
    m->dx[20] = 5;
    c[0].delta[4] = 3.14159265;
}

int main(int argc, char **argv){
    int xSize = 31, particleCount = 1, iter = ITER;
    Map *dev_x;
    Coefs *dev_c;

    // allocate memory for the map
    cudaMallocMap(&dev_x, xSize);

    // malloc the coefficients
    cudaMallocCoefs(&dev_c, iter, particleCount);

    // cuda test kernel
    testVals<<<1, 1>>>(dev_x, dev_c);
    int testval1;
    double testval2;
    double testval3;
    int length1;
    int length2;
    cudaMemcpy(&testval1, &(dev_x->dx[20]), sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testval2, &(dev_x->A[0]), sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testval3, &(dev_c[0].delta[4]), sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(&length1, &(dev_x->length), sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&length2, &(dev_c->length), sizeof(int), cudaMemcpyDeviceToHost);

    // print test results
    fprintf(stderr, "Length map: %d\nLength coefs: %d\nValue map A[0]: %lf\nValue map dx[20]: %d\nValue coefs[0] delta[4]: %lf\n", length1, length2, testval2, testval1, testval3);

    // clean up the heap and tell that the computation is finished
    cudaFreeMap(&dev_x);
    cudaFreeCoefs(&dev_c, particleCount);
    getchar();
    return 0;
}

Version 2:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define ITER 4000

typedef struct Map{
    int length;
    double *A;
    int *x;
    int *dx;
    int *y;
    int *dy;
    int *delta;
    int *phi;
}Map;

typedef struct Coefs{
    int length;
    double *x;
    double *dx;
    double *y;
    double *dy;
    double *delta;
    double *phi;
}Coefs;

void cudaMallocMap(Map **m, int p){
    cudaMalloc((void**)m, sizeof(Map));
    cudaMemset(&((**m).length), p, sizeof(int));
    if(p>0){
        double **h_arr1 = (double**)malloc(sizeof(double*));
        int **h_arr2 = (int**)malloc(sizeof(int*));
        cudaMemcpy(h_arr1, &((**m).A), sizeof(double*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr1, p*sizeof(double));
        cudaMemcpy(h_arr2, &((**m).x), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        cudaMemcpy(h_arr2, &((**m).dx), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        cudaMemcpy(h_arr2, &((**m).y), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        cudaMemcpy(h_arr2, &((**m).dy), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        cudaMemcpy(h_arr2, &((**m).delta), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        cudaMemcpy(h_arr2, &((**m).phi), sizeof(int*), cudaMemcpyDeviceToHost);
        cudaMalloc((void**)h_arr2, p*sizeof(int));
        free(h_arr1);
        free(h_arr2);
    }
}

void cudaFreeMap(Map **m){
    Map h_map;
    cudaMemcpy(&h_map, *m, sizeof(Map), cudaMemcpyDeviceToHost);

    if(h_map.length > 0){
        cudaFree(h_map.A);
        cudaFree(h_map.x);
        cudaFree(h_map.dx);
        cudaFree(h_map.y);
        cudaFree(h_map.dy);
        cudaFree(h_map.delta);
        cudaFree(h_map.phi);
    }
    cudaFree(*m);
}

void cudaMallocCoefs(Coefs **c, int iter, int p){
    if(iter>0){
        int i;
        cudaMalloc((void**)c, p*sizeof(Coefs));
        for(i=0;i<p;i++){
            double **h_arr = (double**)malloc(sizeof(double*));
            cudaMemset(&((*c)[i].length), iter, sizeof(int));
            cudaMemcpy(h_arr, &((*c)[i].x), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            cudaMemcpy(h_arr, &((*c)[i].dx), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            cudaMemcpy(h_arr, &((*c)[i].y), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            cudaMemcpy(h_arr, &((*c)[i].dy), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            cudaMemcpy(h_arr, &((*c)[i].delta), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            cudaMemcpy(h_arr, &((*c)[i].phi), sizeof(double*), cudaMemcpyDeviceToHost);
            cudaMalloc((void**)h_arr, iter*sizeof(double));
            free(h_arr);
        }
    }
}

void cudaFreeCoefs(Coefs **c, int p){
    Coefs h_coefs;
    int i;
    for(i=0;i<p;i++){
        cudaMemcpy(&h_coefs, &((*c)[i]), sizeof(Coefs), cudaMemcpyDeviceToHost);
        if(h_coefs.length > 0){
            cudaFree(h_coefs.x);
            cudaFree(h_coefs.dx);
            cudaFree(h_coefs.y);
            cudaFree(h_coefs.dy);
            cudaFree(h_coefs.delta);
            cudaFree(h_coefs.phi);
        }
    }
    cudaFree(*c);
}

__global__ void testVals(Map *m, Coefs *c){
    m->length = 42;
    m->A[0] = 1.5;
    m->dx[20] = 5;
    c[0].delta[4] = 3.14159265;
}


int main(int argc, char **argv){
    int xSize = 31, iter = ITER, particleCount = 1;
    Map *dev_x;
    Coefs *dev_c;

    //malloc map
    cudaMallocMap(&dev_x, xSize);

    // malloc the coefficients 
    cudaMallocCoefs(&dev_c, iter, particleCount);


    // cuda test kernel
    testVals<<<1, 1>>>(dev_x, dev_c);
    int testval1;
    double testval2;
    double testval3;
    int length1;
    int length2;
    Map testmap;
    mallocMap(&testmap, xSize);
    Coefs *testcoefs;
    mallocCoefs(&testcoefs,iter, particleCount);
    cudaMemcpy(&testmap, &dev_x, sizeof(Map), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testcoefs, &dev_c, sizeof(Coefs), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testval1, &(testmap.dx[20]), sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testval2, &(testmap.A[0]), sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(&testval3, &(testcoefs[0].delta[4]), sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(&length1, &(testmap.length), sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&length2, &(testcoefs[0].length), sizeof(int), cudaMemcpyDeviceToHost);


    // print test results
    fprintf(stderr, "Length map: %d\nLength coefs: %d\nValue map A[0]: %lf\nValue map dx[20]: %d\nValue coefs[0] delta[4]: %lf\n", length1, length2, testval2, testval1, testval3);


    // clean up the heap and tell that the computation is finished
    cudaFreeMap(&dev_x);
    cudaFreeCoefs(&dev_c, particleCount);
    getchar();
    return 0;
}

I think that either the malloc doesn't work the way I want it to or the test kernel does something wrong with pointers without notifying me, but I can't seem to find what's wrong. Both versions run without error and show a very negative values where they should be 42 (set in kernel, overwriting the value when mallocing), 31 (set during mallocing), 1.5 , 5 and 3.141592 (set in kernel)

Can someone help me explain how to properly malloc an array of structures which contain arrays and the length of those arrays (Coefs/Map) and how to pass around their pointers?

EDIT: additionally, here is a graphical representation of the map and coefs:

Coefs *c: 
┌──────────────────────┬─────┬─────┬─────┬─────┬─────┬─
│ Particles/Iterations │  1  │  2  │  3  │  4  │  5  │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│        length=4000   │     │     │     │     │     │
│ 1: c[0].    x        │ x[0]│ x[1]│ x[2]│ x[3]│ x[4]│
│            dx        │dx[0]│dx[1]│dx[2]│dx[3]│dx[4]│
│             y        │ y[0]│ y[1]│ y[2]│ y[3]│ y[4]│
│            dy        │ ... │     │     │     │     │
│         delta        │     │     │     │     │     │
│           phi        │     │     │     │     │     │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│        length=4000   │     │     │     │     │     │
│ 2: c[1].    x        │     │     │     │     │     │
│            dx        │     │     │     │     │     │
│             y        │     │     │     │     │     │
│            dy        │     │     │     │     │     │
│         delta        │     │     │     │     │     │
│           phi        │     │     │     │     │     │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│        length=4000   │     │     │     │     │     │
│ 3: c[2].    x        │     │     │     │     │     │
│            dx        │     │     │     │     │     │
│             y        │     │     │     │     │     │
│            dy        │     │     │     │     │     │
│         delta        │     │     │     │     │     │
│           phi        │     │     │     │     │     │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│        length=4000   │     │     │     │     │     │
│ 4: c[2].    x        │     │     │     │     │     │
│            dx        │     │     │     │     │     │
│             y        │     │     │     │     │     │
│            dy        │     │     │     │     │     │
│         delta        │     │     │     │     │     │
│           phi        │     │     │     │     │     │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─

Map *m: 
┌──────────────────────┬─────┬─────┬─────┬─────┬─────┬─
│   Mapnr \ Mapline    │  1  │  2  │  3  │  4  │  5  │
├──────────────────────┼─────┼─────┼─────┼─────┼─────┼─
│        length=31     │     │     │     │     │     │
│ 1 (*m).     x        │ x[0]│ x[1]│ x[2]│ x[3]│ x[4]│
│            dx        │dx[0]│dx[1]│dx[2]│dx[3]│dx[4]│
│             y        │ y[0]│ y[1]│ y[2]│ y[3]│ y[4]│
│            dy        │ ... │     │     │     │     │
│         delta        │     │     │     │     │     │
│           phi        │     │     │     │     │     │
└──────────────────────┴─────┴─────┴─────┴─────┴─────┴─

As you can see, at the moment I use only one map and with a variable amount of map lines. Also I use a variable length array of Coefs, which has members of variable length ITER (default to 4000). This is what I need to malloc on the GPU, to be used in a kernel.

Rik Schaaf
  • 1,101
  • 2
  • 11
  • 30
  • in version 1, `cudaFreeCoefs` should use `c->length` instead of taking argument `p` , and you should either be careful not to call it if length was `0`, or (probably better) still do the first `malloc` and set the length to 0 ; like you did for `cudaMallocMap`. – M.M Jun 24 '14 at 02:12
  • In version 2, `cudaMemset(&((**m).length), p, sizeof(int));` should be `cudaMemcpy` – M.M Jun 24 '14 at 02:13
  • simiarly for `cudaMemset(&((*c)[i].length), iter, sizeof(int));` – M.M Jun 24 '14 at 02:13
  • I'm 99% sure that this doesn't do what you think it does, `cudaMemcpy(h_arr1, &((**m).A), sizeof(double*), cudaMemcpyDeviceToHost); cudaMalloc((void**)h_arr1, p*sizeof(double));` – M.M Jun 24 '14 at 02:22
  • In your post, you indicate that `length map` should be `42`, but I believe it should be `31`. Consider now the `cudaMemcpy(&length1, &(dev_x->length), sizeof(int), cudaMemcpyDeviceToHost);` instruction of Version1 and notice that `dev_x` is now a host-side variable, so that you cannot use it as the second argument of a `cudaMemcpy` with `DeviceToHost` direction. Try using `dev_x->length` instead of `length1` in your final `fprintf` instruction. – Vitality Jun 24 '14 at 05:26
  • @MattMcNabb about cudaFreeCoefs: no I shouldn't because p represents the amount of particles, where c->length represents the number of iterations. also, p will never be 0. This is only the relevant fraction of the code, p is set elsewhere and always >= 1. the amount of iterations can be 0, therefore I check for that. – Rik Schaaf Jun 24 '14 at 14:04
  • @MattMcNabb comment #4: what I think it does is make the devicepointer available on the host, so that I have a host pointer pointing to a device address. That way I can cudaMalloc arrays and put them in that pointer. Now I think of it, Should I also (or only) copy the pointers back? – Rik Schaaf Jun 24 '14 at 14:10
  • @JackOLantern you are right that the value is initially 31, but it is set to 42 in the kernel. And thanks for the tip on the length thing, that does make sense. On the other hand, it doesn't explain why the A[0], dx[20] and delta[4] don't work, since those point to device memory. – Rik Schaaf Jun 24 '14 at 14:15

0 Answers0