
I am trying to use the GPU to parallelize my code in Visual Studio C++.

Currently I use OpenMP for CPU parallelization.

But I am thinking of switching to GPU parallelization because I think it would be faster once I use larger arrays in the calculations.

Below is the code that I am working on. I only use parallelization in one place.

I found out that in order to use GPU parallelization, I need to use OpenCL or CUDA.

But with OpenCL and CUDA it seems like I would need to change the whole code. So I was wondering whether there is a way to use GPU parallelization without changing the whole code (maybe just changing '#pragma omp parallel for').

#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <cmath>        // pow, abs
#include <fstream>
#include <omp.h>

using namespace std;
using namespace chrono;

// Dynamic allocation with initial values (float)
void dallo_fn(float**** pMat, int Na, int Nd, int Ny) {
    float*** Mat = new float** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new float* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new float[Ny];
            fill_n(Mat[i][j], Ny, 1);
        }
    }
    *pMat = Mat;
}

// Dynamic allocation without initial values (float)
void dallo_fn0(float**** pMat, int Na, int Nd, int Ny) {
    float*** Mat = new float** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new float* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new float[Ny];
        }
    }
    *pMat = Mat;
}

// Dynamic allocation without initial values (int)
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
    int*** Mat = new int** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new int* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new int[Ny];
        }
    }
    *pMat = Mat;
}

// Utility function
float utility(float a, float a_f, float d, float d_f, float y, double sig, double psi, double delta, double R) {
    float C;
    C = y + a - a_f / R - (d_f - (1 - delta) * d);
    float result;
    if (C > 0) {
        result = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
    }
    else {
        result = -999999;
    }
    return result;
}


int main()
{
    
#if defined _OPENMP
    omp_set_num_threads(8);
#endif

    float duration;

    // Iteration Parameters
    double tol = 0.000001;
    int itmax = 200;
    int H = 15;

    // Model Parameters and utility function
    double sig = 0.75;
    double beta = 0.95;
    double psi = 0.5;
    double delta = 0.1;
    double R = 1 / beta - 0.00215;

    // =============== 2. Discretizing the state space =========================

    // Size of arrays
    const int Na = 1 * 91;
    const int Nd = 1 * 71;
    const int Ny = 3;

    // Variables for discretization of state space
    const float amin = -2;
    const float amax = 7;
    const float dmin = 0.01;
    const float dmax = 7;
    const float ymin = 0.5;
    const float ymax = 1.5;
    const float Ptrans[3] = { 0.2, 0.6, 0.2 };

    // Discretization of state space
    float ca = (amax - amin) / (Na - 1.0);
    float cd = (dmax - dmin) / (Nd - 1.0);
    float cy = (ymax - ymin) / (Ny - 1.0);

    float* A = new float[Na];
    float* Y = new float[Ny];
    float* D = new float[Nd];

    for (int i = 0; i < Na; i++) {
        A[i] = amin + i * ca;
    }
    for (int i = 0; i < Nd; i++) {
        D[i] = dmin + i * cd;
    }
    for (int i = 0; i < Ny; i++) {
        Y[i] = ymin + i * cy;
    }

    // === 3. Initial guesses, Variable initialization and Transition matrix ===

    // Initial guess for value function
    float*** V;
    dallo_fn(&V, Na, Nd, Ny);
    float*** Vnew;
    dallo_fn(&Vnew, Na, Nd, Ny);

    // Initialization of other variables
    float Val[Na][Nd];
    float** Vfuture = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        Vfuture[i] = new float[Nd];
    }
    float** temphoward = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        temphoward[i] = new float[Nd];
    }

    float*** Vhoward;
    dallo_fn0(&Vhoward, Na, Nd, Ny);
    float*** tempdiff;
    dallo_fn0(&tempdiff, Na, Nd, Ny);
    int*** maxposition_a;
    dallo_fn1(&maxposition_a, Na, Nd, Ny);
    int*** maxposition_d;
    dallo_fn1(&maxposition_d, Na, Nd, Ny);

    float** mg_A_v = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        mg_A_v[i] = new float[Nd];
    }
    for (int j = 0; j < Nd; j++) {
        for (int i = 0; i < Na; i++) {
            mg_A_v[i][j] = A[i];
        }
    }

    float** mg_D_v = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        mg_D_v[i] = new float[Nd];
    }
    for (int j = 0; j < Nd; j++) {
        for (int i = 0; i < Na; i++) {
            mg_D_v[i][j] = D[j];
        }
    }

    float***** Uvec = new float**** [Na];
    for (int i = 0; i < Na; i++) {
        Uvec[i] = new float*** [Nd];
        for (int j = 0; j < Nd; j++) {
            Uvec[i][j] = new float** [Ny];
            for (int k = 0; k < Ny; k++) {
                Uvec[i][j][k] = new float* [Na];
                for (int l = 0; l < Na; l++) {
                    Uvec[i][j][k][l] = new float[Nd];
                }
            }
        }
    }

    for (int i = 0; i < Na; i++) {
        for (int j = 0; j < Nd; j++) {
            for (int k = 0; k < Ny; k++) {
                for (int l = 0; l < Na; l++) {
                    for (int m = 0; m < Nd; m++) {
                        Uvec[i][j][k][l][m] = utility(A[i], mg_A_v[l][m], D[j], mg_D_v[l][m], Y[k], sig, psi, delta, R);
                    }
                }
            }
        }
    }

    // Value function iteration
    int it;
    float dif;
    float max;
    it = 0;
    dif = 1;

    // ================ 4. Value function iteration ============================

    while (dif >= tol && it <= itmax) {
        system_clock::time_point start = system_clock::now();
        it = it + 1;
        // V = Vnew;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    V[i][j][k] = Vnew[i][j][k];
                }
            }
        }

        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                Vfuture[i][j] = 0;
                for (int k = 0; k < Ny; k++) {
                    Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // expected discounted continuation value
                }
            }
        }
        #pragma omp parallel for private(Val, max)     // USE PARALLELIZATION ('max' must be private too, otherwise the threads race on it)
        for (int a = 0; a < Na; a++) {
            for (int b = 0; b < Nd; b++) {
                for (int c = 0; c < Ny; c++) {
                    max = -99999;
                    for (int d = 0; d < Na; d++) {
                        for (int e = 0; e < Nd; e++) {
                            Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
                            if (max < Val[d][e]) {
                                max = Val[d][e];
                                maxposition_a[a][b][c] = d;
                                maxposition_d[a][b][c] = e;
                            }
                        }
                    }
                    Vnew[a][b][c] = max;
                }
            }
        }

        // Howard improvement
        for (int h = 0; h < H; h++) {
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        Vhoward[i][j][k] = Vnew[i][j][k];
                    }
                }
            }

            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
                        Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
                    }
                }
            }
        }


        // Calculate Diff
        dif = -100000;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
                    if (tempdiff[i][j][k] > dif) {
                        dif = tempdiff[i][j][k];
                    }
                }
            }
        }

        system_clock::time_point end = system_clock::now();
        std::chrono::duration<float> sec = end - start;


        cout << dif << endl;
        cout << it << endl;
        cout << sec.count() << endl;
    }

    for (int k = 0; k < Ny; k++) {
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                cout << Vnew[i][j][k] << ' ';
            }
            cout << '\n';
        }
    }
    cout << omp_get_max_threads() << endl;

}
  • _"...without changing the whole code..."_: GPU computing handles work in a different way from the CPU; more massively parallel, preferably stateless and branchless. It kind of needs you to design the program in a different way to take full advantage, – Richard Critten Mar 04 '21 at 20:45
  • Reminder: The calculation time should be longer than the time to load the GPU. – Thomas Matthews Mar 04 '21 at 21:31

1 Answer


There is no convenient way to add a #pragma and everything magically runs on the GPU.

However, your code is well suited for GPU acceleration: in your loops, the elements are independent of each other. In particular, you can parallelize the Na, Nd, and Ny loops on the GPU. You will need to:

  1. include the OpenCL C++ headers, see here
  2. linearize the triple loop: create a linear index n = (i*Nd+j)*Ny+k; and turn the three loops into one
  3. port your code to OpenCL C and get rid of the linear loop; a simple example of what a kernel looks like is here (a sketch is also given after this list)
  4. create Buffers (allocate memory on the GPU)
  5. create Kernel objects (one for each linearized triple loop) in C++ and link the Buffers as Kernel arguments
  6. manually handle CPU<->GPU memory transfer (enqueueReadBuffer/enqueueWriteBuffer)
  7. run the Kernels (enqueueNDRangeKernel)
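
For illustration, here is a minimal sketch of what the kernel from step 3 could look like for your maximization loop. The kernel name and argument layout are just placeholders, and it assumes Uvec, Vfuture, Vnew, maxposition_a and maxposition_d have been flattened into contiguous 1D buffers as in step 2:

// OpenCL C kernel sketch: one work-item handles one (a,b,c) combination.
__kernel void value_update(const int Na, const int Nd, const int Ny,
                           __global const float* Uvec,      // flattened [Na*Nd*Ny*Na*Nd]
                           __global const float* Vfuture,   // flattened [Na*Nd]
                           __global float* Vnew,            // flattened [Na*Nd*Ny]
                           __global int* maxposition_a,     // flattened [Na*Nd*Ny]
                           __global int* maxposition_d) {   // flattened [Na*Nd*Ny]
    const int n = get_global_id(0);          // linear index n = (a*Nd+b)*Ny+c
    if (n >= Na * Nd * Ny) return;
    const int c = n % Ny;
    const int b = (n / Ny) % Nd;
    const int a = n / (Ny * Nd);

    float maxval = -99999.0f;
    int best_d = 0, best_e = 0;
    for (int d = 0; d < Na; d++) {
        for (int e = 0; e < Nd; e++) {
            // Uvec[a][b][c][d][e] and Vfuture[d][e] in the linearized layout
            const float val = Uvec[(((a * Nd + b) * Ny + c) * Na + d) * Nd + e]
                            + Vfuture[d * Nd + e];
            if (val > maxval) { maxval = val; best_d = d; best_e = e; }
        }
    }
    Vnew[n] = maxval;
    maxposition_a[n] = best_d;
    maxposition_d[n] = best_e;
}

On the host side (steps 4-7) you would create one cl::Buffer per flattened array, set them as arguments with kernel.setArg(...), copy Uvec and Vfuture over with enqueueWriteBuffer, launch with queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(Na*Nd*Ny)), and read Vnew, maxposition_a and maxposition_d back with enqueueReadBuffer.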
ProjectPhysX
  • Thanks for the comments. Just one question: can I still use the matrix form A[][] rather than a linear index even after switching to OpenCL? – Changseok Ma Mar 09 '21 at 02:25
  • No, you have to use a linear index, even to access the matrix elements in C++. The elements of a 2D matrix A[][] may not be stored contiguously in memory, but OpenCL requires exactly that, see https://stackoverflow.com/a/35442603/9178992 – ProjectPhysX Mar 09 '21 at 07:06
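
For example, a minimal sketch of such a contiguous layout in C++ (the names flatA and idx are illustrative, not from the question's code):

#include <vector>

int main() {
    const int Na = 91, Nd = 71;

    // One contiguous allocation for an Na x Nd matrix; element (i,j) lives at i*Nd + j.
    std::vector<float> flatA(Na * Nd);
    auto idx = [Nd](int i, int j) { return i * Nd + j; };

    flatA[idx(2, 5)] = 1.0f;             // instead of A[2][5] = 1.0f;

    // Because all Na*Nd floats are contiguous, the whole matrix can be copied
    // to the GPU with a single enqueueWriteBuffer call.
    return 0;
}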