I am trying to use GPU to parallelize my code in Visual Studio C++.
Currently, I used OpenMP to use CPU parallelization.
But I am thinking of using GPU parallelization because I think it would be faster if I use a bigger size of arrays in calculations.
Below is the code that I am working on. I only used parallelization once.
I found out that in order to use GPU parallelization, I need to use OpenCL or Cuda.
And OpenCL and Cuda seem like I need to change the whole code. So I was wondering whether there is a way to use GPU parallelization without changing the whole code (maybe just changing '#pragma omp parallel for')
#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h> // power
#include <cmath> // abs
#include <fstream>
#include <omp.h>
using namespace std;
using namespace chrono;
// Dynamically allocation with values(float)
void dallo_fn(float**** pMat, int Na, int Nd, int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
fill_n(Mat[i][j], Ny, 1);
}
}
*pMat = Mat;
}
// Dynamically allocation without values(float)
void dallo_fn0(float**** pMat, int Na, int Nd, int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
}
}
*pMat = Mat;
}
// Dynamically allocation without values(int)
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
int*** Mat = new int** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new int* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new int[Ny];
}
}
*pMat = Mat;
}
// Utility function
float utility(float a, float a_f, float d, float d_f, float y, double sig, double psi, double delta, double R) {
float C;
C = y + a - a_f / R - (d_f - (1 - delta) * d);
float result;
if (C > 0) {
result = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
}
else {
result = -999999;
}
return result;
}
int main()
{
#if defined _OPENMP
omp_set_num_threads(8);
#endif
float duration;
// Iteration Parameters
double tol = 0.000001;
int itmax = 200;
int H = 15;
// Model Parameters and utility function
double sig = 0.75;
double beta = 0.95;
double psi = 0.5;
double delta = 0.1;
double R = 1 / beta - 0.00215;
// =============== 2. Discretizing the state space =========================
// Size of arrays
const int Na = 1 * 91;
const int Nd = 1 * 71;
const int Ny = 3;
// Variables for discretization of state space
const float amin = -2;
const float amax = 7;
const float dmin = 0.01;
const float dmax = 7;
const float ymin = 0.5;
const float ymax = 1.5;
const float Ptrans[3] = { 0.2, 0.6, 0.2 };
// Discretization of state space
float ca = (amax - amin) / (Na - 1.0);
float cd = (dmax - dmin) / (Nd - 1.0);
float cy = (ymax - ymin) / (Ny - 1.0);
float* A = new float[Na];
float* Y = new float[Ny];
float* D = new float[Nd];
for (int i = 0; i < Na; i++) {
A[i] = amin + i * ca;
}
for (int i = 0; i < Nd; i++) {
D[i] = dmin + i * cd;
}
for (int i = 0; i < Ny; i++) {
Y[i] = ymin + i * cy;
}
// === 3. Initial guesses, Variable initialization and Transition matrix ===
// Initial guess for value function
float*** V;
dallo_fn(&V, Na, Nd, Ny);
float*** Vnew;
dallo_fn(&Vnew, Na, Nd, Ny);
// Initialization of other variables
float Val[Na][Nd];
float** Vfuture = new float* [Na];
for (int i = 0; i < Na; i++)
{
Vfuture[i] = new float[Nd];
}
float** temphoward = new float* [Na];
for (int i = 0; i < Na; i++)
{
temphoward[i] = new float[Nd];
}
float*** Vhoward;
dallo_fn0(&Vhoward, Na, Nd, Ny);
float*** tempdiff;
dallo_fn0(&tempdiff, Na, Nd, Ny);
int*** maxposition_a;
dallo_fn1(&maxposition_a, Na, Nd, Ny);
int*** maxposition_d;
dallo_fn1(&maxposition_d, Na, Nd, Ny);
float** mg_A_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_A_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_A_v[i][j] = A[i];
}
}
float** mg_D_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_D_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_D_v[i][j] = D[j];
}
}
float***** Uvec = new float**** [Na];
for (int i = 0; i < Na; i++) {
Uvec[i] = new float*** [Nd];
for (int j = 0; j < Nd; j++) {
Uvec[i][j] = new float** [Ny];
for (int k = 0; k < Ny; k++) {
Uvec[i][j][k] = new float* [Na];
for (int l = 0; l < Na; l++) {
Uvec[i][j][k][l] = new float[Nd];
}
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
for (int l = 0; l < Na; l++) {
for (int m = 0; m < Nd; m++) {
Uvec[i][j][k][l][m] = utility(A[i], mg_A_v[l][m], D[j], mg_D_v[l][m], Y[k], sig, psi, delta, R);
}
}
}
}
}
// Value function iteration
int it;
float dif;
float max;
it = 0;
dif = 1;
// ================ 4. Value function iteration ============================
while (dif >= tol && it <= itmax) {
system_clock::time_point start = system_clock::now();
it = it + 1;
// V = Vnew;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
V[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
Vfuture[i][j] = 0;
for (int k = 0; k < Ny; k++) {
Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
}
}
}
#pragma omp parallel for private(Val) // USE PARALLELIZATION
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
max = -99999;
for (int d = 0; d < Na; d++) {
for (int e = 0; e < Nd; e++) {
Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
if (max < Val[d][e]) {
max = Val[d][e];
maxposition_a[a][b][c] = d;
maxposition_d[a][b][c] = e;
}
}
}
Vnew[a][b][c] = max;
}
}
}
// Howard improvement
for (int h = 0; h < H; h++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
Vhoward[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
}
}
}
}
// Calculate Diff
dif = -100000;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
if (tempdiff[i][j][k] > dif) {
dif = tempdiff[i][j][k];
}
}
}
}
system_clock::time_point end = system_clock::now();
std::chrono::duration<float> sec = end - start;
cout << dif << endl;
cout << it << endl;
cout << sec.count() << endl;
}
for (int k = 0; k < Ny; k++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
cout << Vnew[i][j][k];
}
cout << '\n';
}
}
cout << omp_get_max_threads() << endl;
}