I have implemented this code from this to vectorizing it:
int c=0;
for (int j=-halfHeight; j<=halfHeight; ++j)
{
#pragma omp simd
for(int i=-halfWidth; i<=halfWidth; ++i){
wx_[c] = ofsx + j * a12 + i * a11;
wy_[c] = ofsy + j * a22 + i * a21;
x_[c] = (int) floor(wx_[c]);
y_[c] = (int) floor(wy_[c]);
++c;
}
}
std::cout<<"First size="<<size<<std::endl;
float imat_1[size];
std::cout<<"imat1"<<std::endl;
float imat_2[size];
std::cout<<"imat2"<<std::endl;
float imat_3[size];
std::cout<<"imat3"<<std::endl;
float imat_4[size];`
std::cout<<"imat4"<<std::endl;
#pragma omp simd
for(int c=0; c<size; c++){
if (x_[c] >= 0 && y_[c] >= 0 && x_[c] < width && y_[c] < height){
wx_[c] -= x_[c];
wy_[c] -= y_[c];
imat_1[c] = im.at<float>(y_[c],x_[c]);
imat_2[c] = im.at<float>(y_[c],x_[c]+1);
imat_3[c] = im.at<float>(y_[c]+1,x_[c]);
imat_4[c] = im.at<float>(y_[c]+1,x_[c]+1);
}
else{
wx_[c] = 0;
wy_[c] = 0;
imat_1[c] = 0;
imat_2[c] = 0;
imat_3[c] = 0;
imat_4[c] = 0;
ret = true;
}
}
std::cout<<"Second"<<std::endl;
#pragma omp simd
for(int c=0; c<size; c++){
out[c] =
(1.0f - wy_[c]) * ((1.0f - wx_[c]) * imat_1[c] + wx_[c] * imat_2[c]) +
( wy_[c]) * ((1.0f - wx_[c]) * imat_3[c] + wx_[c] * imat_4[c]);
}
In particular, size
can reach up to 275625
. When I run this code, it goes in segmentation fault at line float imat_4[size];
. In fact, this is solved by using float *imat_4 = (float*)malloc(sizeof(float)*size);
I think that this is because of this, so we run out of memory on the stack... But then, how can I solve this? I don't see much other possibilities for vectorizing this code.
notice that performance are crucial here, so allocating on the stack is less efficient (right?)