I am getting "call to cuMemcpyDtoHsync returned error 700: Illegal address during kernel execution" error when I try to parallelize this simple loop.
#include <vector>
#include <iostream>
using namespace std;
int main() {
vector<float> xF = {0, 1, 2, 3};
#pragma acc parallel loop
for (int i = 0; i < 4; ++i) {
xF[i] = 0.0;
}
return 0;
}
Compiled with: $ pgc++ -fast -acc -std=c++11 -Minfo=accel -o test test.cpp
main:
6, Accelerator kernel generated
Generating Tesla code
9, #pragma acc loop gang, vector(4) /* blockIdx.x threadIdx.x */
std::vector<float, std::allocator<float>>::operator [](unsigned long):
1, include "vector"
64, include "stl_vector.h"
771, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
T3 std::__copy_move_a2<(bool)0, const float *, decltype((std::allocator_traits<std::allocator<float>>::_S_pointer_helper<std::allocator<float>>((std::allocator<float>*)0)))>(T2, T2, T3):
1, include "vector"
64, include "stl_vector.h"
/usr/bin/ld: error in /tmp/pgc++cAUEgAXViQSY.o(.eh_frame); no .eh_frame_hdr table will be created.
$ ./test
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
The code runs normally without the #pragma, but I would like to make it parallel. What am I doing wrong?