According to the test case below, the assembly generated for the AMD MI250 (gfx90a) is the same with or without the `simd` clause. However, if you look at the CPU code, you will see a significant change with the `simd` clause, which in this case allows an optimization similar to the one observed with an explicit use of the `restrict` keyword.
TL;DR: Currently, the `simd` clause is irrelevant and only leads to this warning, even for extremely trivial cases:
`loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]`
#include <cstddef>
#include <cstdint>
#define RESTRICT __restrict
using Float = double;
// Scalar baseline without restrict qualifiers: stores b[0] squared into a[0],
// then stores b[0] into a[1].
void test0_0(Float* a, const Float* b) {
    *a = (*b) * (*b);
    // a and b may alias (b[0] could be a[0]), so b[0] has to be read again
    // after the store above: forced store/reload.
    *(a + 1) = *b;
}
// Scalar variant with RESTRICT-qualified pointers: stores b[0] squared into
// a[0], then stores b[0] into a[1].
void test0_1(Float* RESTRICT a, const Float* RESTRICT b) {
    *a = (*b) * (*b);
    // The qualifiers promise that a and b do not alias, so no store/reload
    // of b[0] is forced between the two statements.
    *(a + 1) = *b;
}
// Host `parallel for` without simd. For every even index below length:
// a[idx] = b[idx] * b[idx], then a[idx + 1] = b[idx].
// Note: the last iteration also writes a[idx + 1], which is a[length] when
// length is odd.
void test1_0(Float* a, Float* b, std::size_t length) {
#pragma omp parallel for
    for (std::size_t idx = 0; idx < length; idx += 2) {
        Float* const dst = a + idx;
        const Float* const src = b + idx;
        dst[0] = src[0] * src[0];
        // a and b may alias, so src[0] is read again after the store above:
        // forced store/reload.
        dst[1] = src[0];
    }
}
// Host `parallel for simd`. For every even index below length:
// a[idx] = b[idx] * b[idx], then a[idx + 1] = b[idx].
// Note: the last iteration also writes a[idx + 1], which is a[length] when
// length is odd.
void test1_1(Float* a, Float* b, std::size_t length) {
#pragma omp parallel for simd
    for (std::size_t idx = 0; idx < length; idx += 2) {
        Float* const dst = a + idx;
        const Float* const src = b + idx;
        dst[0] = src[0] * src[0];
        // simd -> no loop-carried dependencies, hence no forced store/reload:
        // easier vectorization and less generated code.
        dst[1] = src[0];
    }
}
// Offloaded loop (`target teams distribute parallel for`), no simd and no
// restrict. For every even index below length:
// a[k] = b[k] * b[k], then a[k + 1] = b[k].
// Note: the last iteration also writes a[k + 1], which is a[length] when
// length is odd.
void test2_0(Float* a, Float* b, std::size_t length) {
#pragma omp target teams distribute parallel for
    for (std::size_t k = 0; k < length; k += 2) {
        a[k] = b[k] * b[k];
        // a and b may alias; per the report the assembly shows the forced
        // store/reload here, which is the expected outcome.
        a[k + 1] = b[k];
    }
}
// Offloaded loop (`target teams distribute parallel for`) with
// RESTRICT-qualified pointers and no simd. For every even index below length:
// a[k] = b[k] * b[k], then a[k + 1] = b[k].
// Note: the last iteration also writes a[k + 1], which is a[length] when
// length is odd.
void test2_1(Float* RESTRICT a, Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for
    for (std::size_t k = 0; k < length; k += 2) {
        a[k] = b[k] * b[k];
        // BAD: per the report the assembly still shows a forced store/reload
        // even though a and b are restrict-qualified.
        a[k + 1] = b[k];
    }
}
// Offloaded loop with the simd clause
// (`target teams distribute parallel for simd`); a and b are NOT
// restrict-qualified here. For every even index below length:
// a[k] = b[k] * b[k], then a[k + 1] = b[k].
// Note: the last iteration also writes a[k + 1], which is a[length] when
// length is odd.
void test3_0(Float* a, const Float* b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
    for (std::size_t k = 0; k < length; k += 2) {
        a[k] = b[k] * b[k];
        // BAD: per the report the assembly still shows a forced store/reload
        // even though the simd clause is present.
        a[k + 1] = b[k];
    }
}
// Offloaded loop with both the simd clause
// (`target teams distribute parallel for simd`) and RESTRICT-qualified
// pointers. For every even index below length:
// a[k] = b[k] * b[k], then a[k + 1] = b[k].
// Note: the last iteration also writes a[k + 1], which is a[length] when
// length is odd.
void test3_1(Float* RESTRICT a, const Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
    for (std::size_t k = 0; k < length; k += 2) {
        a[k] = b[k] * b[k];
        // BAD: per the report the assembly still shows a forced store/reload
        // even though a and b are restrict-qualified and simd is requested.
        a[k + 1] = b[k];
    }
}
test2_1(Float* RESTRICT a, Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0];
// ASM shows forced store/reload, but a/b are restricted BAD!
a[i + 1] = b[i + 0];
}
}
// Plain-copy variant of the offloaded simd test
// (`target teams distribute parallel for simd`); a and b are NOT
// restrict-qualified here. For every even index below length:
// a[k] = b[k], then a[k + 1] = b[k].
// Note: the last iteration also writes a[k + 1], which is a[length] when
// length is odd.
void test3_0(Float* a, const Float* b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
    for (std::size_t k = 0; k < length; k += 2) {
        a[k] = b[k];
        // BAD: per the report the assembly still shows a forced store/reload
        // even though the simd clause is present.
        a[k + 1] = b[k];
    }
}
// Plain-copy variant of the offloaded test with both the simd clause
// (`target teams distribute parallel for simd`) and RESTRICT-qualified
// pointers. For every even index below length:
// a[k] = b[k], then a[k + 1] = b[k].
// Note: the last iteration also writes a[k + 1], which is a[length] when
// length is odd.
void test3_1(Float* RESTRICT a, const Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
    for (std::size_t k = 0; k < length; k += 2) {
        a[k] = b[k];
        // BAD: per the report the assembly still shows a forced store/reload
        // even though a and b are restrict-qualified and simd is requested.
        a[k + 1] = b[k];
    }
}
Code available at: https://godbolt.org/z/sMY48s8jz