Running
template<SumAlgorithm algorithm_t, typename iterator_t, typename sum_t = typename std::iterator_traits<iterator_t>::value_type>
sum_t avx2_sum(iterator_t begin, iterator_t end) noexcept requires std::is_same<sum_t, float>::value && (algorithm_t == SumAlgorithm::Normal) {
// SIMD-parallel summation stage
auto running_sums = _mm256_set1_ps(0);
long iterator_skip = 256/(8*sizeof(sum_t));
for (; begin < end; begin += iterator_skip){
std::cout << "HERE\n";
__m256 tmp = _mm256_load_ps(begin);
running_sums = _mm256_add_ps(tmp, running_sums);
}
// Serial summation
running_sums = _mm256_add_ps(running_sums, _mm256_permute2f128_ps(running_sums, running_sums, 1));
running_sums = _mm256_hadd_ps(running_sums, running_sums);
running_sums = _mm256_hadd_ps(running_sums, running_sums);
return _mm256_cvtss_f32(running_sums);
}
like so
constexpr unsigned int num_elements = 4096*4096;
constexpr unsigned int alignment = 32;
alignas(alignment) std::vector<float> float_vec(num_elements, 1.0);
std::cout << "avx2<float, Normal>: " << accumulators::avx2_sum<algo::Normal>(&(*float_vec.begin()), &(*float_vec.end())) <<"\n";
segfaults at the first run of the loop during _mm256_load_ps. When I dereference begin, it is validly 1.0... I aligned the vector all the floats come from, why does this segfault? Any ideas?
Edit solution: as Peter Cordes said alignas only aligns the control block for the vector. used:Modern approach to making std::vector allocate aligned memory