Giving a bit of context. I'm using c++17. I'm using pointer T* data
because this will interop with cuda code. I'm trying write a parallel version (on CPU) of a histogram creator. The sequential version:
template <class T>
vector<uint> Histogram<T>::SortDataToHist(T* data, size_t size)
{
vector<uint> bars{};
bars.resize(BarCount); // BarCount is the number of histogram bars
for (int i = 0; i < size; ++i) // size is the count of elements (in data*) to sort
{
// given the value of data[i] GetBarIndex will tell which bar it belongs, for counting
auto idx = GetBarIndex(data[i]);
// counting
bars[idx] += 1u;
}
return bars;
}
The parallel version splits the consideration of data array (read only) for several threads, sorts each sub array into local histograms and then merge (reduce) each into one final histogram. There is no need for mutex.
template <class T>
vector<uint> Histogram<T>::SortDataToHistPar(T* data, size_t size, int threadsCount)
{
vector<uint> bars{};
bars.resize(BarCount);
auto indexes = GetIndexes(size, threadsCount);
vector<future<vector<uint>>> futures{};
// loop to start threads
for (int i = 0; i < indexes.size() - 1; i++)
{
int idxA = indexes[i];
int idxB = indexes[i + 1];
future<vector<uint>> future = async(LocalSortHist, data, idxA, idxB);
// Error C3867 'Histogram<float>::LocalSortHist': non-standard syntax; use '&' to create a pointer to member
// Error C2672 'std::async': no matching overloaded function found
futures.push_back(future);
}
// loop to collect threads results
for (int i = 0; i < threadsCount; ++i)
{
auto result = futures[i].get();
for (int r = 0; r < BarCount; ++r)
bars[r] += result[r];
}
return bars;
}
I could not find a way to use LocalSortHist
as argument for async
. As written, I get:
- Error C3867
'Histogram<float>::LocalSortHist': non-standard syntax; use '&' to create a pointer to member
- Error C2672
'std::async': no matching overloaded function found
and with &Histogram<T>::LocalSortHist
(yes a template function has no address..) it yields:
- Error C2672
'async': no matching overloaded function found
- Error C2440
'initializing': cannot convert from 'std::vector<std::seed_seq::result_type,std::allocator<std::seed_seq::result_type>> (__cdecl Histogram<float>::* )(T *,uint,uint)' to 'std::launch'
- Error C2893
Failed to specialize function template 'std::future<_Select_invoke_traits<decay<_Ty>::type,decay<_ArgTypes>::type...>::type> std::async(_Fty &&,_ArgTypes &&...)'
With async(LocalSortHist<T>
it gives:
How can I use LocalSortHist
in several threads like this or so ?
For consideration, LocalSortHist
. The range [idxA, idxB] is the local consideration of the data array for "local sorting" or local histogram generation.
template <class T>
vector<uint> Histogram<T>::LocalSortHist(T* data, uint idxA, uint idxB)
{
vector<uint> bars{};
bars.resize(BarCount);
for (uint i = idxA; i < idxB; ++i)
{
auto idx = GetBarIndex(data[i]);
bars[idx] += 1u;
}
return bars;
}
And GetIndexes
:
template <class T>
vector<int> Histogram<T>::GetIndexes(size_t size, int threadsCount)
{
vector<int> pidx{};
int w = size / threadsCount;
int idx;
while(idx < size)
{
pidx.push_back(idx);
idx += w;
}
if (idx != size - 1)
pidx.push_back(size - 1);
return pidx;
}
A AAA test method:
TEST_METHOD(SortDataToHistParSOTest)
{
std::default_random_engine generator{};
std::normal_distribution<float> distribution(15.0, 5.0);
size_t sampleSize = 80000;
size_t sampleSizeBytes = sampleSize * sizeof(float);
float* samples = (float*)malloc(sampleSizeBytes);
for (int i = 0; i < sampleSize; ++i)
{
float number = distribution(generator);
samples[i] = number;
}
MinMax<float> mm;
mm.Min = 0.0f;
mm.Max = 30.0f;
Histogram sut(mm, 15);
auto hist = sut.SortDataToHistPar(samples, sampleSize, 16);
wstringstream s{};
for (auto x : hist)
s << x << L" ";
Logger::WriteMessage(s.str().c_str());
}