I am trying to optimize a piece of code without resorting to parallelization or SSE intrinsics.
The critical section currently runs in about 20 ms on my PC when compiled with -O2. That seems like quite a lot, even for ~17 million iterations (130 dimensions × 130,000 nodes). The particular piece that is too slow is as follows:
for (int d = 0; d < numDims; d++)
{
for (int i = 0; i < numNodes; i++)
{
bins[d][(int) (floodVals[d][i] * binSteps)]++;
}
}
Update: Switching from indexing to iterators reduced the run time to about 17 ms.
for (int d = 0; d < numDims; d++)
{
std::vector<float>::iterator floodIt;
for (floodIt = floodVals[d].begin(); floodIt < floodVals[d].end(); floodIt++)
{
bins[d][(int) (*floodIt * binSteps)]++;
}
}
The complete, self-contained test program is below:
#include <vector>
#include <random>
#include <iostream>
#include <chrono>
/// Benchmark driver for the binning loop.
///
/// Fills numDims x numNodes random float samples, then measures how long it
/// takes to histogram every dimension into binSteps bins. Prints the elapsed
/// time in milliseconds to stdout and returns 0.
int main()
{
    // Initialize random normalized input. The upper bound is kept just below
    // 1 so that sample * binSteps always truncates to an index in
    // [0, binSteps - 1].
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> dist(0.0f, 0.99999f);

    // Initialize dimensions
    const int numDims = 130;
    const int numNodes = 130000;
    const int binSteps = 30;

    // Make dummy data (filled row by row, in the same order as indexing d, i)
    std::vector<std::vector<float>> floodVals(numDims, std::vector<float>(numNodes));
    for (auto& row : floodVals)
    {
        for (float& value : row)
        {
            value = dist(gen);
        }
    }

    // Initialize binning: one zeroed histogram of binSteps counters per dimension
    std::vector<std::vector<int>> bins(numDims, std::vector<int>(binSteps, 0));

    // Time critical section of code.
    // steady_clock is monotonic, so the measured interval cannot be skewed by
    // system clock adjustments (high_resolution_clock gives no such guarantee).
    const auto start = std::chrono::steady_clock::now();
    for (int d = 0; d < numDims; ++d)
    {
        for (int i = 0; i < numNodes; ++i)
        {
            // Scale the sample to bin units and truncate to get its bin index.
            ++bins[d][static_cast<int>(floodVals[d][i] * binSteps)];
        }
    }
    const auto finish = std::chrono::steady_clock::now();

    // Let <chrono> perform the conversion to (fractional) milliseconds.
    const std::chrono::duration<double, std::milli> elapsed = finish - start;
    std::cout << "Elapsed: " << elapsed.count() << " ms" << '\n';
    return 0;
}