I'm trying to implement parallel algorithm that will compute Levenshtein distance between each of sequences in a list and store them in matrix (2d vector). In other words, I'm given 2d vector with numbers (thousands of number sequences of up to 30 numbers) and I need to compute Levenshtein distance between each vector of integers. I implemented serial algorithm that works, but when I tried to convert it to parallel, it is much slower (the more threads, the slower it is). The parallel version is implemented with c++11 threads (I also tried OpenMP, but with the same results).
Here is the function that distributes work:
vector<vector<int>> getGraphParallel(vector<vector<int>>& records){
int V = records.size();
auto threadCount = std::thread::hardware_concurrency();
if(threadCount == 0){
threadCount = 1;
}
vector<future<vector<vector<int>>>> futures;
int rowCount = V / threadCount;
vector<vector<int>>::const_iterator first = records.begin();
vector<vector<int>>::const_iterator last = records.begin() + V;
for(int i = 0; i < threadCount; i++){
int start = i * rowCount;
if(i == threadCount - 1){
rowCount += V % threadCount;
}
futures.push_back(std::async(getRows, std::ref(records), start, rowCount, V));
}
vector<vector<int>> graph;
for(int i = 0; i < futures.size(); i++){
auto result = futures[i].get();
for(const auto &row : result){
graph.push_back(row);
}
}
for(int i = 0; i < V; i++)
{
for(int j = i + 1; j < V; j++){
graph[j][i] = graph[i][j];
}
}
return graph;
}
Here is the function that computes rows of final matrix:
vector<vector<int>> getRows(vector<vector<int>>& records, int from, int count, int size){
vector<vector<int>> result(count, vector<int>(size, 0));
for(int i = 0; i < count; i++){
for(int j = i + from + 1; j < size; j++){
result[i][j] = levenshteinDistance(records[i + from], records[j]);
}
}
return result;
}
And finally function that computes Levenshtein distance:
int levenshteinDistance(const vector<int>& first, const vector<int>& second){
const int sizeFirst = first.size();
const int sizeSecond = second.size();
if(sizeFirst == 0) return sizeSecond;
if(sizeSecond == 0) return sizeFirst;
vector<vector<int>> distances(sizeFirst + 1, vector<int>(sizeSecond + 1, 0));
for(int i = 0; i <= sizeFirst; i++){
distances[i][0] = i;
}
for(int j = 0; j <= sizeSecond; j++){
distances[0][j] = j;
}
for (int j = 1; j <= sizeSecond; j++)
for (int i = 1; i <= sizeFirst; i++)
if (first[i - 1] == second[j - 1])
distances[i][j] = distances[i - 1][j - 1];
else
distances[i][j] = min(min(
distances[i - 1][j] + 1,
distances[i][j - 1] + 1),
distances[i - 1][j - 1] + 1
);
return distances[sizeFirst][sizeSecond];
}
One thing that came to my mind is that this slow down is caused by false sharing, but I could not check it with perf, because I'm working with Ubuntu in Oracle VirtualBox - cache misses are not avalaible there. If I'm right and the slow down is caused by false sharing, what should I do to fix it? If not, what is the reason of this slow down?