As a challenge, I was asked to make a parallel algorithm for inverting a matrix. I mainly looked at this paper and this SO question while doing research for it.
Before I tried to write my own code, I stumbled across someone else's implementation.
I come from an objective-c background, so I immediately thought of using GCD for this task. I also came across something called POSIX which seems more low-level, and might be apt for this task if GCD won't work--I don't know.
My naive attempt for parallelizing this was just to replace every for-loop with a dispatch_apply
, which worked (the product of the original and the inverse produces the identity matrix). However, that just slowed things down significantly (about 20x as slow at a glance). I see that there are SO questions on GCD and for-loops, but I'm mainly interested in what a better approach could be to this, not links to those answers which I've already read. Is it possible that the problem is the way I'm creating the dispatch queue, or maybe the fact that I'm only using one dispatch queue?
#include <stdio.h>
#include <dispatch/dispatch.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define PARALLEL true
void invertMatrixNonParallel(double **matrix, long n);
void invertMatrixParallel(double **matrix, long n, dispatch_queue_t q);
void invertMatrixParallel(double **matrix, long n, dispatch_queue_t q)
{
__block double r;
__block long temp;
dispatch_apply(n, q, ^(size_t i) {
dispatch_apply(n, q, ^(size_t j) {
matrix[i][j + n] = (j == i) ? 1 : 0;
});
});
/* using gauss-jordan elimination */
dispatch_apply(n, q, ^(size_t j) {
temp=j;
/* finding maximum jth column element in last (n-j) rows */
dispatch_apply(n - j - 1, q, ^(size_t i) {
if (matrix[i + j + 1][j] > matrix[temp][j])
{
temp = i + j + 1;
}
});
/* swapping row which has maximum jth column element */
if(temp!=j)
{
double *row = matrix[j];
matrix[j] = matrix[temp];
matrix[temp] = row;
}
/* performing row operations to form required identity matrix out of the input matrix */
dispatch_apply(n, q, ^(size_t i) {
r = matrix[i][j];
if (i == j)
{
dispatch_apply(2 * n, q, ^(size_t k) {
matrix[i][k]/=r ;
});
}
else
{
dispatch_apply(2 * n, q, ^(size_t k) {
matrix[i][k]-=(matrix[j][k]/matrix[j][j])*r ;
});
}
});
});
}
void invertMatrixNonParallel(double **matrix, long n)
{
double temporary, r;
long i, j, k, temp;
for (i = 0; i < n; ++i)
{
for (j = n; j < n * 2; ++j)
{
matrix[i][j] = (j == i + n) ? 1 : 0;
}
}
/* using gauss-jordan elimination */
for(j=0; j<n; j++)
{
temp=j;
/* finding maximum jth column element in last (n-j) rows */
for(i=j+1; i<n; i++)
if(matrix[i][j]>matrix[temp][j])
temp=i;
/* swapping row which has maximum jth column element */
if(temp!=j)
{
for(k=0; k<2*n; k++)
{
temporary=matrix[j][k] ;
matrix[j][k]=matrix[temp][k] ;
matrix[temp][k]=temporary ;
}
}
/* performing row operations to form required identity matrix out of the input matrix */
for(i=0; i<n; i++)
{
if(i!=j)
{
r=matrix[i][j];
for(k=0; k<2*n; k++)
matrix[i][k]-=(matrix[j][k]/matrix[j][j])*r ;
}
else
{
r=matrix[i][j];
for(k=0; k<2*n; k++)
matrix[i][k]/=r ;
}
}
}
}
#pragma mark - Main
int main(int argc, const char * argv[])
{
long i, j, k;
const long n = 5;
const double range = 10.0;
__block double **matrix;
__block double **invertedMatrix = malloc(sizeof(double *) * n);
matrix = malloc(sizeof(double *) * n);
invertedMatrix = malloc(sizeof(double *) * n);
for (i = 0; i < n; ++i)
{
matrix[i] = malloc(sizeof(double) * n);
invertedMatrix[i] = malloc(sizeof(double) * n * 2);
for (j = 0; j < n; ++j)
{
matrix[i][j] = drand48() * range;
invertedMatrix[i][j] = matrix[i][j];
}
}
clock_t t;
#if PARALLEL
dispatch_queue_t q1 = dispatch_queue_create("com.example.queue1", DISPATCH_QUEUE_CONCURRENT);
t = clock();
invertMatrixParallel(invertedMatrix, n, q1);
#else
t = clock();
invertMatrixNonParallel(invertedMatrix, n);
#endif
t = clock() - t;
double time_taken = ((double)t * 1000)/CLOCKS_PER_SEC; // in seconds
printf("\n%s took %f milliseconds to execute \n\n", (PARALLEL == true) ? "Parallel" : "Non-Parallel", time_taken);
printf("Here's the product of the inverted matrix and the original matrix\n");
double product[n][n];
for (i = 0; i < n; ++i)
{
for (j = 0; j < n; ++j)
{
double sum = 0;
for (k = 0; k < n; ++k)
{
sum += matrix[i][k] * invertedMatrix[k][j + n];
}
product[i][j] = sum;
}
}
// should print the identity matrix
for (i = 0; i < n; ++i)
{
for (j = 0; j < n; ++j)
{
printf("%5.2f%s", product[i][j], (j < n - 1) ? ", " : "\n");
}
}
return 0;
}
Output for parallel:
Parallel took 0.098000 milliseconds to execute
For nonparallel:
Non-Parallel took 0.004000 milliseconds to execute
For both:
Here's the product of the inverted matrix and the original matrix
1.00, -0.00, -0.00, 0.00, -0.00
0.00, 1.00, 0.00, 0.00, 0.00
0.00, -0.00, 1.00, -0.00, 0.00
-0.00, -0.00, -0.00, 1.00, 0.00
0.00, 0.00, 0.00, 0.00, 1.00
Please, no answers that are just links, I'm only using SO as a last resort.