
I have code that performs Singular Value Decomposition (SVD) on square matrices. The code does the job; however, it is quite slow, and as the matrix size increases it becomes unbearable. Since I am not familiar with parallel programming, I am asking the experts for advice before I start digging deeper and eventually realize that what I want to achieve is not even possible.

Thank you in advance.

void SVD::decompose() {
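// In-place Golub-Reinsch SVD: u (m x n) is first reduced to bidiagonal
// form with Householder reflections, then diagonalized by implicit-shift
// QR iterations, leaving u = U, w = the singular values and v = V,
// so that A = U * diag(w) * V^T.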
bool flag;
int i, its, j, jj, k, l, nm;
double anorm, c, f, g, h, s, scale, x, y, z;
Row rv1(n);
g = scale = anorm = 0.0;            //Householder reduction to bidiagonal form.
for (i = 0; i < n; i++) {
    l = i + 2;
    rv1[i] = scale*g;
    g = s = scale = 0.0;
    if (i < m) {
        for (k = i; k < m; k++) scale += abs(u[k][i]);
        if (scale != 0.0) {
            for (k = i; k < m; k++) {
                u[k][i] /= scale;
                s += u[k][i] * u[k][i];
            }
            f = u[i][i];
            g = -SIGN(sqrt(s), f);
            h = f*g - s;
            u[i][i] = f - g;
            for (j = l - 1; j < n; j++) {
                for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j];
                f = s / h;
                for (k = i; k < m; k++) u[k][j] += f*u[k][i];
            }
            for (k = i; k < m; k++) u[k][i] *= scale;
        }
    }
w[i] = scale * g;
    g = s = scale = 0.0;
    if (i + 1 <= m && i + 1 != n) {
        for (k = l - 1; k < n; k++) scale += abs(u[i][k]);
        if (scale != 0.0) {
            for (k = l - 1; k < n; k++) {
                u[i][k] /= scale;
                s += u[i][k] * u[i][k];
            }
            f = u[i][l - 1];
            g = -SIGN(sqrt(s), f);
            h = f*g - s;
            u[i][l - 1] = f - g;
            for (k = l - 1; k < n; k++) rv1[k] = u[i][k] / h;
            for (j = l - 1; j < m; j++) {
                for (s = 0.0, k = l - 1; k < n; k++) s += u[j][k] * u[i][k];
                for (k = l - 1; k < n; k++) u[j][k] += s*rv1[k];
            }
            for (k = l - 1; k < n; k++) u[i][k] *= scale;
        }
    }
    anorm = MAX(anorm, (abs(w[i]) + abs(rv1[i])));
}
for (i = n - 1; i >= 0; i--) {          //Accumulation of right-hand transformations.
    if (i < n - 1) {
        if (g != 0.0) {
            for (j = l; j < n; j++)         // Double division to avoid possible underflow.
                v[j][i] = (u[i][j] / u[i][l]) / g;
            for (j = l; j < n; j++) {
                for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
                for (k = l; k < n; k++) v[k][j] += s*v[k][i];
            }
        }
        for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0;
    }
    v[i][i] = 1.0;
    g = rv1[i];
    l = i;
}
for (i = MIN(m, n) - 1; i >= 0; i--) {          //Accumulation of left-hand transformations.
    l = i + 1;
    g = w[i];
    for (j = l; j < n; j++) u[i][j] = 0.0;
    if (g != 0.0) {
        g = 1.0 / g;
        for (j = l; j < n; j++) {
            for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j];
            f = (s / u[i][i])*g;
            for (k = i; k < m; k++) u[k][j] += f*u[k][i];
        }
        for (j = i; j < m; j++) u[j][i] *= g;
    }
    else for (j = i; j < m; j++) u[j][i] = 0.0;
    ++u[i][i];
}
for (k = n - 1; k >= 0; k--) {          //Diagonalization of the bidiagonal form: Loop over
    for (its = 0; its < 30; its++) {            //singular values, and over allowed iterations.
        flag = true;
        for (l = k; l >= 0; l--) {          //Test for splitting.
            nm = l - 1;
            if (l == 0 || abs(rv1[l]) <= eps*anorm) {
                flag = false;
                break;
            }
            if (abs(w[nm]) <= eps*anorm) break;
        }
        if (flag) {
            c = 0.0;            //Cancellation of rv1[l], if l > 0.
            s = 1.0;
            for (i = l; i < k + 1; i++) {
                f = s*rv1[i];
                rv1[i] = c*rv1[i];
                if (abs(f) <= eps*anorm) break;
                g = w[i];
                h = pythag(f, g);
                w[i] = h;
                h = 1.0 / h;
                c = g*h;
                s = -f*h;
                for (j = 0; j < m; j++) {
                    y = u[j][nm];
                    z = u[j][i];
                    u[j][nm] = y*c + z*s;
                    u[j][i] = z*c - y*s;
                }
            }
        }
        z = w[k];
        if (l == k) {           //Convergence.
            if (z < 0.0) {          //Singular value is made nonnegative.
                w[k] = -z;
                for (j = 0; j < n; j++) v[j][k] = -v[j][k];
            }
            break;
        }
        x = w[l];               //Shift from bottom 2-by-2 minor.
        nm = k - 1;
        y = w[nm];
        g = rv1[nm];
        h = rv1[k];
        f = ((y - z)*(y + z) + (g - h)*(g + h)) / (2.0*h*y);
        g = pythag(f, 1.0);
        f = ((x - z)*(x + z) + h*((y / (f + SIGN(g, f))) - h)) / x;
        c = s = 1.0;            //Next QR transformation:
        for (j = l; j <= nm; j++) {
            i = j + 1;
            g = rv1[i];
            y = w[i];
            h = s*g;
            g = c*g;
            z = pythag(f, h);
            rv1[j] = z;
            c = f / z;
            s = h / z;
            f = x*c + g*s;
            g = g*c - x*s;
            h = y*s;
            y *= c;
            for (jj = 0; jj < n; jj++) {
                x = v[jj][j];
                z = v[jj][i];
                v[jj][j] = x*c + z*s;
                v[jj][i] = z*c - x*s;
            }
            z = pythag(f, h);
            w[j] = z;           //Rotation can be arbitrary if z = 0.
            if (z != 0.0) {
                z = 1.0 / z;
                c = f*z;
                s = h*z;
            }
            f = c*g + s*y;
            x = c*y - s*g;
            for (jj = 0; jj < m; jj++) {
                y = u[jj][j];
                z = u[jj][i];
                u[jj][j] = y*c + z*s;
                u[jj][i] = z*c - y*s;
            }
        }
        rv1[l] = 0.0;
        rv1[k] = f;
        w[k] = x;
    }
}

}

  • Those variables are very hard to follow for me. Maybe they have significance to mathematicians, but I can't easily read this. – TankorSmash Aug 15 '16 at 17:55
  • Have you tried an existing linear algebra library, say OpenBLAS? My intuition is that getting good parallel SVD performance will be tricky. There's an interesting discussion on parallelizing SVD via the GPU here: http://stackoverflow.com/questions/26797226/matlab-svd-on-gpu – hacoo Aug 15 '16 at 20:59
  • For the future: comment variables if they are not self-descriptive. Like "int m; //width of the ...-matrix". Use more comments anyhow if you are not using descriptive methods, like "//invert matrix" before some block. You will thank YOURSELF when looking at your own code after a year. – Aziuth Aug 17 '16 at 08:41
  • @Aziuth why not replace m directly with matrixWidth and avoid the comment? – Richard Dally Aug 17 '16 at 08:46
  • @Richard That's why I wrote "IF they are not self-descriptive". Of course you're right, self-descriptive is way better than comments. Although, it might be that he used single letters because his mathematical formulas use them and he thought that the parallels to the formulas are easier to see that way. But yeah, I would've used descriptive variables, and, if someone ordered me to do otherwise, at least used comments. Dunno why he did neither. – Aziuth Aug 18 '16 at 09:00

1 Answer


Parts of your code can certainly be parallelized. How much you will gain is another question.

The easy way would be to use a common math library.
The fun way would be to use OpenMP and do it yourself.
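
For the library route, here is a minimal sketch of my own (the wrapper name svd and the row-major layout are my choices, not part of your class). It assumes LAPACKE, the C interface that ships with LAPACK/OpenBLAS, is available; with OpenBLAS you also get threaded kernels underneath for free:

#include <lapacke.h>   // C interface to LAPACK (ships with OpenBLAS)
#include <algorithm>
#include <vector>

// Hypothetical wrapper: SVD of a row-major m x n matrix a, so that
// a = u * diag(s) * vt. Note that dgesvd overwrites a. Returns true
// on success. Link with e.g. -llapacke -lopenblas.
bool svd(std::vector<double>& a, int m, int n,
         std::vector<double>& s,    // min(m,n) singular values
         std::vector<double>& u,    // m x m left singular vectors
         std::vector<double>& vt)   // n x n right singular vectors, transposed
{
    s.resize(std::min(m, n));
    u.resize(static_cast<size_t>(m) * m);
    vt.resize(static_cast<size_t>(n) * n);
    std::vector<double> superb(std::min(m, n) - 1);  // workspace for dgesvd
    lapack_int info = LAPACKE_dgesvd(LAPACK_ROW_MAJOR, 'A', 'A',
                                     m, n, a.data(), n,
                                     s.data(), u.data(), m,
                                     vt.data(), n, superb.data());
    return info == 0;   // info > 0 means the QR iteration did not converge
}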

But before you even think about OpenMP, consider rearranging your memory accesses. You tend to loop over the first index a lot, as in for (k = i; k < m; k++) u[k][i] *= scale;. That has a very bad cache hit rate, because in C++ u[k][i] is basically u[k * second_index_size + i], so consecutive iterations jump through memory a whole row apart. If you can restructure the algorithm (or store the transpose of u) so that the inner loop runs over the second index, as in for (k = i; k < m; k++) u[i][k] *= scale;, the accesses become contiguous and the cache is used well.
You should see quite a speedup from this alone.
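
To see how much this matters, here is a small self-contained benchmark (the size 4096 and the timing harness are my own illustration, not taken from your code); on typical hardware the unit-stride version is several times faster:

#include <chrono>
#include <cstdio>
#include <vector>

int main() {
    const int n = 4096;   // arbitrary size, big enough to exceed the caches
    std::vector<double> u(static_cast<size_t>(n) * n, 1.0);  // row-major: u[k][i] is u[k*n + i]

    using clock = std::chrono::steady_clock;
    auto ms = [](clock::duration d) {
        return std::chrono::duration_cast<std::chrono::milliseconds>(d).count();
    };

    auto t0 = clock::now();
    for (int i = 0; i < n; i++)         // inner loop strides by n doubles: cache hostile
        for (int k = 0; k < n; k++)
            u[static_cast<size_t>(k) * n + i] *= 1.0001;
    auto t1 = clock::now();

    for (int i = 0; i < n; i++)         // inner loop has unit stride: cache friendly
        for (int k = 0; k < n; k++)
            u[static_cast<size_t>(i) * n + k] *= 1.0001;
    auto t2 = clock::now();

    std::printf("column-wise: %lld ms, row-wise: %lld ms\n",
                static_cast<long long>(ms(t1 - t0)),
                static_cast<long long>(ms(t2 - t1)));
}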

Now for the OpenMP part.
Find out where the hot regions in your code are, for example with Visual Studio's profiler. Then you can use OpenMP to parallelize individual for loops, like
#pragma omp parallel for
for (k = i; k < m; k++) u[i][k] *= scale;
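
The dot-product loops in the Householder phase are candidates too, as reductions. A self-contained sketch (the matrix contents, sizes and indices are placeholders standing in for your class members, and the clause choices are my assumption about what is safe here, since the iterations only accumulate into s):

#include <cstdio>
#include <vector>

int main() {
    // Illustrative stand-ins for the members of your class.
    const int m = 2000, n = 2000, i = 3, j = 7;
    std::vector<std::vector<double>> u(m, std::vector<double>(n, 0.5));

    double s = 0.0;
    // reduction(+:s) gives each thread a private partial sum; the if clause
    // skips threading when the trip count is too small to pay for fork/join.
    #pragma omp parallel for reduction(+:s) if(m - i > 1024)
    for (int k = i; k < m; k++)
        s += u[k][i] * u[k][j];

    std::printf("s = %f\n", s);   // compile with -fopenmp (or /openmp in MSVC)
}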

What you will gain depends on where the hot regions are and how big your matrices are; only benchmarks will tell.

Dominic Hofer