
Does anyone know how to do QR decomposition via the modified Gram-Schmidt method in C and CUDA? Is there an example, some source code, a paper, or anything else? Thanks so much.

Edit: I can't answer my own question because someone closed it, so I decided to update the question instead. Here is my CUDA version:

/*
 * QR decomposition via modified Gram-Schmidt algorithm
 * 
 * @Package = QR-decomposition
 * @Program = QR_gpu
 * @Version = 13.0928
 */

// Libraries
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
#include <cuda.h>

// Constant
#define PROG "QR_cpu"
#define VERSION "13.1003"
#define PACKAGE "QR-Decomposition"

// Threads per block
#define THREAD_P_BLOCK 512
// Blocks per grid
#define BLOCKS_P_GRID  512

// macro
/* wrap each API call with the gpuErrchk macro, which will process the return 
status of the API call it wraps: http://bit.ly/1dTD0ZE */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prototypes
__global__ void xTA (float *, float *, int, int, int, int, int);
__global__ void scale (float *, int, int, float *);
__global__ void r1_update(float *, float *, int, int, int, int);
__host__ void gpuAssert(cudaError_t, const char *, int);
__host__ void print_matrix(float *, int, int, int);
__host__ void print_help (int);
__host__ int option_parser(int, char **, int *, int *);

// Host code
int main (int argc, char **argv) {

    int m, n, lda, ldr, i;
    float *A_d, *R_d;
    //cudaEvent_t t_start, t_stop;
    // Get "m" and "n" from command line
    if (0 != option_parser(argc, argv, &m, &n)) {
        fprintf(stderr, "Can\'t continue, exiting now!\n"); 
        exit(EXIT_FAILURE);
    }
    size_t A_size = m * n * sizeof(float);
    size_t R_size = n * n * sizeof(float);

    lda = n; ldr = n;

    // Allocate input matrices A_h and R_h in host memory
    float *A_h = (float *) malloc(A_size);
    float *R_h = (float *) malloc(R_size);
    memset(R_h, 0, R_size);

    // Initialize A as the diagonal matrix A(i,i) = i + 1, for i = 0..n-1
    for (i = 0; i < n; i++)
        A_h[i*lda + i] = i + 1;

    // Allocate matrices in device memory
    gpuErrchk (cudaMalloc(&A_d, A_size));
    gpuErrchk (cudaMalloc(&R_d, R_size));
    // Copy the A matrix from host memory to device memory
    gpuErrchk (cudaMemcpy(A_d, A_h, A_size, cudaMemcpyHostToDevice));
    // Set R matrix to 0 
    gpuErrchk (cudaMemset(R_d, 0, R_size));

    /**** Invoke kernel ****/
    dim3 dimBlock (THREAD_P_BLOCK, 1, 1);
    // One-dimensional grid (x only)
    dim3 dimGrid_M ((m + THREAD_P_BLOCK - 1) / THREAD_P_BLOCK, 1, 1);
    // Two-dimensional grid (x and y)
    dim3 dimGrid_B (BLOCKS_P_GRID, (m + THREAD_P_BLOCK - 1) / THREAD_P_BLOCK, 1);
    // Gram-Schmidt algorithm step by step
    for (i = 0; i < n; i++) {
        // Step #1 --> R(i,i:n-1) = A'(:,i) * A(:,i:n-1)
        xTA <<< dimBlock, dimGrid_B >>> (R_d, A_d, m, n, lda, ldr, i);
        // Steps #2 and #3 (the kernel computes s = sqrt(R(i,i)), then scales a column)
        scale <<< dimBlock, dimGrid_M >>> (A_d + i, m, lda, R_d + i*ldr + i);
        // Step #4 (Is the scale of a row)
        scale <<< dimBlock, dimGrid_M >>> (R_d + ldr*i, m, 1, R_d + i*ldr + i);
        // Step #5 --> A(:,i+1:n−1) = A(:,i+1:n−1) − A(:,i) ∗ R(i,i+1:n−1)
        r1_update <<< dimBlock, dimGrid_B >>> (A_d, R_d + i*lda, m, n, lda, i);
    }
    // Copy the results from device memory to host memory
    gpuErrchk (cudaMemcpy(A_h, A_d, A_size, cudaMemcpyDeviceToHost));
    // Free device memory
    cudaFree(A_d); cudaFree(R_d);
    // Free host memory
    free(A_h); free(R_h);

    return 0;
}

/**
 * ## Kernel 1
 * 
 * Rank 1 update of columns of A
 */
__global__ void r1_update (float *A, float *R, int m, int n, int lda, int k) {
    // get x,y coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.x;

    if (x < m && y < n-k-1)
        A[x*lda + y + k + 1] -= A[x*lda + k] * R[y + k + 1];
}

/**
 * ## Kernel 2
 * 
 * matrix vector product
 * Performs R[i] =  x'A where x' is a row of A
 * 
 * How leading dimension is used for matrices: http://ibm.co/19PLtIX
 */
__global__ void xTA (float *R, float *A, int m, int n, int lda, int ldr, int k){
    // block column * block dim + column (computed by each thread)
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j;
    // upper triangular matrix 
    if (i < n - k) {
        for (j = 0; j < m; j++)
            R[k*ldr + k + i] += A[k*lda + j] * A[j*lda + k + i];
    }
}

/**
 * ## Kernel 3
 * 
 * mult. for constant s
 * d vector
 * ld leading dimension (distance between consecutive elements)
 */
__global__ void scale (float *d, int m, int ld, float *R_x) {
    // block column * block dim + column (computed by each thread)
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    // s = sqrt(R(i,i))
    // Static initialization of shared variables is illegal in CUDA. 
    // The problem is that the semantics of how every thread should treat 
    // static initialization of shared memory is undefined in the 
    // programming model. Which thread should do the write? What happens if 
    // the value is not uniform between threads? How should the compiler 
    // emit code for such a case and how should the hardware run it?
    __shared__ float s; s = sqrt(*R_x);
    // and scale
    if (i < m) d[i*ld] /= s;
}

/*
 * GPU Error Handler (CUDA_SAFE_CALL was removed as of CUDA 5.0)
 */
__host__ void gpuAssert(cudaError_t code, const char *file, int line) {

    if (code != cudaSuccess) {
        fprintf(stderr,"GPUassert: %s %s %d\n", 
            cudaGetErrorString(code), file, line);
        exit(code);
    }
}

/* 
 * Print matrix
 * 
 * Print a matrix passed as argument
 */
__host__ void print_matrix (float * matrix, int m, int n, int ld) {

    int i, j;

    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++)
            printf("%0.5f ", matrix[i*ld + j]);
        printf("\n");
    }
}

/*
 * The option parser
 *
 * The function parses the parameters passed on the command line and runs
 * the corresponding procedures.
 *
 * Return value:
 *   0 on success
 *  -1 on failure
 *
 * Please, see http://www.gnu.org/software/libc/manual/html_node/Getopt.html
 * for further information. (thanks to Frodo Looijaard)
 */
__host__ int option_parser (int argc, char **argv, int * m, int * n) {

    int opt;

    if (argc < 2) {
        fprintf(stderr, "The program needs arguments...\n\n");
        print_help(1);
    }

    opterr = 0;

    while ( -1 != (opt = getopt (argc, argv, "hr:c:"))) {
        switch (opt) {
            case 'h': 
                print_help(0);
            case 'r': 
                printf("optarg: %s\n", optarg); 
                if ((*m = atoi(optarg)) < 2) return -1; 
                break;
            case 'c': 
                printf("optarg: %s\n", optarg);
                if ((*n = atoi(optarg)) < 2 || *n > *m) return -1; 
                break;
            case '?':
                if (optopt == 'r' || optopt == 'c')
                    fprintf(stderr,"Option -%c requires an argument.\n",optopt);
                else if (isprint (optopt))
                    fprintf(stderr,"Unknown option `-%c'.\n", optopt);
                else
                    fprintf(stderr,"Unknown option chr `\\x%x'.\n", optopt);
                return -1;
            default:
                fprintf(stderr, "default switch-case statement reached\n");
                return -1;
        }
        //for (ii = optind; ii < argc; ii++)
        //  printf ("non-option argument %s\n", argv[ii]);
    }
    return 0;
}

/*
 * The helper
 *
 * Shows how to run the program correctly
 */
__host__ void print_help (int exit_code) {

    printf("\nPKG : %s\nPROGRAM : %s\nVERSION : %s\n\n",PACKAGE,PROG,VERSION);

    printf("%s [-h] [-r num of rows] [-c num of columns]\n\n", PROG);
    printf("  -h    print this help and exit\n");
    printf("  -r    provide the number of rows\n");
    printf("  -c    provide the number of colums\n\n");

    printf("  Example: ./qr_gpu -r 800 -c 600\n\n");

    exit_code ? exit(EXIT_FAILURE) : exit(EXIT_SUCCESS);
}

When I run the program with cuda-memcheck I obtain this result:

[mcrociara@tesla project_CUDA]$ cuda-memcheck ./qr_gpu -r 4 -c 4
========= CUDA-MEMCHECK
optarg: 4
optarg: 4
GPUassert: unspecified launch failure src/qr_gpu.cu 99
========= Invalid global read of size 4
=========     at 0x000000c8 in xTA
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x3b5273104 is out of bounds
========= Invalid global read of size 4
=========     at 0x000000c8 in xTA
=========     by thread (1,0,0) in block (0,0,0)
=========     Address 0x3b5273108 is out of bounds
========= Invalid global read of size 4
=========     at 0x000000c8 in xTA
=========     by thread (2,0,0) in block (0,0,0)
=========     Address 0x3b527310c is out of bounds
========= ERROR SUMMARY: 3 errors

Can someone help me understand why? I implemented a serial version of this algorithm that seems to work properly:

/*
 * QR decomposition via modified Gram-Schmidt algorithm
 */

// Libraries
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <time.h>
#include <math.h>

// Constant
#define PROG "QR_cpu"
#define VERSION "28.0913"
#define PACKAGE "QR-Decomposition"

// Prototypes
void r1_update(double *, double *, int);
void xTA (double *, double *, int);
void scale (double *, int, double);
void gram (double *, double *);
void print_matrix(double *);
int option_parser(int, char **);
void print_help (int);

// Number of rows
int M = -1; 
// Number of columns
int N = -1;
// Leading Dimension of A and R
int LDA = -1;
int LDR = -1;
// The clock
clock_t start, stop;

/*
 *
 */
int main (int argc, char **argv) {

    int i;

    // Get M, N from command line
    if (0 != option_parser(argc, argv)) {
        fprintf(stderr, "Bad option man!!!\n");
        exit(EXIT_FAILURE);
    }
    // Check the size of M and N 
    if (M > 5000 || N > 5000) {
        fprintf(stderr, "Too big man!!!\n");
        exit(EXIT_FAILURE);
    }

    // Set the leading dimension of A and R
    LDA = N; LDR = N;

    // Allocate memory for A and R 
    double *A = calloc(M * N, sizeof(*A));
    double *R = calloc(N * N, sizeof(*R));

    // Set the diagonal of A as A(i,i) = i + 1 with i from 0 to N−1
    for (i = 0; i < N; i++) 
        A[i*LDA + i] = i + 1;

    start = clock();
    gram(A, R);
    stop = clock();

    printf("\nTime: %0.4f\n\n",(stop - start) / (double)(CLOCKS_PER_SEC));
    // print_matrix(A);
    // print_matrix(R);

    free(A); free(R);

    return 0;
}

/**
 * Rank 1 update of columns of A
 */
void r1_update (double *A, double *R, int k) {

    int i, j;

    for(i = 0; i < M; i++)
        for(j = k + 1; j < N; j++)
            A[i*LDA + j] -= A[i*LDA + k] * R[j];
}

/**
 * Matrix vector product
 * Performs R[i] =  x'A where x' is a row of A
 * A : m x k, leading dimension lda
 * 
 * How leading dimension is used for matrices: http://ibm.co/19PLtIX
 */
void xTA (double *R, double *A, int k) {

    int i, j;
    // upper triangular matrix 
    for (i = 0; i < N-k; i++)
        for (j = 0; j < M; j++)
            R[k*LDR + k + i] += A[k*LDA + j] * A[j*LDA + k + i];
}

/**
 * Mult. for constant s
 * d vector
 * ld leading dimension (distance from elements)
 */
void scale (double *d, int ld, double s) {

    int i;

    for (i = 0; i < M; i++) d[i*ld] *= s;
}

/**
 * Performs Modified Gram Schmidt
 * orthogonalization of columns of A
 * A m x n
 * R n x n
 */ 
void gram (double *A, double *R) {

    int i;
    double s;
    // Modified Gram Schmidt algorithm step by step
    for (i = 0; i < N; i++) {
        // Step #1 --> R(i,i:n-1) = A'(:,i) * A(:,i:n-1)
        xTA(R, A, i);
        // Step #2 (Normalizing) --> s = sqrt(R(i,i))
        s = 1 / sqrt(R[i*LDR + i]);
        // Step #3 (Is the scale of a column vector)
        scale(A + i, LDA, s);
        // Step #4 (Is the scale of a row)
        scale(R + LDR*i, 1, s);
        // Step #5 --> A(:,i+1:n−1) = A(:,i+1:n−1) − A(:,i) ∗ R(i,i+1:n−1)
        r1_update(A, R + i*LDR, i);
    }
}

/* 
 * Print Matrix
 * 
 * Print a matrix passed as argument
 */
void print_matrix (double * matrix) {

    int i, j;

    for (i = 0; i < M; i++) {
        for (j = 0; j < N; j++)
            printf("%0.2f ", matrix[i*LDA + j]);
        printf("\n");
    }
}

/*
 * The option parser
 *
 * The function parses the parameters passed on the command line and runs
 * the corresponding procedures.
 *
 * Return value:
 *   0 on success
 *  -1 on failure
 *
 * Please, see http://www.gnu.org/software/libc/manual/html_node/Getopt.html
 * for further information. (thanks to Frodo Looijaard)
 */
int option_parser (int argc, char **argv) {

    int opt;

    if (argc < 2) {
        fprintf(stderr, "This program needs arguments...\n\n");
        print_help(1);
    }

    opterr = 0;

    while ( -1 != (opt = getopt (argc, argv, "hr:c:"))) {
        switch (opt) {
            case 'h': 
                print_help(0);
            case 'r': 
                printf("optarg: %s\n", optarg); 
                if ((M = atoi(optarg)) < 2) return -1; 
                break;
            case 'c': 
                printf("optarg: %s\n", optarg);
                if ((N = atoi(optarg)) < 2 || N > M) return -1; 
                break;
            case '?':
                if (optopt == 'r' || optopt == 'c')
                    fprintf(stderr,"Option -%c requires an argument.\n",optopt);
                else if (isprint (optopt))
                    fprintf(stderr,"Unknown option `-%c'.\n", optopt);
                else
                    fprintf(stderr,"Unknown option chr `\\x%x'.\n", optopt);
                return -1;
            default:
                fprintf(stderr, "default switch-case statement reached\n");
                return -1;
        }
        //for (ii = optind; ii < argc; ii++)
        //  printf ("Non-option argument %s\n", argv[ii]);
    }
    return 0;
}

/*
 * The helper
 *
 * Shows how to run the program correctly
 */
void print_help (int exit_val) {

    printf("\nPKG : %s\nPROGRAM : %s\nVERSION : %s\n\n",PACKAGE,PROG,VERSION);

    printf("%s [-h] [-r num of rows] [-c num of columns]\n\n", PROG);
    printf("  -h    print this help and exit\n");
    printf("  -r    provide the number of rows\n");
    printf("  -c    provide the number of colums\n\n");

    printf("  Example: ./qr_cpu -r 800 -c 600\n\n");

    exit_val ? exit(EXIT_FAILURE) : exit(EXIT_SUCCESS);
}

Thanks in advance!

realnot
  • I think that, as it stands, this question is off-topic for StackOverflow. "How does QR via GS work?" is a maths question. "How do I code this in CUDA?" is asking too much for an SO question. It would be best if you provided *your* code (if you understand how GS works and know how to program basic CUDA, you *can* do a working implementation) and then asked about improvements. – us2012 Sep 13 '13 at 13:32
  • Perhaps the code at [LU, QR and Cholesky factorizations using GPU](https://devtalk.nvidia.com/default/topic/405197/?comment=2852975) could be of interest to you. I don't know if it uses Gram-Schmidt at all. – Vitality Sep 13 '13 at 13:35
  • @JackOLantern that looks like an answer to me. Voting to close, as per SO: "Questions asking us to recommend or find a tool, library or favorite off-site resource are off-topic for Stack Overflow as they tend to attract opinionated answers and spam. Instead, describe the problem and what has been done so far to solve it. " – Robert Crovella Sep 13 '13 at 22:07
  • @RobertCrovella Yes, this post is off-topic for SO. Nevertheless, I have checked that the quoted code does not use the Gram-Schmidt method, but Householder reflections instead. I would anyway recommend the poster to have a look at the code as well as the related paper, which have been authored by V. Volkov (along with J.W. Demmel). – Vitality Sep 14 '13 at 08:21
    I personally would use the (modified) Gram-Schmidt method only for small matrices and use Householder transformations for large matrices. So let's say you have small matrices. Then the parallelism comes not from the matrix size but from the number of matrices. As Gram-Schmidt does not involve any conditional branches (it's just a fixed pattern of vector projections and subtractions) you can simply use one thread per matrix. But be sure to use a memory layout that allows all threads in a warp to load and store simultaneously. – CliffordVienna Sep 14 '13 at 12:16
  • @RobertCrovella Since the post has not been closed yet, I have expanded my comment into an answer :-) – Vitality Sep 14 '13 at 15:05
  • CUDA 7 (now in release candidate version) provides QR decomposition routines. It would be good to reopen this question to add this comment as an answer with an example for the benefit of the community. – Vitality Feb 05 '15 at 08:09
  • @JackOLantern How? I solved the problem; I'd like to show my solution/example so other students of parallel computing can save time. – realnot Mar 06 '15 at 12:23
  • @MauroCrociara You need a certain amount of reopen votes to reopen a question. I have added an answer to [QR decomposition to solve linear systems in CUDA](http://stackoverflow.com/questions/22399794/qr-decomposition-to-solve-linear-systems-in-cuda) which follows my comment above. – Vitality Mar 07 '15 at 18:32
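
To make CliffordVienna's one-thread-per-matrix suggestion concrete, here is a rough, hypothetical sketch (the kernel name, the interleaved layout, and the EL macro are illustrative only, and it assumes every matrix in the batch has full column rank): each thread runs the whole MGS loop on its own small n x n matrix, and element (i,j) of matrix t is stored at index (i*n + j)*batch + t, so that consecutive threads in a warp touch consecutive addresses.

// Hypothetical sketch: one thread factors one small n x n matrix via MGS.
// The batch is interleaved so that warp loads and stores coalesce.
__global__ void mgs_batched(float *A, float *R, int n, int batch) {
    int t = blockIdx.x * blockDim.x + threadIdx.x;  // one matrix per thread
    if (t >= batch) return;
    // element (i,j) of matrix t in the interleaved layout
    #define EL(M, i, j) M[((i)*n + (j)) * batch + t]
    for (int k = 0; k < n; k++) {
        float s = 0.0f;
        for (int i = 0; i < n; i++) s += EL(A,i,k) * EL(A,i,k);
        s = rsqrtf(s);                               // 1 / ||a_k||
        EL(R,k,k) = 1.0f / s;
        for (int i = 0; i < n; i++) EL(A,i,k) *= s;  // normalize column k
        for (int j = k + 1; j < n; j++) {            // orthogonalize the rest
            float r = 0.0f;
            for (int i = 0; i < n; i++) r += EL(A,i,k) * EL(A,i,j);
            EL(R,k,j) = r;
            for (int i = 0; i < n; i++) EL(A,i,j) -= r * EL(A,i,k);
        }
    }
    #undef EL
}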

1 Answer


There are different ways to calculate the QR decomposition of a matrix. The main methods are:

  1. Gram-Schmidt process;
  2. Householder reflections;
  3. Givens rotations.

Gram-Schmidt is a sequence of projections and vector subtractions, which may be implemented as a sequence of kernels performing reductions (for the projections) and element-wise array operations (for the vector subtractions); a minimal sketch of this structure is given after the paper list below. You can have a look at the papers

a) QR Decomposition on GPUs

b) Parallel Implementation of Classical Gram-Schmidt Orthogonalization on CUDA Graphics Cards

c) CPU vs. GPU - Performance comparison for the Gram-Schmidt algorithm
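
Here is the minimal sketch mentioned above (the kernel names are mine, not from the papers; it assumes the two column vectors q and a of length m are stored contiguously and that dot_kernel is launched with a single block of 256 threads):

// Hypothetical sketch of one MGS projection/subtraction step:
// dot_kernel computes r = q' * a with a shared-memory tree reduction,
// axpy_kernel then performs the element-wise update a = a - r * q.
__global__ void dot_kernel(const float *q, const float *a, int m, float *r) {
    __shared__ float cache[256];
    unsigned int tid = threadIdx.x;
    float sum = 0.0f;
    for (unsigned int i = tid; i < m; i += blockDim.x)  // block-stride loop
        sum += q[i] * a[i];
    cache[tid] = sum;
    __syncthreads();
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {  // tree reduction
        if (tid < s) cache[tid] += cache[tid + s];
        __syncthreads();
    }
    if (tid == 0) *r = cache[0];
}

__global__ void axpy_kernel(float *a, const float *q, const float *r, int m) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < m) a[i] -= (*r) * q[i];
}

One would launch dot_kernel<<<1, 256>>>(q_d, a_d, m, r_d) followed by axpy_kernel<<<(m + 255) / 256, 256>>>(a_d, q_d, r_d, m); a production version would replace the single-block reduction with a multi-block scheme or a cuBLAS dot product.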

QR decomposition via Householder reflections is dominated by matrix-vector operations (a toy serial sketch is given after the links below); you can find some information in paper a) and in the paper

d) Benchmarking GPUs to Tune Dense Linear Algebra

and an implementation by V. Volkov and J.W. Demmel is available at

LU, QR and Cholesky factorizations using GPU
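
To make the structure of a Householder step concrete, here is a toy serial sketch (my own, not taken from the Volkov/Demmel code) that reflects columns k..n-1 of a row-major m x n matrix A with leading dimension lda; the cost is dominated by the matrix-vector product w = A'v and the rank-1 update that follows, which is exactly what maps well to the GPU:

#include <math.h>

/* Toy sketch: apply one Householder reflection to columns k..n-1 of A.
 * v and w are caller-provided scratch vectors of length m and n. */
void householder_step(double *A, double *v, double *w,
                      int m, int n, int lda, int k) {
    int i, j;
    double alpha = 0.0, vnorm = 0.0;
    /* alpha = -sign(A(k,k)) * ||A(k:m-1,k)|| (sign choice avoids cancellation) */
    for (i = k; i < m; i++) alpha += A[i*lda + k] * A[i*lda + k];
    alpha = -copysign(sqrt(alpha), A[k*lda + k]);
    /* Householder vector v = x - alpha * e_k, zero above row k */
    for (i = 0; i < m; i++) v[i] = (i < k) ? 0.0 : A[i*lda + k];
    v[k] -= alpha;
    for (i = k; i < m; i++) vnorm += v[i] * v[i];
    if (vnorm == 0.0) return;
    /* w = A' * v : the dominant matrix-vector product */
    for (j = k; j < n; j++) {
        w[j] = 0.0;
        for (i = k; i < m; i++) w[j] += A[i*lda + j] * v[i];
    }
    /* A -= (2 / v'v) * v * w' : rank-1 update */
    for (i = k; i < m; i++)
        for (j = k; j < n; j++)
            A[i*lda + j] -= (2.0 / vnorm) * v[i] * w[j];
}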

Givens rotations do not appear to me to be very popular as a parallel approach to QR decomposition. Basically, each Givens rotation modifies two rows, so some parallelism is possible, for example via the Sameh-Kuck pattern, which allows up to n concurrent rotations; a small sketch of a single rotation follows the link below. You can find some information at

Benchmarking the NVIDIA 8800GTX with the CUDA Development Platform
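
For reference, a single Givens rotation is easy to write down; here is a toy serial sketch (my own) that zeroes A(i,k) against row i-1 of a row-major matrix with leading dimension lda. Independent row pairs can be rotated concurrently, which is what the Sameh-Kuck ordering exploits; on the GPU, 1/r would typically be computed with rsqrt(), as the comment below also notes.

#include <math.h>

/* Toy sketch: one Givens rotation that zeroes A(i,k) using row i-1. */
void givens_rotate(double *A, int n, int lda, int i, int k) {
    double a = A[(i-1)*lda + k], b = A[i*lda + k];
    double r = sqrt(a*a + b*b);
    if (r == 0.0) return;                  /* nothing to rotate */
    double c = a / r, s = b / r;           /* cosine and sine of the rotation */
    for (int j = k; j < n; j++) {          /* update both rows from column k */
        double t1 = A[(i-1)*lda + j], t2 = A[i*lda + j];
        A[(i-1)*lda + j] =  c * t1 + s * t2;
        A[i*lda + j]     = -s * t1 + c * t2;
    }
}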

A clear performance comparison between the different approaches as implemented on CUDA architectures is not yet available. Be aware that some of the material above concerns optimizations for "old" architectures, so further improvements could likely be achieved by re-optimizing for newer GPU generations.

Vitality
  • There are differences in numerical robustness. Householder reflections and Givens rotations are superior to MGS (modified Gram-Schmidt) in this regard. Givens rotations allow for the parallel processing of multiple pairs of rows according to various concurrency schemes, Sameh-Kuck being the simplest. For QR factorizations of small matrices that fit into shared memory, I have found Givens rotations using rsqrt() to be the fastest, on account of the simple code which allows high occupancy. – njuffa Sep 14 '13 at 16:35