MPI Block matrix multiplication

Question

I am trying to generate two matrices A&B of size n, partition them into s*s sub-matrices and after scattering them through the processors, perform a multiplication between the block matrices. I have been able to successfully generate and scatter the sub-matrices through the processors; however, I am stuck in performing multiplication on the sub-matrices at each processor. My code is pretty much similar to the one in the following post (the code in the answer section) but I modified it for two matrices: MPI partition matrix into blocks

Could you please tell me how I can modify this to perform multiplication?

I stayed with the same tags to be easier to follow up.

    #include <stdio.h>
    #include <stdlib.h>
    #include <mpi.h>
    #include <time.h>

    #define COLSa 10
    #define ROWSa 10

    #define COLSb 10
    #define ROWSb 10
    #define s 2

   int main(int argc, char **argv) {

    MPI_Init(&argc, &argv);
    int p, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    char i;
    char j;

    char a[ROWSa*COLSa];
    char b[ROWSb*COLSb];
    char c[ROWSa*COLSb];  // c=a*b

    const int NPROWS=s;  /* number of rows in _decomposition_ */
    const int NPCOLS=s;  /* number of cols in _decomposition_ */

    const int BLOCKROWSa = ROWSa/NPROWS;  /* number of rows in _block_ */
    const int BLOCKCOLSa = COLSa/NPCOLS; /* number of cols in _block_ */

    const int BLOCKROWSb = ROWSb/NPROWS;  /* number of rows in _block_ */
    const int BLOCKCOLSb= COLSb/NPCOLS; /* number of cols in _block_ */

    if (rank == 0) {

    for (int ii=0; ii<ROWSa*COLSa; ii++) {
         a[ii]=rand() %10 ;
    }

    for (int ii=0; ii<ROWSb*COLSb; ii++) {
         b[ii]=rand() %10 ;

       }
    }

    char BLa[BLOCKROWSa*BLOCKCOLSa];
    for (int ii=0; ii<BLOCKROWSa*BLOCKCOLSa; ii++)
    BLa[ii] = 0;

    char BLb[BLOCKROWSb*BLOCKCOLSb];
    for (int ii=0; ii<BLOCKROWSb*BLOCKCOLSb; ii++)
    BLb[ii] = 0;  

    char BLc[BLOCKROWSa*BLOCKCOLSb];
    for (int ii=0; ii<BLOCKROWSa*BLOCKCOLSb; ii++)
    BLc[ii] = 0; 

    MPI_Datatype blocktype;
    MPI_Datatype blocktype2;

    MPI_Type_vector(BLOCKROWSa, BLOCKCOLSa, COLSa, MPI_CHAR, &blocktype2);
    MPI_Type_vector(BLOCKROWSb, BLOCKCOLSb, COLSb, MPI_CHAR, &blocktype2);

    MPI_Type_create_resized( blocktype2, 0, sizeof(char), &blocktype);
    MPI_Type_commit(&blocktype);

    int dispsa[NPROWS*NPCOLS];
    int countsa[NPROWS*NPCOLS];
    int dispsb[NPROWS*NPCOLS];
    int countsb[NPROWS*NPCOLS];

    //*******************************Start Time Record****************//

    clock_t t;
    t=clock();

    for (int ii=0; ii<NPROWS; ii++) {
    for (int jj=0; jj<NPCOLS; jj++) {
    dispsa[ii*NPCOLS+jj] = ii*COLSa*BLOCKROWSa+jj*BLOCKCOLSa;
    countsa [ii*NPCOLS+jj] = 1;
        }
    }

    MPI_Scatterv(a, countsa, dispsa, blocktype, BLa, BLOCKROWSa*BLOCKCOLSa, MPI_CHAR, 0,   MPI_COMM_WORLD);


    for (int ii=0; ii<NPROWS; ii++) {
    for (int jj=0; jj<NPCOLS; jj++) {
    dispsb[ii*NPCOLS+jj] = ii*COLSb*BLOCKROWSb+jj*BLOCKCOLSb;
    countsb [ii*NPCOLS+jj] = 1;
         }
    }

    MPI_Scatterv(b, countsb, dispsb, blocktype, BLb, BLOCKROWSb*BLOCKCOLSb, MPI_CHAR, 0, MPI_COMM_WORLD);




     for (int proc=0; proc<p; proc++) {
        if (proc == rank) {

          printf("Rank = %d\n", rank);

                if (rank == 0) {
                  printf("Global matrix A : \n");

                   for (int ii=0; ii<ROWSa; ii++) {
                     for (int jj=0; jj<COLSa; jj++) {
                       printf("%3d ",(int)a[ii*COLSa+jj]);
                }
                        printf("\n");
            }
                 printf("\n");
            printf("Global matrix B : \n");

           for (int ii=0; ii<ROWSb; ii++) {
             for (int jj=0; jj<COLSb; jj++) {
              printf("%3d ",(int)b[ii*COLSb+jj]);
                }
         printf("\n");
            }
        printf("\n");
                  printf("Local Matrix A:\n");
              for (int ii=0; ii<BLOCKROWSa; ii++) {
                for (int jj=0; jj<BLOCKCOLSa; jj++) {
            printf("%3d ",(int)BLa[ii*BLOCKCOLSa+jj]);

                }

             printf("\n");
            }

           printf("\n");
              printf("Local Matrix B:\n");
                for (int ii=0; ii<BLOCKROWSb; ii++) {
                   for (int jj=0; jj<BLOCKCOLSb; jj++) {
                       printf("%3d ",(int)BLb[ii*BLOCKCOLSb+jj]);

                }

          printf("\n");
            }
                }


            printf("Local Matrix A:\n");
                    for (int ii=0; ii<BLOCKROWSa; ii++) {
                   for (int jj=0; jj<BLOCKCOLSa; jj++) {
                       printf("%3d ",(int)BLa[ii*BLOCKCOLSa+jj]);
                  }

             printf("\n");
            }

          printf("Local Matrix B:\n");
            for (int ii=0; ii<BLOCKROWSb; ii++) {
                for (int jj=0; jj<BLOCKCOLSb; jj++) {
                   printf("%3d ",(int)BLb[ii*BLOCKCOLSb+jj]);
                }

        printf("\n");
            }

  //**********************Multiplication***********************//

       for (int i = 0; i < BLOCKROWSa; i++) {
          for (j = 0; j < BLOCKCOLSb; j++) {

        for (k = 0; k < BLOCKCOLSb; k++) {  //I am considering square matrices with the same sizes
               BLc[i + j*BLOCKROWSa] += BLa[i + k*BLOCKROWSa]*BLb[k + BLOCKCOLb*j];
                  printf("%3d ",(int)BLc[i+j*BLOCKROWSa]);
                     }
    printf("\n");

                 }

      printf("\n");

             }

       }

      MPI_Barrier(MPI_COMM_WORLD);
    }

   MPI_Finalize();

   //**********************End Time Record************************//

    t=clock()-t;
     printf("It took %f seconds (%d clicks).\n",t,((float)t)/CLOCKS_PER_SEC);


       return 0;
     }

Except a missing `int k;` and a `BLc[i + j*BLOCKROWSa] += BLa[i + k*BLOCKROWSa]*BLb[k + BLOCKCOLb*j];` to turn into `BLc[i + j*BLOCKROWSa] += BLa[i + k*BLOCKROWSa]*BLb[k + BLOCKCOLSb*j];` (one `S` more), there is nothing particularly weird with your code, as long as you wish to perform multiplications between block matrices. Why do you think you are stuck ? Why are you unsatisfied with your code ? It worked with `mpicc main.c -o main -std=c99` and `mpirun -np 4 main`. — francis, Mar 16 '14 at 13:04
Hi Francis. Thanks for your comment and corrections. However, by this code I am not able to get one single matrix as a result of multiplication at each processors, for some reason I get 5 of them at each processor!! — THTH, Mar 16 '14 at 14:31
OK I managed to fix that part and now I get one single product outcome. But the multiplications are not correct! Something is wrong with the math! — THTH, Mar 16 '14 at 15:08
A minor addition, unrelated to your issue. You're using rand(), but you haven't initiated the seed with srand. So each run you're actually using a same matrix. You can add a ''srand(time(NULL)) '' to fix that one. — Hrishikesh, Aug 26 '14 at 12:28

score 2 · Accepted Answer · answered Mar 16 '14 at 15:04

2

To get the blocks back into the matrix on proc 0, you may use the "opposite" of MPI_Scatterv() which is called MPI_Gatherv() http://www.mpich.org/static/docs/latest/www3/MPI_Gatherv.html :

MPI_Gatherv(BLc, BLOCKROWSb*BLOCKCOLSb,MPI_CHAR, c, countsb, dispsb,blocktype, 0, MPI_COMM_WORLD);

if (rank == 0) {
    printf("Global matrix C : \n");

    for (int ii=0; ii<ROWSa; ii++) {
        for (int jj=0; jj<COLSa; jj++) {
            printf("%3d ",(int)c[ii*COLSa+jj]);
        }
        printf("\n");
    }
}

Keep in mind that you are performing blockwise multiplications, that is different from a matrix multiplication.

Bye,

Francis

answered Mar 16 '14 at 15:04

francis

9,525
2
25
41

Francis, Thanks a lot for this useful point. I just realized what the problem is. My matrix multiplication is actually right, but the elements are subtracted from 256 and displayed. I mean, for example instead of 135, I get (135-256)=-121 in the output. I tried to change the define the matrices a float instead of char, and also changed the MPI commands arguments correspondingly, but it still did not work. – THTH Mar 16 '14 at 18:37
@THTH : it will be the case as long as you use `char` ! – francis Mar 16 '14 at 18:40
I used float, and changed the MPI_CHAR to MPI_FLOAT in my MPI commands, but I did not get any output! – THTH Mar 16 '14 at 18:42
You elements are not subtracted from 256. You are simply using `signed char`s and those can usually hold values that range from -128 to 127. Values larger than 127 are interpreted as negative numbers in two's complement representation. Indeed, 135 is 121's two's complement. – Hristo Iliev Mar 16 '14 at 20:17
Thanks Hristo. So does it mean that if I use MPI_UNSIGNED_CHAR this will be fixed? – THTH Mar 16 '14 at 21:20
You won't get negative number by using `unsigned char`...But the result will not be correct due to overflow. Any result above 255 will be wrong. You can change `char` to `float` and `MPI_CHAR`to `MPI_FLOAT` (or at least to `int` and `MPI_INT`). – francis Mar 17 '14 at 18:46

MPI Block matrix multiplication

1 Answers1