Strassen-Winograd Algorithm

Question

I got a task to write a Strassen-Winograd algorithm in C++. I have written it twice, but the first version of my code doesn't work. The result is correct in the lower left corner of result matrix. And my second version is running slower than naive algorithm, even with N = 64+.

What am I doing wrong?

Important note: I'm not allowed to use dynamic matrix in recursion and structures. In addition, it is better to do the multiplication without copying, using the coordinates of the corner element of the submatrices.

Incorrect:

#include "pch.h"
#include<iostream>
#include<cstdio>
#include<conio.h>
#include<cstdlib>
#include<cmath>
#include<ctime>
#pragma comment(linker, "/STACK:5813243000")
using namespace std;
const int sizs = 256;

void vivod(int matrix[][256], int n);
void Matrix_Add(int a[][256], int b[][256], int c[][256], int n, int x1, int y1, int x2, int y2);
void Matrix_Sub(int a[][256], int b[][256], int c[][256], int n, int x1, int y1, int x2, int y2);
void Matrix_Multiply(int a[][256], int b[][256], int c[][256], int x1, int y1, int x2, int y2, int n);
void strassen(int a[][256], int b[][256], int c[][256], int m, int n, int x1, int y1, int x2, int y2);
void Naive_Multiply(int a[][256], int b[][256], int c[][256], int n)
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            c[i][j] = 0;
            for (int t = 0; t < n; t++) {
                c[i][j] = c[i][j] + a[i][t] * b[t][j];
            }
        }
    }
}
void Multiply(int a[][256], int b[][256], int c[][256], int x1, int y1, int x2, int y2)
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            c[i][j] = 0;
            for (int t = 0; t < 2; t++) {
                c[i][j] = c[i][j] + a[i][t] * b[t][j];
            }
        }
    }
}
int main()
{
    setlocale(LC_ALL, "Russian");
    int n;
    cout << "Введите число n:";
    cin >> n;
    const int m = 256;
    int A[m][m];
    int B[m][m];
    int C[m][m];
    int k[m][m];
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            A[i][j] = 0;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            B[i][j] = 0;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            A[i][j] = rand() % 10;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            B[i][j] = rand() % 10;
    cout << "First Matrix:" << endl;
    vivod(A, n);
    cout << "Second Matrix:" << endl;
    vivod(B, n);
    int begin = clock();
    //for (int i =0; i < 100; i++)
    Naive_Multiply(A, B, k, n);
    int end = clock();
    cout << "Naive Multiply time: " << end - begin  << endl;
    vivod(k, n);
    int begin2 = clock();
    //for (int i = 0; i < 100; i++)
    strassen(A, B, C, n, n, 0, 0, 0, 0);
    int end2 = clock();
    cout << "time: " << end2 - begin2 << endl;
    vivod(C, n);
    system("pause");
    return 0;
}

void strassen(int a[][256], int b[][256], int c[][256], int m, int n, int x1, int y1, int x2, int y2) {
    m = n / 2;
    if (m != 1)
    {
        int s1[sizs][sizs];
        int s2[sizs][sizs];
        int s3[sizs][sizs];
        int s4[sizs][sizs];
        int s5[sizs][sizs];
        int s6[sizs][sizs];
        int s7[sizs][sizs];
        int s8[sizs][sizs];
        int m1[sizs][sizs];
        int m2[sizs][sizs];
        int m3[sizs][sizs];
        int m4[sizs][sizs];
        int m5[sizs][sizs];
        int m6[sizs][sizs];
        int m7[sizs][sizs];
        int t1[sizs][sizs];
        int t2[sizs][sizs];
        int c11[sizs][sizs];
        int c12[sizs][sizs];
        int c21[sizs][sizs];
        int c22[sizs][sizs];
        Matrix_Add(a, a, s1, m, n - m, n - 2 * m, n - m, n - m);
        Matrix_Sub(s1, a, s2, m, 0, 0, n - 2 * m, n - 2 * m);
        Matrix_Sub(a, a, s3, m, n - 2 * m, n - 2 * m, n - m, n - 2 * m);
        Matrix_Sub(a, s2, s4, m, n - 2 * m, n - m, 0, 0);
        Matrix_Sub(b, b, s5, m, n - 2 * m, n - m, n - 2 * m, n - 2 * m);
        Matrix_Sub(b, s5, s6, m, n - m, n - m, 0, 0);
        Matrix_Sub(b, b, s7, m, n - m, n - m, n - 2 * m, n - m);
        Matrix_Sub(s6, b, s8, m, 0, 0, n - m, n - 2 * m);
        strassen(s2, s6, m1, m, m, 0, 0, 0, 0);
        strassen(a, b, m2, m, m, n - 2 * m, n - 2 * m, n - 2 * m, n - 2 * m);
        strassen(a, b, m3, m, m, n - 2 * m, n - m, n - m, n - 2 * m);
        strassen(s3, s7, m4, m, m, 0, 0, 0, 0);
        strassen(s1, s5, m5, m, m, 0, 0, 0, 0);
        strassen(s4, b, m6, m, m, 0, 0, n - m, n - m);
        strassen(a, s8, m7, m, m, n - m, n - m, 0, 0);

        Matrix_Add(m1, m2, t1, m, 0, 0, 0, 0);
        Matrix_Add(t1, m4, t2, m, 0, 0, 0, 0);
        Matrix_Add(m2, m3, c11, m, 0, 0, 0, 0);
        Matrix_Sub(t2, m7, c21, m, 0, 0, 0, 0);
        Matrix_Add(t1, m5, c12, m, 0, 0, 0, 0);
        Matrix_Add(c12, m6, c12, m, 0, 0, 0, 0);
        Matrix_Add(t2, m5, c22, m, 0, 0, 0, 0);
        for (int i = 0; i < n / 2; i++)
        {
            for (int j = 0; j < n / 2; j++)
            {
                c[i + n - 2 * m][j + n - 2 * m] = c11[i][j];
                c[i + n - 2 * m][j + n - m] = c12[i][j];
                c[i + n - m][j + n - 2 * m] = c21[i][j];
                c[i + n - m][j + n - m] = c22[i][j];
            }
        }
    }
    else
    {
        Matrix_Multiply(a, b, c, x1, y1, x2, y2, n);
    }
}
void vivod(int matrix[][256], int n)
{
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            cout << matrix[i][j] << " ";
        }
        cout << endl;
    }
    cout << endl;
}
void Matrix_Add(int a[][256], int b[][256], int c[][256], int n, int x1, int y1, int x2, int y2)
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            c[i][j] = a[i + x1][j + y1] + b[i + x2][j + y2];
        }
    }
}

void Matrix_Sub(int a[][256], int b[][256], int c[][256], int n, int x1, int y1, int x2, int y2)
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            c[i][j] = a[i + x1][j + y1] - b[i + x2][j + y2];
        }
    }
}
void Matrix_Multiply(int a[][256], int b[][256], int c[][256], int x1, int y1, int x2, int y2, int n)
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            c[i][j] = 0;
            for (int t = 0; t < n; t++) {
                c[i][j] = c[i][j] + a[x1 + i][y1 + t] * b[x2 + t][y2 + j];
            }
        }
    }
}

Slow, deadline default 1:

#include "pch.h"
#include <stdio.h>
#include <iostream>
#include<cstdlib>
#include<ctime>
#pragma comment(linker, "/STACK:44333338")
int deadline;
using namespace std;
const int n = 16;
void StrVin(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size);
void add(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size);
void sub(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size);
void Multiply(int a[n][n], int b[n][n], int c[n][n]);
void vivod(int a[n][n]);
void naivemultiplication(int a[n][n], int b[n][n], int c[n][n], int size);
int main()
{
    setlocale(LC_ALL, "Russian");
    cout << "Введите предел: ";
    cin >> deadline;
    int a[n][n];
    int b[n][n];
    int c[n][n];

    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            a[i][j] = rand() % 10;
            b[i][j] = rand() % 10;
            c[i][j] = 0;
        }
    }
    cout << "First Matrix" << endl;
    //vivod(a);
    cout << "Second Matrix" << endl;
    //vivod(b);
    cout << "Strassen-Winograd ";
    long double begin2 = clock();
    StrVin(a, b, c, n);
    long double end2 = clock();
    cout << "Time: " << (end2 - begin2) / CLOCKS_PER_SEC << endl;
    //vivod(c);
    cout << "Standart method ";
    long double begin1 = clock();
    naivemultiplication(a, b, c, n);
    long double end1 = clock();
    cout << "Time: " << (end1 - begin1) / CLOCKS_PER_SEC << endl;
    //vivod(c);
    system("pause");
    return 0;
}
void vivod(int a[n][n])
{
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
            cout << a[i][j] << " ";
        cout << endl;
    }
    cout << endl;
}

void add(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size)
{
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            C[i][j] = A[i][j] + B[i][j];
        }
    }
}

void sub(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size)
{

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            C[i][j] = A[i][j] - B[i][j];
        }
    }
}
void Multiply(int a[n][n], int b[n][n], int c[n][n])
{
    for (int i = 0; i < deadline*2; i++) {
        for (int j = 0; j < deadline*2; j++) {
            c[i][j] = 0;
            for (int t = 0; t < deadline*2; t++) {
                c[i][j] = c[i][j] + a[i][t] * b[t][j];
            }
        }
    }
}

void StrVin(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size)
{
    int half = size / 2;
    if (half != deadline)
    {
        int A11[n][n], A12[n][n], A21[n][n], A22[n][n], B11[n][n], B12[n][n], B21[n][n], B22[n][n], C11[n][n], C12[n][n], C21[n][n], C22[n][n];
        int S1[n][n], S2[n][n], S3[n][n], S4[n][n], S5[n][n], S6[n][n], S7[n][n], S8[n][n];
        int P1[n][n], P2[n][n], P3[n][n], P4[n][n], P5[n][n], P6[n][n], P7[n][n];
        int T1[n][n], T2[n][n];
        int K[n][n];
        for (int i = 0; i < half; i++) {
            for (int j = 0; j < half; j++) {
                A11[i][j] = A[i][j];
                A12[i][j] = A[i][j + half];
                A21[i][j] = A[i + half][j];
                A22[i][j] = A[i + half][j + half];
                B11[i][j] = B[i][j];
                B12[i][j] = B[i][j + half];
                B21[i][j] = B[i + half][j];
                B22[i][j] = B[i + half][j + half];
            }
        }
        for (int i = 0; i < half; i++)
        {
            for (int j = 0; j < half; j++)
            {
                C11[i][j] = C[i][j];
            }
        }
        for (int i = 0; i < half; i++)
        {
            for (int j = half; j < size; j++)
            {
                C12[i][j - half] = C[i][j];
            }
        }
        for (int i = half; i < size; i++)
        {
            for (int j = 0; j < half; j++)
            {
                C21[i - half][j] = C[i][j];
            }
        }
        for (int i = half; i < size; i++)
        {
            for (int j = half; j < size; j++)
            {
                C22[i - half][j - half] = C[i][j];
            }
        }
        add(A21, A22, S1, half);
        sub(S1, A11, S2, half);
        sub(A11, A21, S3, half);
        sub(A12, S2, S4, half);
        sub(B12, B11, S5, half);
        sub(B22, S5, S6, half);
        sub(B22, B12, S7, half);
        sub(S6, B21, S8, half);
        StrVin(S2, S6, P1, half);
        StrVin(A11, B11, P2, half);
        StrVin(A12, B21, P3, half);
        StrVin(S3, S7, P4, half);
        StrVin(S1, S5, P5, half);
        StrVin(S4, B22, P6, half);
        StrVin(A22, S8, P7, half);
        add(P1, P2, T1, half);
        add(T1, P4, T2, half);
        add(P2, P3, C11, half);
        add(T1, P5, K, half);
        add(K, P6, C12, half);
        sub(T2, P7, C21, half);
        add(T2, P5, C22, half);

        for (int i = 0; i < half; i++)
        {
            for (int j = 0; j < half; j++)
            {
                C[i][j] = C11[i][j];
                C[i][j + half] = C12[i][j];
                C[i + half][j] = C21[i][j];
                C[i + half][j + half] = C22[i][j];
            }
        }
    }
    else
    {
        Multiply(A, B, C);
    }
}

void naivemultiplication(int a[n][n], int b[n][n], int c[n][n], int size)
{
    for (int k = 0; k < size; k++)
    {
        for (int i = 0; i < size; i++)
        {
            c[k][i] = 0;
            for (int j = 0; j < size; j++)
            {
                c[k][i] += a[k][j] * b[j][i];
            }
        }
    }
}

ouch you renamed for nothing some functions and you changed the order of them into the code, this doesn't help us to compare the two versions of the code, think about to help us to help you ;-) — bruno, Dec 16 '18 at 14:17
I'm so sorry about that. Codes are not similar, they are based on two different principles, I did so to think differently. Compare them is meaningless. In addition, i am not pro, so i write primitive code without different strucutres etc. So it's easy to read it. — NoActualName, Dec 16 '18 at 14:25
The second code segment, which you say works correctly, would be better posted on the stack review site, except you can't really benchmark like this. You need to execute it many times in sequence, and under the same conditions, and measure the average time. Maybe it is slower simply because it runs first, and you have all the numbers in local cache when you do the naive multiplication. — Kenny Ostrom, Dec 16 '18 at 16:26
I have finnaly wrote the algorithm, but it's to slow like my second. Code and discription on review site: https://codereview.stackexchange.com/questions/209772/strassen-vinograde-alghorithm As you see, i'm running it after naive algorithm. The main problam, that it's requiers large amounts of data (recursiver creation of 256x256 arrays), and as a result we see that naive is faster. — NoActualName, Dec 16 '18 at 16:59
It looks like you have way too many temporary matrices, and they are all MAX size. Is that what you meant by no dynamic matrix? That is going to be a problem. — Kenny Ostrom, Dec 16 '18 at 18:00
Yep, it is a problem. I am not allowed to use dynamic arrays. But i also can't create static arrays of size n/2xn/2 every time. How can i solve this problam. Probably i can save the resalt by using "corner method", but i don't imagine how to implement it. — NoActualName, Dec 16 '18 at 18:17

score 0 · Answer 1 · edited Jan 17 '22 at 14:05

While it is asymptotically faster than a classical matrix-matrix product, even a very well written Strassen Winograd algorithm won't be faster than a naïve implementation for small sizes (and on modern architectures 64×64 is very small). This is because there is considerable overhead of n×n matrix additions during the recursion.

That is why all optimized implementations have a cut-off size under which they will switch to an optimized classical matrix-matrix product. The cut-off size depends strongly on the architecture and the quality of the underlying algorithm, but it can be for n way above 512.

If you want to make a serious implementation, I recommend reading some detailed description on the algorithm (you can start at wikipedia, then continue with their references). If you are doing this as a toy-project, at least stop the recursion under a certain threshold (which you need to tune), and try bigger sizes until you find measurable performance gains.

Regarding memory allocation, since you are allowed to allocate memory in the main function, you can compute an upper bound of memory required for temporary matrices as T*N^2*(1/4+1/16+...) (where T is the number of temporaries per recursion depth and N is the size of the matrix) and re-use that memory at each recursion step. To simplify things, start with just one recursion step which immediately switches to a classical algorithm, until you manage to get any speed improvements — and again, read existing literature about implementation details of the algorithm.

Alright, and the main question, can the algorithm slow down due to large memory allocation for results of s1, s2, etc.? — NoActualName, Dec 17 '18 at 11:54
I added a paragraph about memory. I think in your implementation all allocations are on the stack, which is "almost" free (but for dynamic sizes it is not standard c/c++) — chtz, Dec 17 '18 at 13:23

Strassen-Winograd Algorithm

1 Answers1

Linked