-3
#include <iostream>
#include <chrono>
using namespace std;
int main()
{
    const unsigned int m = 200;
    const unsigned int n = 200;
    srand(static_cast<unsigned int>(static_cast<std::chrono::duration<double>
    >(std::chrono::high_resolution_clock::now().time_since_epoch()).count()));
    double** matrixa;
    double** matrixb;
    double** matrixc;
    matrixa = new double* [m];
    matrixb = new double* [m];
    matrixc = new double* [m];
    unsigned int max = static_cast<unsigned int>(1u << 31);
    for (unsigned int i = 0; i < m; i++)
        matrixa[i] = new double[n];
    for (unsigned int i = 0; i < m; i++)
        matrixb[i] = new double[n];
    for (unsigned int i = 0; i < m; i++)
        matrixc[i] = new double[n];
    for (unsigned int i = 0; i < m; i++)
        for (unsigned int j = 0; j < n; j++)
            matrixa[i]
            [j] = static_cast<double>(static_cast<double>(rand()) / max * 10);
    for (unsigned int i = 0; i < m; i++)
        for (unsigned int j = 0; j < n; j++)
            matrixb[i]
            [j] = static_cast<double>(static_cast<double>(rand()) / max * 10);
    auto start = std::chrono::high_resolution_clock::now();
    for (unsigned int i = 0; i < m; i++)
        for (unsigned int j = 0; j < n; j++)
            for (unsigned int k = 0; k < m; k++)
                for (unsigned int l = 0; l < m; l++)
                    matrixc[i][j] += matrixa[k][l] * matrixb[l][k];
    auto stop = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> time_diff = stop - start;
    cout << "Czas wykonania programu " << time_diff.count() << " sekund." <<
        endl;
    for (unsigned int i = 0; i < m; i++)
        delete[] matrixa[i];
    for (unsigned int i = 0; i < m; i++)
        delete[] matrixb[i];
    for (unsigned int i = 0; i < m; i++)
        delete[] matrixc[i];
    delete[] matrixa;
    delete[] matrixb;
    delete[] matrixc;
    return 0;
}

I have this code and I would like to optimize it, unfortunately I have absolutely no idea how to go about it. Maybe someone has an idea and would like to help me? I got to the point where the program for 400 arrays executes 105 seconds but it is still too much, I would like to optimize this code to run faster. I found OpenMP library and thread class but I don't know how to use it in my program.

  • I think you need an optimization solution, not a problem... – ΦXocę 웃 Пepeúpa ツ Jun 09 '21 at 18:14
  • 1
    Arrays of arrays can scatter the memory used throughout storage making cache less effective. Sometimes orders of magnitude less effective. Make one big 1D array and pretend that it's 2D. [Here's an example.](https://stackoverflow.com/a/2076668/4581301) – user4581301 Jun 09 '21 at 18:14
  • 1
    Enlight us and please explain what this code suppose to do. Also if you add `-O3` flag it will execute immediately (since your code do not have observable results). – Marek R Jun 09 '21 at 18:15
  • 1
    You're going to have out-of-range problems if m!=n. And instead of `rand` (which, on some systems, only returns 15 bits) use the C++ `` header. – 1201ProgramAlarm Jun 09 '21 at 18:18
  • You appear to be doing O(m²n²) many calculations, which in this case is ~1.6 billion. Without knowing if there's a way to bring down the asymptotic complexity it seems hard to do much to shave that down. – Nathan Pierson Jun 09 '21 at 18:20
  • 1
    Side note: `high_resolution_clock` shouldn't be used for timing. It might alias `system_clock` and be vulnerable to changes to the system time. Clock resynchronization and daylight saving time changes are classics for screwing up measurements. Use `steady_clock` instead. [Here's the advice of one of the people most responsible for `chono`](https://stackoverflow.com/a/37440647/4581301). – user4581301 Jun 09 '21 at 18:21
  • 1
    The inner k & l loops don't depend on the i or j values (except for the initial value in matrixc, which you don't initialize) so you could run the k/l loops once, then apply that result to all the elements of matrixc. – 1201ProgramAlarm Jun 09 '21 at 18:25
  • The code would be somewhat simpler if those matrices were defined directly: `double matrix[m][n];` etc. – Pete Becker Jun 09 '21 at 19:48
  • Basically the four-times nested loop has a complexity of n*m^3. O(n^3) isn't efficient. – U. Windl Jun 10 '21 at 12:19

1 Answers1

0

Firstly, your matrix multiply algorithm is over complex than a normal one(Or it's just wrong), you may reference the wiki for a typical algorithm:

Input: matrices A and B

Let C be a new matrix of the appropriate size

For i from 1 to n:

For j from 1 to p:

Let sum = 0

For k from 1 to m:

Set sum ← sum + Aik × Bkj

Set Cij ← sum

Return C

There is a critical bug in your code, you haven't initialized the result matrix.

So the fixed code may like this:

#include <chrono>
#include <iostream>
using namespace std;
int main() {
  const unsigned int m = 200;
  const unsigned int n = 201;
  const unsigned int p = 202;

  srand(static_cast<unsigned int>(
      static_cast<std::chrono::duration<double> >(
          std::chrono::high_resolution_clock::now().time_since_epoch())
          .count()));
  double** matrixa;
  double** matrixb;
  double** matrixc;
  matrixa = new double*[m];
  matrixb = new double*[n];
  matrixc = new double*[m];
  unsigned int max = static_cast<unsigned int>(1u << 31);
  for (unsigned int i = 0; i < m; i++) matrixa[i] = new double[n];
  for (unsigned int i = 0; i < n; i++) matrixb[i] = new double[p];
  for (unsigned int i = 0; i < m; i++) {
    matrixc[i] = new double[p];
    std::fill(matrixc[i], matrixc[i] + p, 0.0);
  }
  for (unsigned int i = 0; i < m; i++)
    for (unsigned int j = 0; j < n; j++)
      matrixa[i][j] =
          static_cast<double>(static_cast<double>(rand()) / max * 10);
  for (unsigned int i = 0; i < n; i++)
    for (unsigned int j = 0; j < p; j++)
      matrixb[i][j] =
          static_cast<double>(static_cast<double>(rand()) / max * 10);
  auto start = std::chrono::high_resolution_clock::now();
  for (unsigned int i = 0; i < m; i++)
    for (unsigned int j = 0; j < p; j++)
      for (unsigned int k = 0; k < n; k++)
        matrixc[i][j] += matrixa[i][k] * matrixb[k][j];
  auto stop = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> time_diff = stop - start;
  cout << "Czas wykonania programu " << time_diff.count() << " sekund." << endl;

  for (unsigned int i = 0; i < m; i++) delete[] matrixa[i];
  for (unsigned int i = 0; i < n; i++) delete[] matrixb[i];
  for (unsigned int i = 0; i < m; i++) delete[] matrixc[i];
  delete[] matrixa;
  delete[] matrixb;
  delete[] matrixc;
  return 0;
}

Now it's much faster than the one in question.

It still can be faster with slightly modification:

#include <chrono>
#include <iostream>
using namespace std;
int main() {
  const unsigned int m = 200;
  const unsigned int n = 201;
  const unsigned int p = 202;

  srand(static_cast<unsigned int>(
      static_cast<std::chrono::duration<double> >(
          std::chrono::high_resolution_clock::now().time_since_epoch())
          .count()));
  double** matrixa;
  double** matrixb;
  double** matrixc;
  matrixa = new double*[m];
  matrixb = new double*[n];
  matrixc = new double*[m];
  unsigned int max = static_cast<unsigned int>(1u << 31);
  for (unsigned int i = 0; i < m; i++) matrixa[i] = new double[n];
  for (unsigned int i = 0; i < n; i++) matrixb[i] = new double[p];
  for (unsigned int i = 0; i < m; i++) {
    matrixc[i] = new double[p];
    std::fill(matrixc[i], matrixc[i] + p, 0.0);
  }
  for (unsigned int i = 0; i < m; i++)
    for (unsigned int j = 0; j < n; j++)
      matrixa[i][j] =
          static_cast<double>(static_cast<double>(rand()) / max * 10);
  for (unsigned int i = 0; i < n; i++)
    for (unsigned int j = 0; j < p; j++)
      matrixb[i][j] =
          static_cast<double>(static_cast<double>(rand()) / max * 10);
  auto start = std::chrono::high_resolution_clock::now();
  for (unsigned int i = 0; i < m; i++)
    for (unsigned int k = 0; k < n; k++)
      for (unsigned int j = 0; j < p; j++)
        matrixc[i][j] += matrixa[i][k] * matrixb[k][j];
  auto stop = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> time_diff = stop - start;
  cout << "Czas wykonania programu " << time_diff.count() << " sekund." << endl;

  for (unsigned int i = 0; i < m; i++) delete[] matrixa[i];
  for (unsigned int i = 0; i < n; i++) delete[] matrixb[i];
  for (unsigned int i = 0; i < m; i++) delete[] matrixc[i];
  delete[] matrixa;
  delete[] matrixb;
  delete[] matrixc;
  return 0;
}

This code is more cache-friendly, the explanation can be found here.

The code can still be improved by a parallel algorithm, to speed up the previous code with OpenMP, with only one line change:

Add we need to add the build option -fopenmp to compile it.

#include <chrono>
#include <iostream>
using namespace std;
int main() {
  const unsigned int m = 200;
  const unsigned int n = 201;
  const unsigned int p = 202;

  srand(static_cast<unsigned int>(
      static_cast<std::chrono::duration<double> >(
          std::chrono::high_resolution_clock::now().time_since_epoch())
          .count()));
  double** matrixa;
  double** matrixb;
  double** matrixc;
  matrixa = new double*[m];
  matrixb = new double*[n];
  matrixc = new double*[m];
  unsigned int max = static_cast<unsigned int>(1u << 31);
  for (unsigned int i = 0; i < m; i++) matrixa[i] = new double[n];
  for (unsigned int i = 0; i < n; i++) matrixb[i] = new double[p];
  for (unsigned int i = 0; i < m; i++) {
    matrixc[i] = new double[p];
    std::fill(matrixc[i], matrixc[i] + p, 0.0);
  }
  for (unsigned int i = 0; i < m; i++)
    for (unsigned int j = 0; j < n; j++)
      matrixa[i][j] =
          static_cast<double>(static_cast<double>(rand()) / max * 10);
  for (unsigned int i = 0; i < n; i++)
    for (unsigned int j = 0; j < p; j++)
      matrixb[i][j] =
          static_cast<double>(static_cast<double>(rand()) / max * 10);
  auto start = std::chrono::high_resolution_clock::now();
#pragma omp parallel for
  for (unsigned int i = 0; i < m; i++)
    for (unsigned int k = 0; k < n; k++)
      for (unsigned int j = 0; j < p; j++)
        matrixc[i][j] += matrixa[i][k] * matrixb[k][j];
  auto stop = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> time_diff = stop - start;
  cout << "Czas wykonania programu " << time_diff.count() << " sekund." << endl;

  for (unsigned int i = 0; i < m; i++) delete[] matrixa[i];
  for (unsigned int i = 0; i < n; i++) delete[] matrixb[i];
  for (unsigned int i = 0; i < m; i++) delete[] matrixc[i];
  delete[] matrixa;
  delete[] matrixb;
  delete[] matrixc;
  return 0;
}

It would be better to use std::vector rather than dynamically allocated arrays, the work is left for you.

prehistoricpenguin
  • 6,130
  • 3
  • 25
  • 42
  • Thank you very much, I have one more question because I have to rewrite the same code using the thread function and then the POSIX Threads function and through the fork() function someone can help me with this? – Michał Sałbut Jun 11 '21 at 12:48