Firstly, your matrix multiply algorithm is over complex than a normal one(Or it's just wrong), you may reference the wiki for a typical algorithm:
Input: matrices A and B
Let C be a new matrix of the appropriate size
For i from 1 to n:
For j from 1 to p:
Let sum = 0
For k from 1 to m:
Set sum ← sum + Aik × Bkj
Set Cij ← sum
Return C
There is a critical bug in your code, you haven't initialized the result matrix.
So the fixed code may like this:
#include <chrono>
#include <iostream>
using namespace std;
int main() {
const unsigned int m = 200;
const unsigned int n = 201;
const unsigned int p = 202;
srand(static_cast<unsigned int>(
static_cast<std::chrono::duration<double> >(
std::chrono::high_resolution_clock::now().time_since_epoch())
.count()));
double** matrixa;
double** matrixb;
double** matrixc;
matrixa = new double*[m];
matrixb = new double*[n];
matrixc = new double*[m];
unsigned int max = static_cast<unsigned int>(1u << 31);
for (unsigned int i = 0; i < m; i++) matrixa[i] = new double[n];
for (unsigned int i = 0; i < n; i++) matrixb[i] = new double[p];
for (unsigned int i = 0; i < m; i++) {
matrixc[i] = new double[p];
std::fill(matrixc[i], matrixc[i] + p, 0.0);
}
for (unsigned int i = 0; i < m; i++)
for (unsigned int j = 0; j < n; j++)
matrixa[i][j] =
static_cast<double>(static_cast<double>(rand()) / max * 10);
for (unsigned int i = 0; i < n; i++)
for (unsigned int j = 0; j < p; j++)
matrixb[i][j] =
static_cast<double>(static_cast<double>(rand()) / max * 10);
auto start = std::chrono::high_resolution_clock::now();
for (unsigned int i = 0; i < m; i++)
for (unsigned int j = 0; j < p; j++)
for (unsigned int k = 0; k < n; k++)
matrixc[i][j] += matrixa[i][k] * matrixb[k][j];
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> time_diff = stop - start;
cout << "Czas wykonania programu " << time_diff.count() << " sekund." << endl;
for (unsigned int i = 0; i < m; i++) delete[] matrixa[i];
for (unsigned int i = 0; i < n; i++) delete[] matrixb[i];
for (unsigned int i = 0; i < m; i++) delete[] matrixc[i];
delete[] matrixa;
delete[] matrixb;
delete[] matrixc;
return 0;
}
Now it's much faster than the one in question.
It still can be faster with slightly modification:
#include <chrono>
#include <iostream>
using namespace std;
int main() {
const unsigned int m = 200;
const unsigned int n = 201;
const unsigned int p = 202;
srand(static_cast<unsigned int>(
static_cast<std::chrono::duration<double> >(
std::chrono::high_resolution_clock::now().time_since_epoch())
.count()));
double** matrixa;
double** matrixb;
double** matrixc;
matrixa = new double*[m];
matrixb = new double*[n];
matrixc = new double*[m];
unsigned int max = static_cast<unsigned int>(1u << 31);
for (unsigned int i = 0; i < m; i++) matrixa[i] = new double[n];
for (unsigned int i = 0; i < n; i++) matrixb[i] = new double[p];
for (unsigned int i = 0; i < m; i++) {
matrixc[i] = new double[p];
std::fill(matrixc[i], matrixc[i] + p, 0.0);
}
for (unsigned int i = 0; i < m; i++)
for (unsigned int j = 0; j < n; j++)
matrixa[i][j] =
static_cast<double>(static_cast<double>(rand()) / max * 10);
for (unsigned int i = 0; i < n; i++)
for (unsigned int j = 0; j < p; j++)
matrixb[i][j] =
static_cast<double>(static_cast<double>(rand()) / max * 10);
auto start = std::chrono::high_resolution_clock::now();
for (unsigned int i = 0; i < m; i++)
for (unsigned int k = 0; k < n; k++)
for (unsigned int j = 0; j < p; j++)
matrixc[i][j] += matrixa[i][k] * matrixb[k][j];
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> time_diff = stop - start;
cout << "Czas wykonania programu " << time_diff.count() << " sekund." << endl;
for (unsigned int i = 0; i < m; i++) delete[] matrixa[i];
for (unsigned int i = 0; i < n; i++) delete[] matrixb[i];
for (unsigned int i = 0; i < m; i++) delete[] matrixc[i];
delete[] matrixa;
delete[] matrixb;
delete[] matrixc;
return 0;
}
This code is more cache-friendly, the explanation can be found here.
The code can still be improved by a parallel algorithm, to speed up the previous code with OpenMP, with only one line change:
Add we need to add the build option -fopenmp
to compile it.
#include <chrono>
#include <iostream>
using namespace std;
int main() {
const unsigned int m = 200;
const unsigned int n = 201;
const unsigned int p = 202;
srand(static_cast<unsigned int>(
static_cast<std::chrono::duration<double> >(
std::chrono::high_resolution_clock::now().time_since_epoch())
.count()));
double** matrixa;
double** matrixb;
double** matrixc;
matrixa = new double*[m];
matrixb = new double*[n];
matrixc = new double*[m];
unsigned int max = static_cast<unsigned int>(1u << 31);
for (unsigned int i = 0; i < m; i++) matrixa[i] = new double[n];
for (unsigned int i = 0; i < n; i++) matrixb[i] = new double[p];
for (unsigned int i = 0; i < m; i++) {
matrixc[i] = new double[p];
std::fill(matrixc[i], matrixc[i] + p, 0.0);
}
for (unsigned int i = 0; i < m; i++)
for (unsigned int j = 0; j < n; j++)
matrixa[i][j] =
static_cast<double>(static_cast<double>(rand()) / max * 10);
for (unsigned int i = 0; i < n; i++)
for (unsigned int j = 0; j < p; j++)
matrixb[i][j] =
static_cast<double>(static_cast<double>(rand()) / max * 10);
auto start = std::chrono::high_resolution_clock::now();
#pragma omp parallel for
for (unsigned int i = 0; i < m; i++)
for (unsigned int k = 0; k < n; k++)
for (unsigned int j = 0; j < p; j++)
matrixc[i][j] += matrixa[i][k] * matrixb[k][j];
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> time_diff = stop - start;
cout << "Czas wykonania programu " << time_diff.count() << " sekund." << endl;
for (unsigned int i = 0; i < m; i++) delete[] matrixa[i];
for (unsigned int i = 0; i < n; i++) delete[] matrixb[i];
for (unsigned int i = 0; i < m; i++) delete[] matrixc[i];
delete[] matrixa;
delete[] matrixb;
delete[] matrixc;
return 0;
}
It would be better to use std::vector
rather than dynamically allocated arrays, the work is left for you.