!!! HOMEWORK - ASSIGNMENT !!!
Please do not post code as I would like to complete myself but rather if possible point me in the right direction with general information or by pointing out mistakes in thought or other possible useful and relevant resources.
I have a method that creates my square npages * npages
matrix hat of double
for use in my pagerank algorithm.
I have made it with pthreads, SIMD and with both pthreads and SIMD. I have used xcode instruments time profiler and found that the pthreads only version is the fastest, next is the SIMD only version and slowest is the version with both SIMD and pthreads.
As it is homework it can be run on multiple different machines however we were given the header #include so it is to be assumed we can use upto AVX at least. We are given how many threads the program will use as the argument to the program and store it in a global variable g_nthreads
.
In my tests I have been testing it on my machine which is an IvyBridge with 4 hardware cores and 8 logical cores and I've been testing it with 4 threads as an arguments and with 8 threads as an argument.
RUNNING TIMES:
SIMD ONLY:
*331ms - for consturct_matrix_hat function *
PTHREADS ONLY (8 threads):
70ms - each thread concurrently
SIMD & PTHREADS (8 threads):
110ms - each thread concurrently
What am I doing that is slowing it down more when using both forms of optimisation?
I will post each implementation:
All versions share these macros:
#define BIG_CHUNK (g_n2/g_nthreads)
#define SMALL_CHUNK (g_npages/g_nthreads)
#define MOD BIG_CHUNK - (BIG_CHUNK % 4)
#define IDX(a, b) ((a * g_npages) + b)
Pthreads:
// struct used for passing arguments
typedef struct {
double* restrict m;
double* restrict m_hat;
int t_id;
char padding[44];
} t_arg_matrix_hat;
// Construct matrix_hat with pthreads
static void* pthread_construct_matrix_hat(void* arg) {
t_arg_matrix_hat* t_arg = (t_arg_matrix_hat*) arg;
// set coordinate limits thread is able to act upon
size_t start = t_arg->t_id * BIG_CHUNK;
size_t end = t_arg->t_id + 1 != g_nthreads ? (t_arg->t_id + 1) * BIG_CHUNK : g_n2;
// Initialise coordinates with given uniform value
for (size_t i = start; i < end; i++) {
t_arg->m_hat[i] = ((g_dampener * t_arg->m[i]) + HAT);
}
return NULL;
}
// Construct matrix_hat
double* construct_matrix_hat(double* matrix) {
double* matrix_hat = malloc(sizeof(double) * g_n2);
// create structs to send and retrieve matrix and value from threads
t_arg_matrix_hat t_args[g_nthreads];
for (size_t i = 0; i < g_nthreads; i++) {
t_args[i] = (t_arg_matrix_hat) {
.m = matrix,
.m_hat = matrix_hat,
.t_id = i
};
}
// create threads and send structs with matrix and value to divide the matrix and
// initialise the coordinates with the given value
pthread_t threads[g_nthreads];
for (size_t i = 0; i < g_nthreads; i++) {
pthread_create(threads + i, NULL, pthread_construct_matrix_hat, t_args + i);
}
// join threads after all coordinates have been intialised
for (size_t i = 0; i < g_nthreads; i++) {
pthread_join(threads[i], NULL);
}
return matrix_hat;
}
SIMD:
// Construct matrix_hat
double* construct_matrix_hat(double* matrix) {
double* matrix_hat = malloc(sizeof(double) * g_n2);
double dampeners[4] = {g_dampener, g_dampener, g_dampener, g_dampener};
__m256d b = _mm256_loadu_pd(dampeners);
// Use simd to subtract values from each other
for (size_t i = 0; i < g_mod; i += 4) {
__m256d a = _mm256_loadu_pd(matrix + i);
__m256d res = _mm256_mul_pd(a, b);
_mm256_storeu_pd(&matrix_hat[i], res);
}
// Subtract values from each other that weren't included in simd
for (size_t i = g_mod; i < g_n2; i++) {
matrix_hat[i] = g_dampener * matrix[i];
}
double hats[4] = {HAT, HAT, HAT, HAT};
b = _mm256_loadu_pd(hats);
// Use simd to raise each value to the power 2
for (size_t i = 0; i < g_mod; i += 4) {
__m256d a = _mm256_loadu_pd(matrix_hat + i);
__m256d res = _mm256_add_pd(a, b);
_mm256_storeu_pd(&matrix_hat[i], res);
}
// Raise each value to the power 2 that wasn't included in simd
for (size_t i = g_mod; i < g_n2; i++) {
matrix_hat[i] += HAT;
}
return matrix_hat;
}
Pthreads & SIMD:
// struct used for passing arguments
typedef struct {
double* restrict m;
double* restrict m_hat;
int t_id;
char padding[44];
} t_arg_matrix_hat;
// Construct matrix_hat with pthreads
static void* pthread_construct_matrix_hat(void* arg) {
t_arg_matrix_hat* t_arg = (t_arg_matrix_hat*) arg;
// set coordinate limits thread is able to act upon
size_t start = t_arg->t_id * BIG_CHUNK;
size_t end = t_arg->t_id + 1 != g_nthreads ? (t_arg->t_id + 1) * BIG_CHUNK : g_n2;
size_t leftovers = start + MOD;
__m256d b1 = _mm256_loadu_pd(dampeners);
//
for (size_t i = start; i < leftovers; i += 4) {
__m256d a1 = _mm256_loadu_pd(t_arg->m + i);
__m256d r1 = _mm256_mul_pd(a1, b1);
_mm256_storeu_pd(&t_arg->m_hat[i], r1);
}
//
for (size_t i = leftovers; i < end; i++) {
t_arg->m_hat[i] = dampeners[0] * t_arg->m[i];
}
__m256d b2 = _mm256_loadu_pd(hats);
//
for (size_t i = start; i < leftovers; i += 4) {
__m256d a2 = _mm256_loadu_pd(t_arg->m_hat + i);
__m256d r2 = _mm256_add_pd(a2, b2);
_mm256_storeu_pd(&t_arg->m_hat[i], r2);
}
//
for (size_t i = leftovers; i < end; i++) {
t_arg->m_hat[i] += hats[0];
}
return NULL;
}
// Construct matrix_hat
double* construct_matrix_hat(double* matrix) {
double* matrix_hat = malloc(sizeof(double) * g_n2);
// create structs to send and retrieve matrix and value from threads
t_arg_matrix_hat t_args[g_nthreads];
for (size_t i = 0; i < g_nthreads; i++) {
t_args[i] = (t_arg_matrix_hat) {
.m = matrix,
.m_hat = matrix_hat,
.t_id = i
};
}
// create threads and send structs with matrix and value to divide the matrix and
// initialise the coordinates with the given value
pthread_t threads[g_nthreads];
for (size_t i = 0; i < g_nthreads; i++) {
pthread_create(threads + i, NULL, pthread_construct_matrix_hat, t_args + i);
}
// join threads after all coordinates have been intialised
for (size_t i = 0; i < g_nthreads; i++) {
pthread_join(threads[i], NULL);
}
return matrix_hat;
}