0

I am trying to optimize the best I can a DAXPY procedure. However, there is a mistake in the following code that I cannot spot:

#include <stdio.h>
#include <time.h>

void daxpy(int n, double a, double *x, double *y) {
    int i;
    double y0, y1, y2, y3, x0, x1, x2, x3;
    // loop unrolling
    for (i = 0; i < n; i += 4) {
        // multiple accumulating registers
        x0 = x[i];
        x1 = x[i + 1];
        x2 = x[i + 2];
        x3 = x[i + 3];

        y0 = y[i];
        y1 = y[i + 1];
        y2 = y[i + 2];
        y3 = y[i + 3];

        y0 += a * x0;
        y1 += a * x1;
        y2 += a * x2;
        y3 += a * x3;

        y[i] = y0;
        y[i + 1] = y1;
        y[i + 2] = y2;
        y[i + 3] = y3;
    }
}
int main() {
    int n = 10000000;
    double a = 2.0;
    double x[n];
    double y[n];
    int i;
    for (i = 0; i < n; ++i) {
        x[i] = (double)i;
        y[i] = (double)i;
    }
    clock_t start = clock();
    daxpy(n, a, x, y);
    clock_t end = clock();
    double time_elapsed = (double)(end - start) / CLOCKS_PER_SEC;
    printf("Time elapsed: %f seconds\n", time_elapsed);
    return 0;
}

When executing the binary I am getting the following error message:

Segmentation fault: 11

Using LLDB I am getting:

Process 89219 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=1, address=0x7ff7bb2b2cc0)
    frame #0: 0x0000000100003e4f daxpy.x`main at daxpy.c:38:14
   35       double y[n];
   36       int i;
   37       for (i = 0; i < n; ++i) {
-> 38           x[i] = (double)i;
   39           y[i] = (double)i;
   40       }
   41       clock_t start = clock();
Target 0: (daxpy.x) stopped.

But I can't spot what is the mistake on lines 38 and 39. Any help?

user3116936
  • 492
  • 3
  • 21

1 Answers1

3

Assuming sizeof(double) == 8 you try to allocate 160MB on the stack. The stack is not usually as big.

You need to allocate it from the heap

    size_t n = 10000000;
    double a = 2.0;
    double *x = malloc(sizeof(*x) * n);
    double *y = malloc(sizeof(*y) * n);

for (i = 0; i < n; where n is the array size.

What happens if you

        y1 = y[i + 1];
        y2 = y[i + 2];
        y3 = y[i + 3];

You may access the array outside the bounds.

0___________
  • 60,014
  • 4
  • 34
  • 74
  • 3
    This is not an issue if `n` is divisible by 4 (which it is in OP's code). Also, the segfault occurred before entering that loop. – interjay Jan 17 '23 at 11:46