error double free or corruption when using intrinsics

Question

The code below is for checking the correctness of add_prod. I wrote the code inside add_prod. It makes the calculation right but shows an error double free or corruption (!prev). Aborted (core dumped). Using valgrind, it prints

==2462== Invalid write of size 8
==2462==    at 0x1097A5: _mm_storeu_si128 (emmintrin.h:727)

Can anyone help me why this error?

static void add_prod(const short* src, short* dst, short x, int n) {
    __m128i _src, _dst,
            _scalar = _mm_set_epi16(x,x,x,x,x,x,x,x);
    
    for(int i = 0; i < n; i += 8) {
        _src = _mm_loadu_si128((const __m128i*) (src+i));
        _dst = _mm_loadu_si128((const __m128i*) (dst+i));
        
        _src = _mm_mullo_epi16(_src, _scalar);
        _dst = _mm_add_epi16(_src, _dst);
        
        _mm_storeu_si128((__m128i*) (dst+i), _dst);
    }
        
}

#define N1 1001
#define M1 1
#define N2 16
#define M2 100000

void matmul(const short** a, const short** b, short** c, int n) {
    int i, j, k;
    for (i=0; i<n; ++i)
        for (j=0; j<n; ++j) c[i][j] = 0;
    for (i=0; i<n; ++i)
        for (k=0; k<n; ++k)
            add_prod(b[k], c[i], a[i][k], n);
}

static long mat_sum(const short** m, int n) {
    int i, j;
    long sum = 0;
    for (i=0; i<n; ++i)
        for (j=0; j<n; ++j) sum += m[i][j];        
    return sum;
}

static short** alloc_mat(int n) {
    int i;
    short** m = malloc(n*sizeof(short*));
    assert(m != NULL);
    for (i=0; i<n; ++i) {
        m[i] = malloc(n*sizeof(short));
        assert(m[i] != NULL);
    }
    return m;
}

static void init_mat(short** m, int n, int max) {
    int i, j;
    for (i=0; i<n; ++i) 
        for (j=0; j<n; ++j) m[i][j] = 1 + (i+j) % max; 
}

static void free_mat(short** m, int n) {
    int i;
    for (i=0; i<n; ++i) free(m[i]);
    free(m);
}

static int do_test(const short** a, const short** b, short** c, 
                    int n, int m, int test_no) {

    double start, tseq, tsse;
    int i;
    long rseq, rsse;

    printf("\nTest #%d\n", test_no);

    // sequential
    start = get_real_time();
    for (i=0; i<m; ++i) matmul_seq(a, b, c, n);
    tseq  = get_real_time()-start;
    rseq = mat_sum((const short**)c, n);

    // SSE
    start = get_real_time();
    for (i=0; i<m; ++i) matmul(a, b, c, n);
    tsse  = get_real_time()-start;
    rsse = mat_sum((const short**)c, n);

    printf("- result: %ld [expected: %ld]\n", rsse, rseq);
    printf("- sequential version: %.2f msec\n", tseq*1000);
    printf("- SSE version: %.2f msec\n", tsse*1000);
    printf("- speedup: %.2fx\n", tseq/(tsse==0.0 ? 1E-9 : tsse));

    return rsse == rseq;
}

int main() {

    int points = 0;

    short** a1 = alloc_mat(N1);
    short** b1 = alloc_mat(N1);
    short** c1 = alloc_mat(N1);
    short** a2 = alloc_mat(N2);
    short** b2 = alloc_mat(N2);
    short** c2 = alloc_mat(N2);

    init_mat(a1, N1, 5);
    init_mat(b1, N1, 3);
    init_mat(a2, N2, 7);
    init_mat(b2, N2, 5);

    points += do_test((const short**)a1, (const short**)b1, c1, N1, M1, 1);
    points += do_test((const short**)a2, (const short**)b2, c2, N2, M2, 2);

    free_mat(a1, N1);
    free_mat(b1, N1);
    free_mat(c1, N1);
    free_mat(a2, N2);
    free_mat(b2, N2);
    free_mat(c2, N2);

    printf("\nPoints: %d out of 2\n", points);

    return 0;
}

This can be reduced further to [mcve]. The issue is not related to intrinsics, but to the fact you are accessing some allocated buffer out of bounds. — Eugene Sh., Nov 15 '21 at 19:18
My guess: There is no check about n 128 bytes boudaries in your `add_prod`. Just add `if (n-i <8) break;` as first line in its `for` and your _out of bounds access_ should be gone. I guess you should only accept multiple of 8 for n in this one. Remember, you're writing 128 bytes by 128 bytes, so if you have to write less in the remaining bytes of the buffer, you'll overwrite it. — Zilog80, Nov 15 '21 at 20:37
i exit from loop when the remaining part of array is smaller than 8, then make a for loop without using intrinsics. — Naruto Uzumaki, Nov 15 '21 at 20:55
I also guess i'm a little too tired. Read 128 bits, not 128 bytes... Maybe the weight of the years too... If only i could fix _my_ own memory struct alignments... — Zilog80, Nov 15 '21 at 21:25
@PeterCordes provided excellent answers about UB that might be interesting for you regarding the `_mm_*` SIMDs : [the strlen case](https://stackoverflow.com/a/57676035/3641635) and [about SIMD](https://stackoverflow.com/a/52117639/3641635). — Zilog80, Nov 15 '21 at 22:58

score 1 · Answer 1 · answered Nov 16 '21 at 01:51

You're not crossing into an unmapped page, that would segfault directly rather than corrupt malloc bookkeeping info. But as noted in comments, your loop bound allows the loop to start an iteration that touches 16 bytes if i < n, but that's only enough to prove that one more short is safe, not 8 more. That's why valgrind is correctly reporting that you access outside an object.

Use i-7 < n, or count down the number of elements remaining like n >= 8 / n -= 8.

Then scalar cleanup, or a final vector that ends at the end of the array, and partially overlaps if length isn't a multiple of the vector width. (The latter strategy only works if you load early from the dst, otherwise you'd be redoing some elements.)

error double free or corruption when using intrinsics

1 Answers1