The code below is for checking the correctness of add_prod
. I wrote the code inside add_prod. It makes the calculation right but shows an error double free or corruption (!prev). Aborted (core dumped)
.
Using valgrind
, it prints
==2462== Invalid write of size 8
==2462== at 0x1097A5: _mm_storeu_si128 (emmintrin.h:727)
Can anyone help me why this error?
static void add_prod(const short* src, short* dst, short x, int n) {
__m128i _src, _dst,
_scalar = _mm_set_epi16(x,x,x,x,x,x,x,x);
for(int i = 0; i < n; i += 8) {
_src = _mm_loadu_si128((const __m128i*) (src+i));
_dst = _mm_loadu_si128((const __m128i*) (dst+i));
_src = _mm_mullo_epi16(_src, _scalar);
_dst = _mm_add_epi16(_src, _dst);
_mm_storeu_si128((__m128i*) (dst+i), _dst);
}
}
#define N1 1001
#define M1 1
#define N2 16
#define M2 100000
void matmul(const short** a, const short** b, short** c, int n) {
int i, j, k;
for (i=0; i<n; ++i)
for (j=0; j<n; ++j) c[i][j] = 0;
for (i=0; i<n; ++i)
for (k=0; k<n; ++k)
add_prod(b[k], c[i], a[i][k], n);
}
static long mat_sum(const short** m, int n) {
int i, j;
long sum = 0;
for (i=0; i<n; ++i)
for (j=0; j<n; ++j) sum += m[i][j];
return sum;
}
static short** alloc_mat(int n) {
int i;
short** m = malloc(n*sizeof(short*));
assert(m != NULL);
for (i=0; i<n; ++i) {
m[i] = malloc(n*sizeof(short));
assert(m[i] != NULL);
}
return m;
}
static void init_mat(short** m, int n, int max) {
int i, j;
for (i=0; i<n; ++i)
for (j=0; j<n; ++j) m[i][j] = 1 + (i+j) % max;
}
static void free_mat(short** m, int n) {
int i;
for (i=0; i<n; ++i) free(m[i]);
free(m);
}
static int do_test(const short** a, const short** b, short** c,
int n, int m, int test_no) {
double start, tseq, tsse;
int i;
long rseq, rsse;
printf("\nTest #%d\n", test_no);
// sequential
start = get_real_time();
for (i=0; i<m; ++i) matmul_seq(a, b, c, n);
tseq = get_real_time()-start;
rseq = mat_sum((const short**)c, n);
// SSE
start = get_real_time();
for (i=0; i<m; ++i) matmul(a, b, c, n);
tsse = get_real_time()-start;
rsse = mat_sum((const short**)c, n);
printf("- result: %ld [expected: %ld]\n", rsse, rseq);
printf("- sequential version: %.2f msec\n", tseq*1000);
printf("- SSE version: %.2f msec\n", tsse*1000);
printf("- speedup: %.2fx\n", tseq/(tsse==0.0 ? 1E-9 : tsse));
return rsse == rseq;
}
int main() {
int points = 0;
short** a1 = alloc_mat(N1);
short** b1 = alloc_mat(N1);
short** c1 = alloc_mat(N1);
short** a2 = alloc_mat(N2);
short** b2 = alloc_mat(N2);
short** c2 = alloc_mat(N2);
init_mat(a1, N1, 5);
init_mat(b1, N1, 3);
init_mat(a2, N2, 7);
init_mat(b2, N2, 5);
points += do_test((const short**)a1, (const short**)b1, c1, N1, M1, 1);
points += do_test((const short**)a2, (const short**)b2, c2, N2, M2, 2);
free_mat(a1, N1);
free_mat(b1, N1);
free_mat(c1, N1);
free_mat(a2, N2);
free_mat(b2, N2);
free_mat(c2, N2);
printf("\nPoints: %d out of 2\n", points);
return 0;
}