I wrote a program that tests `_mm_prefetch` (compiled with gcc -O3). It allocates two random arrays and runs two tests:
- the first test writes sequentially to the first 1/600 of the first random array (hardware prefetch, cache hit);
- the second test walks the entire second random array with a stride of 600 elements (hardware prefetch, cache miss).
I used `_mm_prefetch` in the second test, but the performance still didn't improve. Why is this? What should I do? Please tell me.
The Code in C:
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <xmmintrin.h>
#define LENGTH 10000000
#define Max 10000000
// Generate an array of pseudo-random integers.
//
// Allocates `num` ints and fills each one with a value derived from rand(),
// reduced into the half-open range [lo, hi).
//
// The generator is seeded from the current time on the first call only:
// reseeding on every call hands back the same sequence for calls that see
// the same time() value (typically any two calls within one second).
//
// Returns a malloc'd array that the caller must free(), or NULL when
// num <= 0, when hi <= lo (empty range; the modulo below would otherwise
// divide by zero), or when the allocation fails.
int * random_arr(int lo,int hi,int num){
    static int seeded = 0;

    if (num <= 0 || hi <= lo) {
        return NULL;
    }

    int *arr = malloc(sizeof *arr * (size_t)num);
    if (arr == NULL) {
        return NULL;
    }

    if (!seeded) {
        srand((unsigned)time(NULL));
        seeded = 1;
    }

    // Widen before subtracting so hi - lo cannot overflow int
    // (e.g. lo very negative and hi very positive).
    const long long span = (long long)hi - (long long)lo;
    for (int i = 0; i < num; i++) {
        // rand() % span is in [0, span), so the sum is in [lo, hi) and fits int.
        arr[i] = (int)(lo + rand() % span);
    }
    return arr;
}
// Benchmark driver.
//
// Test 1 writes sequentially to the first LENGTH/600 elements of one array.
// Test 2 writes to every 600th element of a second array, issuing software
// prefetches for the next two strided elements before each store.
// Both durations are measured with clock() and printed in seconds.
//
// Returns 0 on success, EXIT_FAILURE if either array cannot be allocated.
int main(int argc, const char * argv[]) {
    (void)argc;
    (void)argv;

    enum { STRIDE = 600 };

    int *random_array1 = random_arr(0, Max, LENGTH); // the first random array
    int *random_array2 = random_arr(0, Max, LENGTH); // the second random array
    if (random_array1 == NULL || random_array2 == NULL) {
        fprintf(stderr, "failed to allocate the test arrays\n");
        free(random_array1);
        free(random_array2);
        return EXIT_FAILURE;
    }

    //-------------------------------------------------------------- first test
    clock_t start1 = clock();
    for (int i = 0; i < LENGTH / STRIDE; ++i) {
        random_array1[i] = i;
    }
    clock_t end1 = clock();
    //--------------------------------------------------------------
    double duration1 = ((double)(end1 - start1)) / CLOCKS_PER_SEC;
    printf("The first test running time: %lf s\n", duration1);

    //=========================================
    //-------------------------------------------------------------- second test
    clock_t start2 = clock();
    int i = 0;
    // Main part: only runs while both prefetch targets are inside the array,
    // so no pointer beyond the allocation is ever formed.
    // NOTE(review): each iteration does just one store, so a prefetch issued
    // only one or two iterations ahead has very little time to complete before
    // the line is needed -- presumably why it shows no gain; confirm by
    // measuring with a larger prefetch distance or more work per iteration.
    for (; i + 2 * STRIDE < LENGTH; i += STRIDE) {
        _mm_prefetch((const char *)&random_array2[i + STRIDE], _MM_HINT_T0);
        _mm_prefetch((const char *)&random_array2[i + 2 * STRIDE], _MM_HINT_T0);
        random_array2[i] = i;
    }
    // Tail: the last strided elements, where a prefetch target would fall
    // past the end of the array.
    for (; i < LENGTH; i += STRIDE) {
        random_array2[i] = i;
    }
    clock_t end2 = clock();
    //--------------------------------------------------------------
    double duration2 = ((double)(end2 - start2)) / CLOCKS_PER_SEC;
    printf("The second test running time: %lf s\n", duration2);

    // Read back one element written by each loop so the timed stores have a
    // visible use before the buffers are released.
    volatile int sink = random_array1[0] + random_array2[0];
    (void)sink;

    free(random_array1);
    free(random_array2);
    return 0;
}
Code execution result:
The first test running time: 0.000076 s
The second test running time: 0.001859 s