I traverse a block of data, apply a scale-and-bias operation to each value, and record the elapsed time t1; see A.cpp below.
Then I repeat the same measurement, but beforehand I allocate a large buffer on the heap (50M longs, far bigger than the L1 and L2 caches) and simply traverse it once, with the intent of flushing L1 and L2. The elapsed time of the scale-and-bias loop is t2; see B.cpp below.
Comparing the two, t2 is consistently and significantly smaller than t1, i.e. B.cpp is faster. The same happens on both x86 and ARM.
// A.cpp
#include <chrono>
#include <cstddef>
#include <cstdio>

// Multiply each element by a per-channel scale and add a per-channel bias (8 channels).
void ScaleAndAddBias(float* dstZ, const float* srcZ, const float* biasZ,
                     const float* alphaZ, size_t planeNumber) {
    for (size_t p = 0; p < planeNumber; ++p) {
        float* dstX = dstZ + 8 * p;
        const float* srcX = srcZ + 8 * p;
        for (int i = 0; i < 8; i++) {
            dstX[i] = srcX[i] * alphaZ[i] + biasZ[i];
        }
    }
}

int main() {
    auto a = new float[8 * 1 * 8192];  // destination
    auto b = new float[8 * 1 * 8192];  // source
    auto c = new float[8 * 1];         // bias
    auto d = new float[8 * 1];         // scale
    auto begin = std::chrono::steady_clock::now();
    ScaleAndAddBias(a, b, c, d, 8192);
    auto end = std::chrono::steady_clock::now();
    auto t1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
    printf("t1 = %lld ns\n", (long long)t1);
    return 0;
}
// B.cpp
#include <chrono>
#include <cstddef>
#include <cstdio>

// Same kernel as in A.cpp.
void ScaleAndAddBias(float* dstZ, const float* srcZ, const float* biasZ,
                     const float* alphaZ, size_t planeNumber) {
    for (size_t p = 0; p < planeNumber; ++p) {
        float* dstX = dstZ + 8 * p;
        const float* srcX = srcZ + 8 * p;
        for (int i = 0; i < 8; i++) {
            dstX[i] = srcX[i] * alphaZ[i] + biasZ[i];
        }
    }
}

int main() {
    // First allocate and traverse a buffer far larger than L1/L2,
    // so that the caches hold nothing useful afterwards.
    const size_t bigger_than_cachesize = 50 * 1024 * 1024;
    long* p = new long[bigger_than_cachesize];
    for (size_t j = 0; j < bigger_than_cachesize; j++) {
        p[j] = 1;  // write each element so every page is actually touched
    }

    auto a = new float[8 * 1 * 8192];  // destination
    auto b = new float[8 * 1 * 8192];  // source
    auto c = new float[8 * 1];         // bias
    auto d = new float[8 * 1];         // scale
    auto begin = std::chrono::steady_clock::now();
    ScaleAndAddBias(a, b, c, d, 8192);
    auto end = std::chrono::steady_clock::now();
    auto t2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
    printf("t2 = %lld ns\n", (long long)t2);
    return 0;
}
Question: as I understand it, caches help because of spatial and temporal locality. But in B.cpp the memory that is traversed in advance is a completely different allocation from the buffers the kernel later works on, and the two regions are not even contiguous. Why does touching that unrelated memory beforehand improve the access performance of the buffers used in the timed loop?
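For completeness, here is a sketch of how I could try to narrow this down (it is not part of the measurements above; the repetition count is arbitrary, and getrusage() assumes a POSIX system): run the same kernel several times in one process and print, for each call, the elapsed time and the number of minor page faults it incurred.
// check.cpp -- diagnostic sketch only, not part of the original test
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <sys/resource.h>  // getrusage (POSIX)

// Same kernel as in A.cpp / B.cpp.
void ScaleAndAddBias(float* dstZ, const float* srcZ, const float* biasZ,
                     const float* alphaZ, size_t planeNumber) {
    for (size_t p = 0; p < planeNumber; ++p) {
        float* dstX = dstZ + 8 * p;
        const float* srcX = srcZ + 8 * p;
        for (int i = 0; i < 8; i++) {
            dstX[i] = srcX[i] * alphaZ[i] + biasZ[i];
        }
    }
}

// Total minor (soft) page faults taken by this process so far.
static long MinorFaults() {
    rusage ru{};
    getrusage(RUSAGE_SELF, &ru);
    return ru.ru_minflt;
}

int main() {
    auto a = new float[8 * 1 * 8192];  // destination
    auto b = new float[8 * 1 * 8192];  // source
    auto c = new float[8 * 1];         // bias
    auto d = new float[8 * 1];         // scale
    for (int run = 0; run < 5; ++run) {
        long faultsBefore = MinorFaults();
        auto begin = std::chrono::steady_clock::now();
        ScaleAndAddBias(a, b, c, d, 8192);
        auto end = std::chrono::steady_clock::now();
        long faultsAfter = MinorFaults();
        auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
        printf("run %d: %lld ns, %ld minor faults\n", run, (long long)ns, faultsAfter - faultsBefore);
    }
    return 0;
}
If only the first call were slow and coincided with a burst of minor faults, that would point at something happening on the first touch of a and b rather than at the cache state.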