
I'm on a Whiskey Lake i7-8565U, analyzing perf counters and elapsed time for copying 512 KiB of data (twice the L2 cache size), and ran into some confusion about how the L2 HW prefetcher works.

In the Intel Manual Vol. 4 (MSR listing) there is MSR 0x1A4; its bit 0 controls the L2 HW prefetcher (1 to disable).
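For reference, toggling that bit with the msr-tools utilities looks roughly like this (it needs the msr kernel module; note that wrmsr rewrites the whole register, which is fine here since it reads back as 0 with everything enabled):

$ sudo modprobe msr
$ sudo rdmsr -p 0 0x1A4          # current value on core 0
$ sudo wrmsr -p 0 0x1A4 0x1      # bit 0 set: L2 HW prefetcher disabled
$ sudo wrmsr -p 0 0x1A4 0x0      # bit 0 cleared: L2 HW prefetcher enabled again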


Consider the following benchmark:

memcopy.h:

void *avx_memcpy_forward_lsls(void *restrict, const void *restrict, size_t);

memcopy.S:

.intel_syntax noprefix
.text
.globl avx_memcpy_forward_lsls
avx_memcpy_forward_lsls:
    shr rdx, 0x3                         # byte count -> count of 8-byte units
    xor rcx, rcx                         # rcx = current offset in 8-byte units
avx_memcpy_forward_loop_lsls:
    vmovdqa ymm0, [rsi + 8*rcx]          # copy 64 bytes per iteration
    vmovdqa [rdi + rcx*8], ymm0          # (two 32-byte aligned loads/stores)
    vmovdqa ymm1, [rsi + 8*rcx + 0x20]
    vmovdqa [rdi + rcx*8 + 0x20], ymm1
    add rcx, 0x08                        # advance 8 units = 64 bytes
    cmp rdx, rcx
    ja avx_memcpy_forward_loop_lsls      # loop while rcx < rdx
    ret

main.c:

#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
#include <x86intrin.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include "memcopy.h"

#define ITERATIONS 1000
#define BUF_SIZE (512 * 1024)

_Alignas(64) char src[BUF_SIZE];
_Alignas(64) char dest[BUF_SIZE];

static void __run_benchmark(unsigned runs, unsigned run_iterations,
                    void *(*fn)(void *, const void*, size_t), void *dest, const void* src, size_t sz);

#define run_benchmark(runs, run_iterations, fn, dest, src, sz) \
    do{\
        printf("Benchmarking " #fn "\n");\
        __run_benchmark(runs, run_iterations, fn, dest, src, sz);\
    }while(0)

int main(void){
    int fd = open("/dev/urandom", O_RDONLY);
    read(fd, src, sizeof src);
    run_benchmark(20, ITERATIONS, avx_memcpy_forward_lsls, dest, src, BUF_SIZE);
}

static inline void benchmark_copy_function(unsigned iterations, void *(*fn)(void *, const void *, size_t),
                                               void *restrict dest, const void *restrict src, size_t sz){
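    /* manually unrolled: 32 back-to-back copies per outer iteration */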
    while(iterations --> 0){
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
    }
}

static void __run_benchmark(unsigned runs, unsigned run_iterations,
                    void *(*fn)(void *, const void*, size_t), void *dest, const void* src, size_t sz){
    unsigned current_run = 1;
    while(current_run <= runs){
        benchmark_copy_function(run_iterations, fn, dest, src, sz);
        printf("Run %d finished\n", current_run);
        current_run++;
    }
}
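
For completeness: everything is built into a single binary, roughly like this (the exact flags are not important for the question):

$ gcc -O2 -o bin main.c memcopy.S

and profile.sh is essentially a perf stat wrapper over the events shown in the output below, i.e. something along the lines of (trimmed to a few events for illustration; the real script lists all of them):

#!/bin/sh
perf stat -e L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores \
          -e l1d.replacement,LLC-loads,LLC-load-misses,LLC-stores \
          -e l2_rqsts.all_pf,l2_rqsts.pf_hit,l2_rqsts.pf_miss \
          -e cycle_activity.stalls_l2_miss,cycles \
          "$@"

perf multiplexes that many counters, which is why each line of the output carries a percentage.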

Consider two runs of the compiled binary:

I.

MSR:

$ sudo rdmsr -p 0 0x1A4
0

Run:

$ taskset -c 0 sudo ../profile.sh ./bin 

 Performance counter stats for './bin':

    10 486 164 071      L1-dcache-loads                                               (12,13%)
    10 461 354 384      L1-dcache-load-misses     #   99,76% of all L1-dcache hits    (12,05%)
    10 481 930 413      L1-dcache-stores                                              (12,05%)
    10 461 136 686      l1d.replacement                                               (12,12%)
    31 466 394 422      l1d_pend_miss.fb_full                                         (12,11%)
   211 853 643 294      l1d_pend_miss.pending                                         (12,09%)
     1 759 204 317      LLC-loads                                                     (12,16%)
            31 007      LLC-load-misses           #    0,00% of all LL-cache hits     (12,16%)
     3 154 901 630      LLC-stores                                                    (6,19%)
    15 867 315 545      l2_rqsts.all_pf                                               (9,22%)
                 0      sw_prefetch_access.t1_t2                                      (12,22%)
         1 393 306      l2_lines_out.useless_hwpf                                     (12,16%)
     3 549 170 919      l2_rqsts.pf_hit                                               (12,09%)
    12 356 247 643      l2_rqsts.pf_miss                                              (12,06%)
                 0      load_hit_pre.sw_pf                                            (12,09%)
     3 159 712 695      l2_rqsts.rfo_hit                                              (12,06%)
     1 207 642 335      l2_rqsts.rfo_miss                                             (12,02%)
     4 366 526 618      l2_rqsts.all_rfo                                              (12,06%)
     5 240 013 774      offcore_requests.all_data_rd                                     (12,06%)
    19 936 657 118      offcore_requests.all_requests                                     (12,09%)
     1 761 660 763      offcore_response.demand_data_rd.any_response                                     (12,12%)
       287 044 397      bus-cycles                                                    (12,15%)
    36 816 767 779      resource_stalls.any                                           (12,15%)
    36 553 997 653      resource_stalls.sb                                            (12,15%)
    38 035 066 210      uops_retired.stall_cycles                                     (12,12%)
    24 766 225 119      uops_executed.stall_cycles                                     (12,09%)
    40 478 455 041      uops_issued.stall_cycles                                      (12,05%)
    24 497 256 548      cycle_activity.stalls_l1d_miss                                     (12,02%)
    12 611 038 018      cycle_activity.stalls_l2_miss                                     (12,09%)
        10 228 869      cycle_activity.stalls_l3_miss                                     (12,12%)
    24 707 614 483      cycle_activity.stalls_mem_any                                     (12,22%)
    24 776 110 104      cycle_activity.stalls_total                                     (12,22%)
    48 914 478 241      cycles                                                        (12,19%)

      12,155774555 seconds time elapsed

      11,984577000 seconds user
       0,015984000 seconds sys

II.

MSR:

$ sudo rdmsr -p 0 0x1A4
1

Run:

$ taskset -c 0 sudo ../profile.sh ./bin

 Performance counter stats for './bin':

    10 508 027 832      L1-dcache-loads                                               (12,05%)
    10 463 643 206      L1-dcache-load-misses     #   99,58% of all L1-dcache hits    (12,09%)
    10 481 296 605      L1-dcache-stores                                              (12,12%)
    10 444 854 468      l1d.replacement                                               (12,15%)
    29 287 445 744      l1d_pend_miss.fb_full                                         (12,17%)
   205 569 630 707      l1d_pend_miss.pending                                         (12,17%)
     5 103 444 329      LLC-loads                                                     (12,17%)
            33 406      LLC-load-misses           #    0,00% of all LL-cache hits     (12,17%)
     9 567 917 742      LLC-stores                                                    (6,08%)
     1 157 237 980      l2_rqsts.all_pf                                               (9,12%)
                 0      sw_prefetch_access.t1_t2                                      (12,17%)
           301 471      l2_lines_out.useless_hwpf                                     (12,17%)
       218 528 985      l2_rqsts.pf_hit                                               (12,17%)
       938 735 722      l2_rqsts.pf_miss                                              (12,17%)
                 0      load_hit_pre.sw_pf                                            (12,17%)
         4 096 281      l2_rqsts.rfo_hit                                              (12,17%)
     4 972 640 931      l2_rqsts.rfo_miss                                             (12,17%)
     4 976 006 805      l2_rqsts.all_rfo                                              (12,17%)
     5 175 544 191      offcore_requests.all_data_rd                                     (12,17%)
    15 772 124 082      offcore_requests.all_requests                                     (12,17%)
     5 120 635 892      offcore_response.demand_data_rd.any_response                                     (12,17%)
       292 980 395      bus-cycles                                                    (12,17%)
    37 592 020 151      resource_stalls.any                                           (12,14%)
    37 317 091 982      resource_stalls.sb                                            (12,11%)
    38 121 826 730      uops_retired.stall_cycles                                     (12,08%)
    25 430 699 605      uops_executed.stall_cycles                                     (12,04%)
    41 416 190 037      uops_issued.stall_cycles                                      (12,04%)
    25 326 579 070      cycle_activity.stalls_l1d_miss                                     (12,04%)
    25 019 148 253      cycle_activity.stalls_l2_miss                                     (12,03%)
         7 384 770      cycle_activity.stalls_l3_miss                                     (12,03%)
    25 442 709 033      cycle_activity.stalls_mem_any                                     (12,03%)
    25 406 897 956      cycle_activity.stalls_total                                     (12,03%)
    49 877 044 086      cycles                                                        (12,03%)

      12,231406658 seconds time elapsed

      12,226386000 seconds user
       0,004000000 seconds sys

I noticed the counter:

12 611 038 018 cycle_activity.stalls_l2_miss vs.
25 019 148 253 cycle_activity.stalls_l2_miss

suggesting that the MSR bit disabling the L2 HW prefetcher really takes effect. Other L2/LLC-related counters also differ significantly, and the difference is reproducible across runs. The problem is that there is almost no difference in total time and cycles:

48 914 478 241 cycles vs.
49 877 044 086 cycles

12,155774555 seconds time elapsed vs.
12,231406658 seconds time elapsed

QUESTION:
Are the L2 misses hidden by other performance limiters?
If so, can you suggest which counters to look at to understand it?

St.Antario
    As a rule of thumb: Any non-abysmally implemented memory copy is memory bound. Even when it only hits L1 cache. The overheads of any memory access are simply so much higher than what it takes a CPU to add two and two together. In your case, you are even using AVX instructions to reduce the amount of instructions per copied byte. Wherever your data is found (L1, L2, LLC, memory), the throughput of the associated memory component will be your bottleneck. – cmaster - reinstate monica Feb 02 '20 at 14:47

2 Answers


Yes, the L2 streamer is really helpful a lot of the time.

memcpy doesn't have any computational latency to hide, so I guess it can afford to let OoO exec resources (ROB size) absorb the extra load latency you get from more L2 misses, at least in this case where everything hits in L3 because the medium-size working set (1 MiB: src + dst) fits in L3, and no prefetching is needed to make those L3 hits happen.

And the only instructions are load/store (and loop overhead), so the OoO window includes demand loads for pretty far ahead.

IDK if the L2 spatial prefetcher and L1d prefetcher are helping any here.


Prediction to test this hypothesis: make your array bigger so you get L3 misses and you'll probably see a difference in overall time once OoO exec isn't enough to hide the load latency of going all the way to DRAM. HW prefetch triggering farther ahead can help some.
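
(The experiment St.Antario reports in the comments below amounts to just bumping the buffer size and cutting the iteration count in main.c, e.g.:

#define ITERATIONS 10                      /* values from the comment thread */
#define BUF_SIZE (16 * 1024 * 1024)        /* 16 MiB: larger than L3 */

so that the working set no longer fits in L3.)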

The other big benefits of HW prefetching come when it can keep up with your computation, so you get L2 hits. (In a loop that has computation with a medium-length but not loop-carried dependency chain.)

Demand loads and OoO exec can do a lot as far as using the available (single threaded) memory bandwidth, when there isn't other pressure on ROB capacity.


Also note that on Intel CPUs, every cache miss can cost a back-end replay (from the RS/scheduler) of dependent uops, one each for L1d and L2 misses when the data is expected to arrive. And after that, apparently the core optimistically spams uops while waiting for data to arrive from L3.

(See https://chat.stackoverflow.com/rooms/206639/discussion-on-question-by-beeonrope-are-load-ops-deallocated-from-the-rs-when-th and Are load ops deallocated from the RS when they dispatch, complete or some other time?)

(The replayed uops are not the cache-miss load itself; in this case they would be the store instructions, more specifically the store-data uop for port 4. That doesn't matter here: using 32-byte stores and bottlenecking on L3 bandwidth means we're not close to 1 port-4 uop per clock.)

Peter Cordes
  • _and loop overhead_ I thought that since the loop consists of 2 loads, 2 stores, `add` and `cmp`, it can be detected by the LSD, so its overhead would not be that significant. – St.Antario Feb 02 '20 at 14:48
  • 2
    @St.Antario: huh? That makes no sense; you're memory-bound so you don't have a front-end bottleneck so the LSD is irrelevant. (It avoids re-fetching them from the uop cache, saving some power). They still take space in the ROB until they can retire. They're not *that* significant, but not negligible either. – Peter Cordes Feb 02 '20 at 14:50
  • 2
    _make your array bigger so you get L3 misses and you'll probably see a difference_ I ran a number of tests with `16MiB` buffer and `10` iterations and indeed got `14,186868883 seconds` vs `43,731360909 seconds` and `46,76% of all LL-cache hits` vs `99,32% of all LL-cache hits`; `1 028 664 372 LLC-loads` vs `1 587 454 298 LLC-loads `. – St.Antario Feb 02 '20 at 17:49
  • Aren't 2 loads into the same register (e.g. as in my case `vmovdqa ymm0, [rsi + 8 * rcx]` in consecutive loop iterations) part of the same dependency chain? If so, how can they be executed OoO? – St.Antario Feb 02 '20 at 20:11
  • 4
    @St.Antario: by register renaming! This is one of the most key pieces of OoO exec, especially on a register-poor ISA like x86. See [Why does mulss take only 3 cycles on Haswell, different from Agner's instruction tables? (Unrolling FP loops with multiple accumulators)](//stackoverflow.com/q/45113527). And BTW, normally you'd want to do 2 loads then 2 stores, not load/store load/store. Better chance of avoiding or mitigating 4k aliasing stalls because the later loads (that the HW has to detect as overlapping the previous stores or not) are farther away. – Peter Cordes Feb 03 '20 at 02:25
  • I found an entry in IOM/2.6.5: _Architectural registers are renamed to a larger set of microarchitectural registers. Renaming eliminates false dependencies known as read-after-read and write-after-read hazards_, which seems relevant here. So basically we have 2 load-store dependency chains per iteration. We also have a dependency chain through `add rcx, 0x08`, one link per iteration, as long as the loop's trip count. – St.Antario Feb 03 '20 at 23:03
  • 2
    @St.Antario: yes, of course. Agner Fog's optimization guide also explains OoO exec with register renaming, so does wikipedia. BTW, register renaming also avoids WAW hazards, leaving only true dependencies (RAW). So loads can even complete out of order, without waiting for a previous load to finish *writing* the same architectural register. And yes, the only loop-carried dep chain is through RCX, so that chain can run ahead. That's why addresses can be ready early, while load/store uops are still bottlenecked on port 2/3 throughput. – Peter Cordes Feb 03 '20 at 23:41
  • 3
    I am surprised that prefetching didn't help for the memcpy in L3. I guess the 10/12 LFBs is "enough" in that case. Seems weird though: what is the limiting factor there? The core -> L2 time should be less than the L2 -> L3 time, so in my mental model having more buffers (more total occupancy) for the second leg should help. – BeeOnRope Feb 04 '20 at 00:16
  • 1
    @BeeOnRope: yeah, I wouldn't have guessed that OoO exec could absorb L3 latency. Perhaps adjacent-line prefetch into L2 is still helping? Or maybe not; `offcore_response.demand_data_rd.any_response` is almost equal to `offcore_requests.all_data_rd` so it seems all L3 data is getting pulled in by demand loads, not prefetch. – Peter Cordes Feb 04 '20 at 03:24
  • 1
    @PeterCordes - I was able to basically reproduce this with `./uarch-bench.sh --test-name=memory/bandwidth/load/*`, which is just a plain linear load test (with different sized loads). For small elements like 4 or 8 bytes, prefetching helped a lot, presumably because the code can't produce enough demand loads to fill all the LFBs before stalling out (curious what the limit is: i.e., whether loads that all miss to the same line clog up the RS or not). – BeeOnRope Feb 04 '20 at 03:39
  • 1
    However, for 32-byte AVX2 loads, the performance was almost the same with or without prefetching: 3.78 cycles per line vs 4.17. Next-line prefetching on or off didn't make a noticeable difference. Results [here](https://gist.github.com/travisdowns/8bbad46095f7ea74c44ae0e0291ede41). – BeeOnRope Feb 04 '20 at 03:39
  • For fun I added an answer with tinymembench; it fills out the quantitative side of this answer (which covers the "why" side) across a variety of code types. The conclusion is the same. – BeeOnRope Feb 04 '20 at 18:53

Yes, the L2 HW prefetcher is very helpful!

For example, below are results on my machine (i7-6700HQ) running tinymembench. The first column of results is with all prefetchers on; the second column is with the L2 streamer off (but all other prefetchers still on).

This test uses 32 MiB source and destination buffers, which are much larger than the L3 on my machine, so it will be testing mostly misses to DRAM.

==========================================================================
== Memory bandwidth tests                                               ==
==                                                                      ==
== Note 1: 1MB = 1000000 bytes                                          ==
== Note 2: Results for 'copy' tests show how many bytes can be          ==
==         copied per second (adding together read and writen           ==
==         bytes would have provided twice higher numbers)              ==
== Note 3: 2-pass copy means that we are using a small temporary buffer ==
==         to first fetch data into it, and only then write it to the   ==
==         destination (source -> L1 cache, L1 cache -> destination)    ==
== Note 4: If sample standard deviation exceeds 0.1%, it is shown in    ==
==         brackets                                                     ==
==========================================================================

                                                       L2 streamer ON            OFF
 C copy backwards                                     :   7962.4 MB/s    4430.5 MB/s
 C copy backwards (32 byte blocks)                    :   7993.5 MB/s    4467.0 MB/s
 C copy backwards (64 byte blocks)                    :   7989.9 MB/s    4438.0 MB/s
 C copy                                               :   8503.1 MB/s    4466.6 MB/s
 C copy prefetched (32 bytes step)                    :   8729.2 MB/s    4958.4 MB/s
 C copy prefetched (64 bytes step)                    :   8730.7 MB/s    4958.4 MB/s
 C 2-pass copy                                        :   6171.2 MB/s    3368.7 MB/s
 C 2-pass copy prefetched (32 bytes step)             :   6193.1 MB/s    4104.2 MB/s
 C 2-pass copy prefetched (64 bytes step)             :   6198.8 MB/s    4101.6 MB/s
 C fill                                               :  13372.4 MB/s   10610.5 MB/s
 C fill (shuffle within 16 byte blocks)               :  13379.4 MB/s   10547.5 MB/s
 C fill (shuffle within 32 byte blocks)               :  13365.8 MB/s   10636.9 MB/s
 C fill (shuffle within 64 byte blocks)               :  13588.7 MB/s   10588.3 MB/s
 -
 standard memcpy                                      :  11550.7 MB/s    8216.3 MB/s
 standard memset                                      :  23188.7 MB/s   22686.8 MB/s
 -
 MOVSB copy                                           :   9458.4 MB/s    6523.7 MB/s
 MOVSD copy                                           :   9474.5 MB/s    6510.7 MB/s
 STOSB fill                                           :  23329.0 MB/s   22901.5 MB/s
 SSE2 copy                                            :   9073.1 MB/s    4970.3 MB/s
 SSE2 nontemporal copy                                :  12647.1 MB/s    7492.5 MB/s
 SSE2 copy prefetched (32 bytes step)                 :   9106.0 MB/s    5069.8 MB/s
 SSE2 copy prefetched (64 bytes step)                 :   9113.5 MB/s    5063.1 MB/s
 SSE2 nontemporal copy prefetched (32 bytes step)     :  11770.8 MB/s    7453.4 MB/s
 SSE2 nontemporal copy prefetched (64 bytes step)     :  11937.1 MB/s    7712.1 MB/s
 SSE2 2-pass copy                                     :   7092.8 MB/s    4355.2 MB/s
 SSE2 2-pass copy prefetched (32 bytes step)          :   7001.4 MB/s    4585.1 MB/s
 SSE2 2-pass copy prefetched (64 bytes step)          :   7055.1 MB/s    4557.9 MB/s
 SSE2 2-pass nontemporal copy                         :   5043.2 MB/s    3263.3 MB/s
 SSE2 fill                                            :  14087.3 MB/s   10947.1 MB/s
 SSE2 nontemporal fill                                :  33134.5 MB/s   32774.3 MB/s

In these tests having the L2 streamer is never slower and is often nearly twice as fast.

In general, you might notice the following patterns in the results:

  • Copies generally seem to be more affected than fills.
  • The standard memset and STOSB fill (these boil down to the same thing on this platform) are the least affected, with the prefetched result being only a few % faster than without.
  • Standard memcpy is probably the only copy here that uses 32-byte AVX instructions, and it is among the least affected of the copies - but prefetching on is still ~40% faster than without.

I also tried turning on and off the other three prefetchers, but they generally had almost no measurable effect for this benchmark.

BeeOnRope
  • (Fun fact: `vmovdqa` is AVX1 despite being "integer".) Do you think the OP's loop was giving lower bandwidth than glibc memcpy? And that's why 12 LFBs were enough to keep up with demand loads going to L3, without taking advantage of the extra MLP from the L2 <-> L3 superqueue which the L2 streamer can keep occupied? That's presumably the difference in your test. L3 should run at the same speed as the core; you both have quad-core Skylake-client equivalent microarchitectures so probably similar L3 latency? – Peter Cordes Feb 05 '20 at 01:13
  • @PeterCordes - sorry, I probably should have been clearer: this test used 32 MiB buffers, so it is testing DRAM hits, not L3 hits. I thought tmb output the buffer size, but I see it doesn't -- oops! That was intentional: I wasn't trying to explain exactly the 512 KiB scenario of the OP, but just answer the headline question of whether the L2 streamer is useful, with a scenario that shows it is. I guess if I used a smaller buffer size I could more or less reproduce the results (I already saw a similar result in `uarch-bench` mentioned in the comments). – BeeOnRope Feb 05 '20 at 20:56
  • 1
    I added the buffer size to the answer. – BeeOnRope Feb 05 '20 at 20:57
  • If I had to guess the OP's loop would be similar to glibc memcpy. I think on my machine it uses `rep movsb` for large copies, which is slightly slower than a good unrolled AVX version, but the OP's is probably a bit worse than a good unrolled version. – BeeOnRope Feb 05 '20 at 20:58
  • @BeeOnRope On my machine `memcpy` uses AVX NT stores for buffers larger than 6 MiB. It was not my intention to measure memcpy's implementation with hand-crafted code. – St.Antario Feb 06 '20 at 11:47
  • @St.Antario - oh yeah, my `memcpy` does that too. I didn't understand the second part of your comment. – BeeOnRope Feb 06 '20 at 21:18
  • @PeterCordes _Fun fact: vmovdqa is AVX1 despite being "integer"._ Yes, but is it a problem? As far as I can tell from the Intel instruction reference for `vmovdqa`, there is no useful alternative in `AVX2`. I don't have AVX-512 support on my PC. – St.Antario Feb 08 '20 at 22:50
  • 1
    @St.Antario: No, it's not a problem. No idea why you think it might *be* a problem; it's not like there's any penalty for mixing AVX1 and AVX2 instructions. The point of my comment was that this loop only requires AVX1, yet this answer mentions using AVX2 instructions. Intel happened to widen the L1d load/store data paths to 32 bytes at the same time as introducing AVX2, so you might use availability of AVX2 as part of how you select a memcpy implementation if you're doing runtime dispatch... – Peter Cordes Feb 08 '20 at 22:55
  • 1
    How did you turn off the prefetcher, and which one? Was it https://software.intel.com/en-us/articles/disclosure-of-hw-prefetcher-control-on-some-intel-processors? The forum thread https://software.intel.com/en-us/forums/intel-isa-extensions/topic/785240 says that some bits have a different meaning. – osgx Feb 22 '20 at 07:57