
I'm reading the Intel Optimization Manual's section about Write Combining memory and wrote benchmarks to understand how it works. These are the two functions I'm benchmarking:

memcopy.h:

void avx_ntcopy_cache_line(void *dest, const void *src);

void avx_ntcopy_64_two_cache_lines(void *dest, const void *src);

memcopy.S:

avx_ntcopy_cache_line:
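    ;two 32-byte NT stores into the SAME 64-byte cache line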
    vmovdqa ymm0, [rdi]
    vmovdqa ymm1, [rdi + 0x20]
    vmovntdq [rsi], ymm0
    vmovntdq [rsi + 0x20], ymm1
    ;intentionally no sfence after nt-store
    ret

avx_ntcopy_64_two_cache_lines:
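    ;one 32-byte NT store into each of TWO different 64-byte cache lines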
    vmovdqa ymm0, [rdi]
    vmovdqa ymm1, [rdi + 0x40]
    vmovntdq [rsi], ymm0
    vmovntdq [rsi + 0x40], ymm1
    ;intentionally no sfence after nt-store
    ret

Here is what the benchmark's main function looks like:

#include <stdlib.h>
#include <inttypes.h>
#include <x86intrin.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include "memcopy.h"

#define ITERATIONS 1000000

//As @HadiBrais noted, there might be an issue with 4K aliasing
_Alignas(64) char src[128];
_Alignas(64) char dest[128];

static void run_benchmark(unsigned runs, unsigned run_iterations,
                    void (*fn)(void *, const void*), void *dest, const void* src);

int main(void){
    int fd = open("/dev/urandom", O_RDONLY);
    read(fd, src, sizeof src);

    run_benchmark(20, ITERATIONS, avx_ntcopy_cache_line, dest, src);
    run_benchmark(20, ITERATIONS, avx_ntcopy_64_two_cache_lines, dest, src);
}

static int uint64_compare(const void *u1, const void *u2){
    uint64_t uint1 = *(uint64_t *) u1;
    uint64_t uint2 = *(uint64_t *) u2;
    if(uint1 < uint2){
        return -1;
    } else if (uint1 == uint2){
        return 0;
    } else {
        return 1;
    }
}

static inline uint64_t benchmark_2cache_lines_copy_function(unsigned iterations, void (*fn)(void *, const void *),
                                               void *restrict dest, const void *restrict src){
    uint64_t *results = malloc(iterations * sizeof(uint64_t));
    unsigned idx = iterations;
    while(idx --> 0){
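        //bit 30 set selects a fixed-function counter; index 1 is unhalted core clock cycles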
        uint64_t start = __rdpmc((1<<30)+1);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        fn(dest, src);
        uint64_t finish = __rdpmc((1<<30)+1);
        results[idx] = (finish - start) >> 4;
    }
    qsort(results, iterations, sizeof *results, uint64_compare);
    uint64_t median = results[iterations >> 1];
    free(results); //don't leak the results buffer across runs
    return median;
}

static void run_benchmark(unsigned runs, unsigned run_iterations,
                    void (*fn)(void *, const void*), void *dest, const void* src){
    unsigned current_run = 1;
    while(current_run <= runs){
        uint64_t time = benchmark_2cache_lines_copy_function(run_iterations, fn, dest, src);
        printf("Run %d result: %lu\n", current_run, time);
        current_run++;
    }
}

Compiling with options

-Werror \
-Wextra \
-Wall \
-pedantic \
-Wno-stack-protector \
-g3 \
-O3 \
-Wno-unused-result \
-Wno-unused-parameter

And running the benchmarks, I got the following results:

I. avx_ntcopy_cache_line:

Run 1 result: 61
Run 2 result: 61
Run 3 result: 61
Run 4 result: 61
Run 5 result: 61
Run 6 result: 61
Run 7 result: 61
Run 8 result: 61
Run 9 result: 61
Run 10 result: 61
Run 11 result: 61
Run 12 result: 61
Run 13 result: 61
Run 14 result: 61
Run 15 result: 61
Run 16 result: 61
Run 17 result: 61
Run 18 result: 61
Run 19 result: 61
Run 20 result: 61

perf:

 Performance counter stats for './bin':

     3 503 775 289      L1-dcache-loads                                               (18,87%)
        91 965 805      L1-dcache-load-misses     #    2,62% of all L1-dcache hits    (18,94%)
     2 041 496 256      L1-dcache-stores                                              (19,01%)
         5 461 440      LLC-loads                                                     (19,08%)
         1 108 179      LLC-load-misses           #   20,29% of all LL-cache hits     (19,10%)
        18 028 817      LLC-stores                                                    (9,55%)
       116 865 915      l2_rqsts.all_pf                                               (14,32%)
                 0      sw_prefetch_access.t1_t2                                      (19,10%)
           666 096      l2_lines_out.useless_hwpf                                     (19,10%)
        47 701 696      l2_rqsts.pf_hit                                               (19,10%)
        62 556 656      l2_rqsts.pf_miss                                              (19,10%)
         4 568 231      load_hit_pre.sw_pf                                            (19,10%)
        17 113 190      l2_rqsts.rfo_hit                                              (19,10%)
        15 248 685      l2_rqsts.rfo_miss                                             (19,10%)
        54 460 370      LD_BLOCKS_PARTIAL.ADDRESS_ALIAS                                     (19,10%)
    18 469 040 693      uops_retired.stall_cycles                                     (19,10%)
    16 796 868 661      uops_executed.stall_cycles                                     (19,10%)
    18 315 632 129      uops_issued.stall_cycles                                      (19,05%)
    16 176 115 539      resource_stalls.sb                                            (18,98%)
    16 424 440 816      resource_stalls.any                                           (18,92%)
    22 692 338 882      cycles                                                        (18,85%)

       5,780512545 seconds time elapsed

       5,740239000 seconds user
       0,040001000 seconds sys

II. avx_ntcopy_64_two_cache_lines:

Run 1 result: 6
Run 2 result: 6
Run 3 result: 6
Run 4 result: 6
Run 5 result: 6
Run 6 result: 6
Run 7 result: 6
Run 8 result: 6
Run 9 result: 6
Run 10 result: 6
Run 11 result: 6
Run 12 result: 6
Run 13 result: 6
Run 14 result: 6
Run 15 result: 6
Run 16 result: 6
Run 17 result: 6
Run 18 result: 6
Run 19 result: 6
Run 20 result: 6

perf:

 Performance counter stats for './bin':

     3 095 792 486      L1-dcache-loads                                               (19,26%)
        82 194 718      L1-dcache-load-misses     #    2,66% of all L1-dcache hits    (18,99%)
     1 793 291 250      L1-dcache-stores                                              (19,00%)
         4 612 503      LLC-loads                                                     (19,01%)
           975 438      LLC-load-misses           #   21,15% of all LL-cache hits     (18,94%)
        15 707 916      LLC-stores                                                    (9,47%)
        97 928 734      l2_rqsts.all_pf                                               (14,20%)
                 0      sw_prefetch_access.t1_t2                                      (19,21%)
           532 203      l2_lines_out.useless_hwpf                                     (19,19%)
        35 394 752      l2_rqsts.pf_hit                                               (19,20%)
        56 303 030      l2_rqsts.pf_miss                                              (19,20%)
         6 197 253      load_hit_pre.sw_pf                                            (18,93%)
        13 458 517      l2_rqsts.rfo_hit                                              (18,94%)
        14 031 767      l2_rqsts.rfo_miss                                             (18,93%)
        36 406 273      LD_BLOCKS_PARTIAL.ADDRESS_ALIAS                                     (18,94%)
     2 213 339 719      uops_retired.stall_cycles                                     (18,93%)
     1 225 185 268      uops_executed.stall_cycles                                     (18,94%)
     1 943 649 682      uops_issued.stall_cycles                                      (18,94%)
       126 401 004      resource_stalls.sb                                            (19,20%)
       202 537 285      resource_stalls.any                                           (19,20%)
     5 676 443 982      cycles                                                        (19,18%)

       1,521271014 seconds time elapsed

       1,483660000 seconds user
       0,032253000 seconds sys

As can be seen, there is a 10x difference in the measured results.


My Interpretation:

As explained in the Intel Optimization Manual, section 3.6.9:

writes to different parts of the same cache line can be grouped into a single, full-cache-line bus transaction instead of going across the bus (since they are not cached) as several partial writes

I assumed that in the case of avx_ntcopy_cache_line we have a full 64-byte write that initiates a bus transaction to write the data out, and that this prohibits rdtsc from being executed out of order.

By contrast, in the case of avx_ntcopy_64_two_cache_lines, 32 bytes written into each of two different cache lines go into the WC buffers without triggering a bus transaction. This allowed rdtsc to be executed out of order.

This interpretation looks extremely suspicious, and it does not square with the bus-cycles difference:

avx_ntcopy_cache_line: 131 454 700

avx_ntcopy_64_two_cache_lines: 31 957 050

QUESTION: What is the true cause of this difference in the measurements?

  • Your interpretation makes no sense to me; `bus-cycles` is just unhalted reference cycles. Also `avx_ntcopy_cache_line` may be suffering from 4K aliasing (more than `avx_ntcopy_64_two_cache_lines`?). Measure `LD_BLOCKS_PARTIAL.ADDRESS_ALIAS` for both. Also what is the frequency of TSC? Is the core frequency fixed? – Hadi Brais Jan 18 '20 at 14:35
  • @HadiBrais can't it be an out-of-order issue? I think this way because adding `sfence` to flush the nt stores results in 80 reference cycles for `avx_ntcopy_64_two_cache_lines`, but 75 for `avx_ntcopy_cache_line`. – St.Antario Jan 18 '20 at 14:48
  • First we'll have to determine to what extent they are suffering from 4K aliasing, then change the alignment so as to eliminate the impact of this issue. Then we'll see. – Hadi Brais Jan 18 '20 at 14:52
  • @HadiBrais I ran `perf stat` with `LD_BLOCKS_PARTIAL.ADDRESS_ALIAS` on the same benchmarks and got the following results: `avx_ntcopy_64_two_cache_lines` - `1 512 813 960 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS (25,93%)`; `avx_ntcopy_cache_line` - `10 005 775 711 LD_BLOCKS_PARTIAL.ADDRESS_ALIAS (26,65%)`. Probably you were right. – St.Antario Jan 18 '20 at 15:10
  • So get rid of 4K aliasing; later loads should not align with previous stores. And for the love of all that you care about, stop showing TSC cycle counts (you did that also in the prev question)! What am I supposed to do with these numbers if you don't also give the TSC and core frequencies? Convert them into core cycles! – Hadi Brais Jan 18 '20 at 15:18
  • @HadiBrais My fault. I re-measured with `rdpmc((1<<30)+1)` to count actual clock cycles as specified [here](https://software.intel.com/en-us/forums/software-tuning-performance-optimization-platform-monitoring/topic/595214). – St.Antario Jan 18 '20 at 16:12
  • @HadiBrais It's surprising to me there's some 4K aliasing. Aren't the two tests just loading and storing from the same pair of (not 4K-aligned) addresses over and over again? – Margaret Bloom Jan 18 '20 at 18:06
  • @MargaretBloom: the original code did have `_Alignas(4096)` so the dst and src were both 4k aligned. The store at the end of one iteration would alias with the load in the next (because as you say, the pointer isn't incrementing). The code has now changed to just `_Alignas(32)` so we don't even know the alignment relative to a cache line, and `avx_ntcopy_cache_line` might or might not be copying adjacent halves of 2 separate cache lines. – Peter Cordes Jan 18 '20 at 23:22
  • @St.Antario: *initiating the bus transaction to write them out which prohibits rdtsc to be executed out of order.* where are you getting any connection between `rdtsc` and flushing LFBs? `rdtsc` can start executing as soon as it issues, whether or not there are still un-executed `movntdq` instructions in the scheduler. (Same for `rdpmc` I think.) – Peter Cordes Jan 18 '20 at 23:29
  • BTW, your asm functions read from their first arg (RDI) and write to their 2nd (RSI), so you got the prototype backwards. The D in RDI stands for destination; a handy way to remember the x86-64 System V arg-passing convention is that it matches memcpy for the first 2 args, and that `memcpy(dst, src, size)` could be implemented with `mov rcx,rdx` / `rep movsb` / `ret`. – Peter Cordes Jan 18 '20 at 23:31
  • @PeterCordes Even with setting 32-byte alignment the buffers turned out to be aligned on at least cache line size, e.g: `src` - `0x555555755100`; `dest` - `0x555555755080` and the measurement is reproduced if the two reads and writes are not within the same cache line. – St.Antario Jan 19 '20 at 04:16
  • Ok, so you happened to get the alignment you wanted. It would make a lot more sense to guarantee that in the source code with `_Alignas(64)`, to make sure it also happens for other people trying to reproduce your results on other systems with different compilers. – Peter Cordes Jan 19 '20 at 04:19
  • @PeterCordes _where are you getting any connection between rdtsc and flushing LFBs_ This was the possible cause I was thinking about. I just measured `resource_stalls.sb` and noticed almost `100x` difference between the 2 cases which might be relevant to the cases: `avx_ntcopy_64_two_cache_lines` - `103 421 551 resource_stalls.sb`; `avx_ntcopy_cache_line` - `16 603 956 158 resource_stalls.sb`. Initially the buffers were `4K` aligned and as @HadiBrais noted they might be suffering from 4K aliasing so I re-aligned the buffer to eliminate impact of it. – St.Antario Jan 19 '20 at 04:35
  • re: alignment. But at least you don't have to re-run your own experiment, assuming the data is from the non-4k version of the code. This would appear to indicate that a (fully) overlapping store to a not-yet-flushed WC buffer can just merge into it, while of course completing a line triggers an immediate flush, and doing that repeatedly is slow. – Peter Cordes Jan 19 '20 at 04:37
  • When you changed your code from alignas(4096) to alignas(32), you only changed 1 line in the perf outputs. It seems totally implausible that everything else would be identical, including total time. (Why doesn't your perf output include core clock cycles? `bus-cycles` is just ref cycles and redundant with elapsed time.) – Peter Cordes Jan 19 '20 at 04:39
  • @PeterCordes Fixed, thanks. – St.Antario Jan 19 '20 at 04:45
  • I don't see cycles *or* time anywhere in the 2_lines perf output. Seeing *total* time for the whole benchmark is a useful quick reference to see which one's faster, without trusting your `__rdpmc` numbers. – Peter Cordes Jan 19 '20 at 04:48
  • Are you sure the numbers are correct? You seem to have updated the core cycles numbers *before* changing `_Alignas` to 64-byte alignment, then changed the alignment to 64 bytes, and then updated the `perf` numbers only, but not the `rdpmc` numbers. Moreover, the `perf` numbers are highly unreliable. For one thing, there is too much event multiplexing. Remove all the events and keep only 4 of the stall events. Another thing: can you get rid of that `qsort` stuff? How about reporting the average or the minimum? Try increasing `ITERATIONS` and see whether the event counts scale with it. – Hadi Brais Jan 19 '20 at 08:38
  • @HadiBrais I got rid of `rdpmc` and all the `qsort`-related stuff collecting perf counters only and increased `ITERATIONS` to `10 000 000`. The difference in `cycles` was `14 529 703 536` vs `182 067 319 516` and sb stalls `2 358 034` vs `172 131 587 352`. – St.Antario Jan 19 '20 at 18:07

1 Answer


Hypothesis: a (fully) overlapping store to a not-yet-flushed WC buffer can just merge into it. Completing a line triggers an immediate flush, and having all those stores go all the way off core is slow.

You report ~100x more resource_stalls.sb for the full-line version than for the two-partial-lines version. That's consistent with this explanation.
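
(Working from the perf output in the question: 16 176 115 539 / 22 692 338 882 ≈ 71% of all cycles stalled with a full store buffer for avx_ntcopy_cache_line, versus 126 401 004 / 5 676 443 982 ≈ 2% for avx_ntcopy_64_two_cache_lines.)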

If the 2_lines version can commit its NT stores into existing WC buffers (LFBs), the store buffer can keep up with the rate at which store instructions execute, usually bottlenecking on something else. (Probably just the front-end, given the call/ret overhead for each pair of loads/stores, although of course call itself does include a store.) Your perf results show 1.8 billion stores (to L1) over 5.7 billion cycles, i.e. about 0.32 stores per cycle, well within the 1 store/cycle limit we might expect for stores hitting in the WC buffer.

But when a WC buffer gets flushed, which happens when the line is fully written, it has to go off core (which is slow), tying up that LFB for a while so it can't be used to commit later NT stores. When stores can't leave the store buffer, it fills up and the core stalls on allocating resources for new store instructions to enter the back-end; specifically, the issue/rename/allocate stage stalls.
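
If you want to experiment without the separate asm file, here is a rough intrinsics sketch of the same two store patterns (a sketch, not the OP's exact code: it uses the dst/src argument order from the C prototype and assumes 64-byte-aligned buffers). Un-commenting the _mm_sfence() in the full-line version makes the NT stores globally visible before returning, the variant the OP tried in the comments:

#include <immintrin.h>

//sketch: full 64-byte line written with NT stores -> the WC buffer is completed and flushed off core
static inline void ntcopy_one_full_line(void *dst, const void *src){
    __m256i lo = _mm256_load_si256((const __m256i *) src);
    __m256i hi = _mm256_load_si256((const __m256i *) ((const char *) src + 0x20));
    _mm256_stream_si256((__m256i *) dst, lo);
    _mm256_stream_si256((__m256i *) ((char *) dst + 0x20), hi);
    //_mm_sfence();   //un-comment to force the NT stores to become globally visible before returning
}

//sketch: 32 bytes into each of two lines -> both WC buffers stay open, later overlapping stores can merge
static inline void ntcopy_two_half_lines(void *dst, const void *src){
    __m256i a = _mm256_load_si256((const __m256i *) src);
    __m256i b = _mm256_load_si256((const __m256i *) ((const char *) src + 0x40));
    _mm256_stream_si256((__m256i *) dst, a);
    _mm256_stream_si256((__m256i *) ((char *) dst + 0x40), b);
}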

You could probably see this effect more clearly with any of the L2, L3, SQ, or offcore req/resp events that would pick up all this traffic outside of the L1. You include some L2 counters, but those probably don't pick up NT stores that pass through L2.
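
For example, something along these lines (the exact event names vary by microarchitecture, so check `perf list` on your machine):

perf stat -e cycles,resource_stalls.sb,offcore_requests.all_requests,offcore_requests.demand_rfo ./bin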


[Enhanced REP MOVSB for memcpy](https://stackoverflow.com/q/43343231) suggests that NT stores take longer for the LFB to "hand off" to outer levels of the memory hierarchy, keeping the LFB occupied long after the request starts its journey. (Perhaps to make sure a core can always reload what it just stored, or otherwise to avoid losing track of an in-flight NT store, to maintain coherency with MESI.) A later sfence also needs to know when earlier NT stores have become visible to other cores, so we can't have them invisible at any point before that.

Even if that's not the case, there's still going to be a throughput bottleneck somewhere for all those NT store requests. So the other possible mechanism is that they fill up some buffer and then the core can't hand off LFBs anymore, so it runs out of LFBs to commit NT stores into, and then the SB fills, stalling allocation.

They might merge once they get to the memory controller without each one needing a burst transfer over the actual external memory bus, but the path from a core through the uncore to a memory controller is not short.


Even doing 2x rdpmc for every 32 stores doesn't slow the CPU down enough to prevent the store buffer from filling; what you're seeing depends on running this in a relatively tight loop, not a one-shot execution with an empty store buffer to start with. Also, your suggestion that rdpmc or rdtsc won't be reordered wrt. the WC buffers flushing makes zero sense. Execution of stores isn't ordered wrt. execution of rdtsc.

TL:DR: your rdpmc to time an individual group of stores isn't helpful, and if anything hides some of the perf difference by slowing down the fast case that doesn't bottleneck on the store buffer.

  • I must have started writing my answer like a minute before this appeared, and mobile doesn't tell you another answer appeared.... anyway they seem identical in idea. – BeeOnRope Jan 19 '20 at 05:10
  • @BeeOnRope: heh. I was wondering why you posted another answer instead of commenting that you were sure that's how repeated NT stores to the same open line worked. I wasn't 100% sure, I don't remember what the answer was last time this came up, if it ever has on SO or in my own testing. – Peter Cordes Jan 19 '20 at 05:17
  • I am not 100% sure, but even before this evidence I would have guessed they work like that, in both aspects (immediate close and repeated stores). About repeated stores especially, it's hard to imagine how else they would work? – BeeOnRope Jan 19 '20 at 05:22
  • The one thing I would be least sure of is whether the same line, written fully with NT stores, then again, back to back, would really cause two flushes all the way to memory. It seems plausible that the second request might catch up to the first, at least some of the time, e.g. in an outer cache level. Maybe that does happen here, but of course performance would still suck unless it happens a lot. It would be interesting to see the offcore request events to see if there is a 1:1 correspondence with writes in the benchmark. – BeeOnRope Jan 19 '20 at 05:28
  • @BeeOnRope: Merging might happen in the memory controller itself and avoid actual memory-bus cycles. At that point it knows it's DRAM (or Optane DC PM...), not device memory / MMIO registers where writing might have side-effects. – Peter Cordes Jan 19 '20 at 05:37
  • I just added a bit of my answer to yours, because the rest was basically a dupe. – BeeOnRope Jan 19 '20 at 19:07
  • I think the system knows that it's not device memory/MMIO long before that, or least knows it can treat it as such, based on the memory type: this is WB memory, so I don't think the system has to do anything special to preserve any specific behavior of NT writes (indeed, those are not well defined since flushes can happen for various reasons) when it is to WB regions. So I agree it can be merged at the memory controller, but I don't think there is necessarily an issue before then either. – BeeOnRope Jan 19 '20 at 19:08
  • @BeeOnRope: Yeah, the CPU core itself has to know the memory type, but could that info be dropped when sending the store off core? On 2nd thought maybe not; L3 has to treat a WT store differently from a WB eviction or an NT store. My other reasoning was that the memory controller might be the first place that tries to match different requests with each other. That takes a buffer and some HW matching which might not be very useful in most places. Spamming NT stores to the same WB line is not something that's generally worth looking for because normal code doesn't do it in first place. – Peter Cordes Jan 19 '20 at 20:35
  • It could be dropped, I guess - I was more thinking that it knows it at the L1, and it knows it at the MC, so maybe it knows it in the middle too, but I didn't think too hard to get to it. I agree the MC seems the most likely. I think the caches definitely "coalesce" writes normally, but they don't have any special hardware to do that necessarily; it just happens as a natural consequence of normal WB caches: the value gets updated in the cache but if there is no immediate writeback, later writes get effectively coalesced. NT writes are obviously different since the semantic is basically WT. – BeeOnRope Jan 19 '20 at 20:43
  • So only when it gets to the MC queue would it seem like there is a good chance of coalescing the request (and AFAIK the MC definitely does this kind of coalescing). – BeeOnRope Jan 19 '20 at 20:43
  • I have to say I'm not satisfied with the analysis here. First, the claim that a WCB gets evicted when all of the 64 bytes are modified must be proven. The Intel SDM V3 (Section 11.3.1) clearly says that the WCB eviction protocol is implementation-dependent and there are no guarantees. Second, even if that's true, where exactly is the bottleneck? There are only 32 stores between the `rdpmc` instructions. On Skylake the SB size is 56. It takes tens of cycles to execute two `rdpmc`s, so why isn't that enough to drain most of the SB until the next 32 stores are allocated? – Hadi Brais Jan 20 '20 at 04:42
  • On Skylake, the L1-L2 bandwidth is 64 bytes/cycle, so it can evict a whole WCB in one cycle. Third, after changing the alignment of the buffers, the difference is still 10x, even after supposedly removing 4K aliasing. How does that make sense? Fourth, the TSC cycles vs. core cycles suggest that the core frequency is 3x higher than the TSC frequency, which is rather unusual. And finally, we don't know the standard deviation of any of the numbers; it could be high with huge run-to-run variations. We don't have the numbers to support any claims. A thorough and proper analysis is required. – Hadi Brais Jan 20 '20 at 04:42
  • @HadiBrais: Yes, the data leaves a lot to be desired. And yes WCB eviction on other uarches isn't guaranteed on paper to work this way, but I think we can conclude from this data that it *does* on the OP's CPU. Remember that the LFB itself has to track the in-flight store for longer than just handing it off to L2. [Enhanced REP MOVSB for memcpy](//stackoverflow.com/q/43343231) suggests that NT stores take longer to "hand off" to outer levels of the memory hierarchy than regular stores, perhaps to make sure the core doing the store can always reload what it stored. – Peter Cordes Jan 20 '20 at 04:49
  • @HadiBrais: Updated my answer to address some of your points in comments. Thanks for pointing out assumptions / leaps in logic that my answer left unexplained. I haven't tried to look at every data point of the OP's run, but given the different edits I don't think there's any reason to believe that any TSC count is from the same run as a core cycles count. The blocks of perf results did get edited all in one edit so at least we can assume it's all from the same run. – Peter Cordes Jan 20 '20 at 05:02
  • I guess you're referring to the statement "NT stores don't help and they can even hurt since they may increase the latency since the handoff time for the line buffer may be longer than a scenario where prefetch brings the RFO line into LLC" in Bee's answer? Although I don't see how Bee reached that conclusion. (Are there latency measurements in that Q/A that I've missed?) Note that I'm not saying your answer is wrong, rather it's on weak grounds. Your edits made it a little better, but a lot more needs to be done. – Hadi Brais Jan 20 '20 at 05:06
  • and I very much appreciate that your answer starts with "Hypothesis" – Hadi Brais Jan 20 '20 at 05:09
  • @BeeOnRope: ping, have a look at Hadi's recent comments. Do we have any evidence of NT stores definitely occupying an LFB for a long time, other than this Q&A? – Peter Cordes Jan 20 '20 at 05:22
  • The LFB occupancy time doesn't really matter; what matters is the effective throughput for continual WC evictions all the way to DRAM. That is going to have a limit somewhere, and because this is a test with a large number of iterations (and the median is taken), that number is going to be reported. About eviction, it seems fairly obvious to me that the line is probably evicted almost immediately after it is fully written: why would you wait? Of course the manual doesn't commit to a specific path: it rarely would on something that is basically just a performance difference. – BeeOnRope Jan 20 '20 at 13:52
  • @BeeOnRope: Oh good point. If the real bottleneck is farther out, there's no room to hand off an LFB at all so you do still run out of LFBs and get back pressure on the store buffer. – Peter Cordes Jan 20 '20 at 20:08
  • Exactly - you don't even have to understand the exact mechanism, just know that without "infinite buffering" the long term rate will just be the slowest bottleneck. Now this is different e.g., for the first few stores: before steady-state is reached - but the OP doesn't show these. – BeeOnRope Jan 20 '20 at 21:20
  • @BeeOnRope These stores are to the *same* cache line in the case of `avx_ntcopy_cache_line`. How exactly do you run out of LFBs? I don't think more than one LFB will be allocated for the same line at the same time (at least I have not seen anything like this in research papers or patents). The stores are either coalesced in the same WCB or a free WCB will be allocated, and there will always be a free WCB because there is nothing else happening (other than the loads to the *same* line). And saying "there's bottleneck somewhere" has really no meaning and explains nothing. Where and why? – Hadi Brais Jan 21 '20 at 05:27
  • @Hadi Do you agree that once a line is fully written by NT stores, it is flushed out all the way to memory? If so, I think the conclusion follows obviously from there. – BeeOnRope Jan 21 '20 at 05:52
  • @HadiBrais: we know that at some point after a WCB is completed, it initiates some process for communicating off core, invalidating all other copies of that line, and storing this copy. Presumably once it decides to do that, it can't undo that and re-open itself to merge in more overlapping stores. So it's no longer an LFB that can accept new stores to that line, and stores leaving the store buffer will need to start another LFB. (And the evidence here supports the conclusion that this process happens as soon as the line is completed; that makes obvious sense for the normal NT use-case.) – Peter Cordes Jan 21 '20 at 05:53