I am asking whether mov
instructions that need to compute a multi-component address, i.e. (in AT&T syntax)
mov i(r, r, i), reg
or mov reg, i(r, reg, i)
have to be executed on port 1, because they are effectively a 3-operand LEA + MOV, or whether they are free to execute on any of ports 0, 1, 5 and 6 (p0156).
If they do indeed execute the LEA portion on port 1, will port 1 be unblocked once the address computation is complete, or will the entire memory load need to complete first?
On ICL, it seems that p7 can handle indexed addressing modes?
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/* Attributes applied to benchmark functions: never inlined or cloned, and
 * placed on a 4096-byte boundary. */
#define BENCH_ATTR __attribute__((noinline, noclone, aligned(4096)))
/* Selects the store form used in test_store_port: 3 uses base + index*4
 * addressing; any other value uses base-only addressing. */
#define TERMS 3
/*
 * Store-addressing micro-benchmark.
 *
 * Runs a tight loop of 2^29 iterations; each iteration performs one 32-bit
 * store into a 64-byte-aligned stack buffer.  With TERMS == 3 the store uses
 * an indexed address (base + index*4, index = loop counter & 15); otherwise
 * it uses a base-only address.  The function takes no arguments and returns
 * nothing; it exists only to be measured with performance counters.
 */
void BENCH_ATTR
test_store_port(void) {
    /* Enum constant: a true integer constant expression, which the "i"
     * constraint needs.  A const-qualified local only satisfies "i" when
     * the optimizer happens to fold it. */
    enum { N = 1 << 29 };

    /* Scratch registers; both are written by the asm before being read. */
    uint64_t dst, loop_cnt;
    uint64_t src[16] __attribute__((aligned(64)));

    asm volatile(
        "movl %[N], %k[loop_cnt]\n\t"
        ".p2align 5\n\t"
        "1:\n\t"
        /* index = loop_cnt & 15 (32-bit ops zero-extend into the full reg) */
        "movl %k[loop_cnt], %k[dst]\n\t"
        "andl $15, %k[dst]\n\t"
#if TERMS == 3
        "movl %k[dst], (%[src], %[dst], 4)\n\t"
#else
        "movl %k[dst], (%[src])\n\t"
#endif
        "decl %k[loop_cnt]\n\t"
        "jnz 1b\n\t"
        /* "=&r": write-only scratch that is clobbered while the src input is
         * still live, so it must not share a register with any input.
         * "+m"(src): the loop stores into (part of) the buffer, so the
         * compiler must be told the memory is modified. */
        : [ dst ] "=&r"(dst), [ loop_cnt ] "=&r"(loop_cnt), "+m"(src)
        : [ N ] "i"(N), [ src ] "r"(src)
        : "cc");
}
/* Program entry point: runs the store benchmark once and exits successfully.
 * Command-line arguments are not used. */
int
main(int argc, char ** argv) {
    (void)argc;
    (void)argv;

    test_store_port();
    return 0;
}
Results with #define TERMS 3:
perf stat -e uops_dispatched.port_2_3 -e uops_dispatched.port_7_8 -e uops_issued.any -e cpu-cycles ./bsf_dep
Performance counter stats for './bsf_dep':
297,191 uops_dispatched.port_2_3
537,039,830 uops_dispatched.port_7_8
2,149,098,661 uops_issued.any
761,661,276 cpu-cycles
0.210463841 seconds time elapsed
0.210366000 seconds user
0.000000000 seconds sys
Results with #define TERMS 1:
perf stat -e uops_dispatched.port_2_3 -e uops_dispatched.port_7_8 -e uops_issued.any -e cpu-cycles ./bsf_dep
Performance counter stats for './bsf_dep':
291,370 uops_dispatched.port_2_3
537,040,822 uops_dispatched.port_7_8
2,148,947,408 uops_issued.any
761,476,510 cpu-cycles
0.202235307 seconds time elapsed
0.202209000 seconds user
0.000000000 seconds sys