Why does Delphi insert nop's in the middle of nowhere?

Question

The following code:

while Assigned(p) do begin
  np:= p^.next;
  h:= leaf_hash(p^.data);   <<-- inline routine
  h:= h mod nhashprime;
  p^.next:= nhashtab[h]; 
  nhashtab[h]:= p;
  p:= np;
end; { while }

Produces the following assembly:

hlife.pas.605: h:= leaf_hash(p^.data);
00000000005D4602 498B4018         mov rax,[r8+$18]
00000000005D4606 48C1E830         shr rax,$30
00000000005D460A 498B5018         mov rdx,[r8+$18]
00000000005D460E 48C1EA20         shr rdx,$20
00000000005D4612 81E2FFFF0000     and edx,$0000ffff
00000000005D4618 4D8B5818         mov r11,[r8+$18]
00000000005D461C 49C1EB10         shr r11,$10
00000000005D4620 4181E3FFFF0000   and r11d,$0000ffff
00000000005D4627 418B7018         mov esi,[r8+$18]
00000000005D462B 81E6FFFF0000     and esi,$0000ffff
00000000005D4631 488D34F6         lea rsi,[rsi+rsi*8]
00000000005D4635 4403DE           add r11d,esi
00000000005D4638 4F8D1CDB         lea r11,[r11+r11*8]
00000000005D463C 4103D3           add edx,r11d
00000000005D463F 488D14D2         lea rdx,[rdx+rdx*8]
00000000005D4643 03C2             add eax,edx
hlife.pas.606: h:= h mod nhashprime;
00000000005D4645 8BC0             mov eax,eax   <<--- Why is there a NOP here?
00000000005D4647 4C63DB           movsxd r11,rbx
00000000005D464A 4899             cwd
00000000005D464C 49F7FB           idiv r11
00000000005D464F 488BC2           mov rax,rdx
hlife.pas.607: p^.next:= nhashtab[h];
00000000005D4652 488B5538         mov rdx,[rbp+$38]

Delphi inserts a NOP before the nhashtab[h]:= p; line. If the leaf_hash function had been a normal function it would have made sense.
_{(no not really because the RET would still return to [5D4645] executing the nop)}
But now this is not a jump target.

So I'm (just) curious, Why does it do this?

[EDIT] : SSCCE
OK I've got a SSCCE up {it's not very short but it will have to do.

Note the compiler settings (Debug + Win64)

enter image description here

unit Unit16;

interface

uses
  Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
  Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.StdCtrls;

type
  pnode = ^node;
  tflavour = (tnode, tleaf, tleaf64);

  node = record
    case flavour: tflavour of
      tnode: (next: pnode; (* hash link *)
          nw, ne, sw, se: pnode; (* constant; nw not= 0 means nonleaf *)
          res: pnode); (* cache *)
      tleaf: (next1: pnode; (* hash link *)
          isnode: pnode; (* must always be zero for leaves *)
          nw1, ne1, sw1, se1: word; (* constant *)
          res1, res2: word; (* constant *)
        );
      tleaf64: (next2: pnode; (* hash link *)
          isnode1: pnode; (* must always be zero for leaves *)
          data: Uint64; (* constant *)
          res1a, res2a: word; (* constant *)
        )
  end;

  ppnode = array of pnode;

  THashBox = class(TPersistent)
  strict private
    leafhashpop: integer;
    leafhashlimit: integer;
    leafhashprime: integer;
    leafhashtab: ppnode;
    nodehashpop: integer;
    nodehashlimit: integer;
    nodehashprime: integer;
    nodehashtab: ppnode;
  private
    TotalTime, Occurrences: Uint64;
    StartTime, EndTime: Uint64;
    procedure resize_leaves();
  public
    constructor Create;
  end;

  TForm16 = class(TForm)
    Button1: TButton;
    procedure Button1Click(Sender: TObject);
  private
    HashBox: THashBox;
  public
  end;

var
  Form16: TForm16;

implementation

{$R *.dfm}

const
  maxmem = 2000*1000*1000;   {2GB}

var
  alloced: cardinal;

function rdtsc: int64; assembler;
asm
  { xor eax,eax;
  push rbx
  cpuid
  pop rbx }
  rdtsc
end;

function node_hash(n: pnode): cardinal; { inline; } assembler; overload;
// var
// a: pnativeint;
  // begin
  // Result:= nativeint(n^.se) + 3 * (nativeint(n^.sw) + 3 * (nativeint(n^.ne) + 3 * nativeint(n^.nw) + 3));
asm
  mov eax,[rcx+node.nw]
  lea eax,[eax+eax*2+3]
  add eax,[rcx+node.ne]
  lea eax,[eax+eax*2]
  add eax,[rcx+node.sw]
  lea eax,[eax+eax*2]
  add eax,[rcx+node.se]
end;

function leaf_hash(a, b, c, d: cardinal): cardinal; inline; overload;
begin
  Result:= (d + 9 * (c + 9 * (b + 9 * a)))
end;

function leaf_hash(data: Uint64): cardinal; inline; overload;
begin
  // Result:= d + 9 * (c + 9 * (b + 9 * a));
  Result:= ((data shr 48) + 9 * (((data shr 32) and $FFFF) + 9 * (((data shr 16) and $FFFF) + 9 * (data and $FFFF))));
  Inc(Result);
end;

procedure TForm16.Button1Click(Sender: TObject);
begin
  HashBox:= THashBox.Create;
  Hashbox.resize_leaves;
end;

function nextprime(old: integer): integer;
begin
  Result:= 1009;
end;

constructor THashBox.Create;
begin
  leafhashprime:= 7;
  SetLength(leafhashtab, leafhashprime);
end;

procedure THashBox.resize_leaves();
var
  i, i1, i2: integer;
  nhashprime: Cardinal;
  p: pnode;
  nhashtab: ppnode;
  np: pnode;
  h: Integer;
  n, n2: integer;
  diff1, diff2: integer;
begin
  nhashprime:= nextprime(4 * leafhashprime);
  if (nhashprime * sizeof(pnode) > maxmem - alloced) then begin
    leafhashlimit:= 2000 * 1000 * 1000;
    exit;
  end;
  (*
    *   Don't let the hash table buckets take more than 4% of the
    *   memory.  If we're starting to strain memory, let the buckets
    *   fill up a bit more.
  *)
  if (nhashprime > maxmem div 100) then begin
    nhashprime:= nextprime(maxmem div 100);
    if (nhashprime = leafhashprime) then begin
      leafhashlimit:= 2000 * 1000 * 1000;
      exit;
    end;
  end;
  SetLength(nhashtab, nhashprime); //make a new table, do not resize the existing one.
  alloced:= alloced + sizeof(pnode) * (nhashprime - leafhashprime);

  diff1:= maxint;
  for i1:= 0 to 100 do begin
    n:= 0;
    StartTime:= rdtsc;
    for i:= 0 to leafhashprime - 1 do begin
      p:= leafhashtab[i];
      if Assigned(p) then begin
        h:= node_hash(p);
        h:= h mod nhashprime;
        inc(n, h);
      end;
    end;
    EndTime:= rdtsc;
    if ((EndTime - StartTime) < diff1) then diff1:= (EndTime - StartTime);

  end;

  diff2:= maxint;
  for i1:= 0 to 100 do begin
    n2:= 0;
    StartTime:= rdtsc;
    for i:= 0 to leafhashprime - 1 do begin
      p:= leafhashtab[i];
      if Assigned(p) then begin
        inc(n2);
      end;
    end;
    EndTime:= rdtsc;
    if (endtime - starttime) < diff2 then diff2:= endtime - starttime;
  end;

  TotalTime:= diff1 - diff2;
  if n <> n2 then Occurrences:= nhashprime;

  for i:= 0 to leafhashprime - 1 do begin
    // for (p=hashtab[i]; p;) {
    p:= leafhashtab[i];
    while Assigned(p) do begin    <<--- put a breakpoint here
      np:= p^.next;
      h:= leaf_hash(p^.data);
      h:= h mod nhashprime;
      p^.next:= nhashtab[h];
      nhashtab[h]:= p;
      p:= np;
    end; { while }
  end; { for i }
  // free(hashtab);
  leafhashtab:= nhashtab;
  leafhashprime:= nhashprime;
  leafhashlimit:= leafhashprime;
end;

end.

You will see this disassembly:

Unit16.pas.196: h:= h mod nhashprime;
000000000059CE4B 4863C0           movsxd rax,rax
000000000059CE4E 448B5528         mov r10d,[rbp+$28]
000000000059CE52 458BD2           mov r10d,r10d     <<--- weird NOP here 
000000000059CE55 4899             cwd
000000000059CE57 49F7FA           idiv r10
000000000059CE5A 488BC2           mov rax,rdx
Unit16.pas.197: p^.next:= nhashtab[h];
000000000059CE5D 488B5538         mov rdx,[rbp+$38]

I will guess for debugging possibly? I honestly have no clue though — Jerry Dodge, Sep 27 '13 at 01:50
My best guess would be some form of optimization to align the code bytes on a certain boundary. — Glenn1234, Sep 27 '13 at 03:46
This question sorely needs an SSCCE. Without one we cannot probe the compiler. I'd like to do that, and with different versions of the compiler. But I cannot. — David Heffernan, Sep 27 '13 at 08:24
@DavidHeffernan OK SSCCE is up, it's not very short, but it does the job. — Johan, Sep 27 '13 at 10:38
Of course the length of the code inside the loop needs to be a multiple of 16bytes. So the no-ops are ok as such it's just the placement of them, but perhaps it is awkward to place them at the end for some reason, or perhaps it works out ok, because the nop can break a dependency chain and thus cost no cycles to execute. Keep in mind that with XE2 the x64 compiler has a whole new code generation engine, so the same-old same-old code generation that we know from D3-DXE is out the door. — Johan, Sep 28 '13 at 03:07

score 5 · Accepted Answer · answered Oct 06 '13 at 16:09

The answer is that

MOV EAX,EAX

Is not a no-op

On x64 manipulating the lower 32 bits of a 64 bit register will zero the upper 32 bits.
So the above instruction should really be read as:

MOVZX RAX,EAX

According to AMD

Results of 32-bit operations are implicitly zero extended to 64-bit values.

Arnaud Bouchez · Answer 2 · 2013-09-27T07:42:00.103

4

IMHO this is not a nop for alignement, but it sounds to me just like un-optimized generated code, and wrong signing of your own variables.

h:= h mod nhashprime;

May be split into:

mov eax,eax       new h = eax, old h = eax  // which does not mean anything
movsxd r11,rbx    convert with sign nhashprime stored in rbx into temp registry r11
cwd               signed rax into rdx:rax
idiv r11          signed divide rdx:rax by r11 -> rax=quotient, rdx=remainder
mov rax,rdx       store remainder rdx into rax

Did you try enabling the code generation optimization? I suppose it will fix the content of mov eax,eax.

But your original code is also sub-optimized. You should use unsigned arithmetic in your case.

And, you should better use a power of two nhashprime, the compute a simple and binary operation instead of a slow division:

var h, nhashprimeMask: cardinal; // use UNSIGNED arithmetic here!

// here nhashprime is a POWER OF TWO (128,256,512,1024,2048...)
nhashprimeMask := nhashprime-1; // compute the mask

while Assigned(p) do begin
  np:= p^.next;
  h:= leaf_hash(p^.data) and nhashprimeMask;
  p^.next:= nhashtab[h]; 
  nhashtab[h]:= p;
  p:= np;
end; { while }

This code will be much faster, and should much better compile.

edited Sep 27 '13 at 07:42

answered Sep 27 '13 at 06:08

Arnaud Bouchez

42,305
3
71
159

2

Anything other than a prime size for hashtab will lead to more hash collisions. Right now I'm exactly at the theoretical optimum. If I replace the primenumber with a power of two, the hash collisions go way way up (from 18% collisions at halffull to 81% at halffull) – Johan Sep 27 '13 at 08:45
Yes, optimization is on – Johan Sep 27 '13 at 09:22
When correcting the ints to cardinals the problem does go away. When turning optimization **off** the problem also disappears. – Johan Sep 27 '13 at 10:40
@Johan You have to compute the power of two value properly. See for instance how it is done in the RTL code (e.g. generic collections) or in mORMot. In practice, it is not as awful as you stated, [or your hash algorithm is to be re-considered](http://stackoverflow.com/q/1145217/458259) (e.g. we use crc32 with preallocated tables with great success). AFAIK modern CPUs are clever enough to *ignore the NOP* when they convert the x64 asm into internal micro-ops before execution. Only issue may be cache miss, not a big problem in your case. – Arnaud Bouchez Sep 27 '13 at 13:21
I'll check out mORMot. – Johan Sep 27 '13 at 14:06

score 0 · Answer 3 · answered Sep 27 '13 at 05:09

0

It is an optimization for aligning code, especially in loops, to avoid cache line stalls and such.

answered Sep 27 '13 at 05:09

Remy Lebeau

555,201
31
458
770

1

I would expect those alignment NOP's to be at the end or beginning of loops, not somewhere in the middle, unless there's something I'm not getting here. What's the purpose of putting them in the middle where there's no branch target? – Johan Sep 27 '13 at 05:14
1

Are you going to explain how this results in optimization? Because it sure looks like it won't. – David Heffernan Sep 27 '13 at 05:46
1

I agree with David. Alignment is to be made before the loop label, to a aligned to 4/8 bytes boundary. Here there is no alignment, just wrong optimization. – Arnaud Bouchez Sep 27 '13 at 05:56

Why does Delphi insert nop's in the middle of nowhere?

3 Answers3