Assembler quicksort slower than java array.sort?

Question

I have coded an assembler QuickSort and found that is it slower than the array.sort in java. I am wondering if anyone would know of why I cannot match the speed of some of the higher level languages (I am assuming C would probably be quicker also)

I have tested with 1,000,000 * 1024 bytes. I get approx 2 seconds to complete the sort while Array.Sort in Java will do it in about .8 second

I don't think there is any problem in actually moving the data as I have independently tested that and it moves quite quickly.

    %macro $copyBytes 3
        mov RSI,%1                                  ; Grab the Src Address
        mov RDI,%2                                  ; Grab the Dst Address
        mov rcx,%3                                  ; Grab the length
    %%l:movsb                                       ; Move the data
        loop %%l    
    %endmacro

    %imacro $getAddress 2
        mov %1,r_Table                              ; Start of Table to RSI/RDI
        mov rax,%2                                  ; Lower/Upper to RAX
        mul r_recordLength                          ; Multiply by Record Length
        add %1,rax                                  ; add offset to RSI/RDI
    %endmacro

    %define r_recordLength  r8
    %define r_startPos      r10             
    %define r_noOfBytes     r11
    %define r_Ubound        r15

    %define @left           r10
    %define @right          r11
    %define @pivotValue     r12
    %define @prevLeft       r13

    [section .bss]
        d_startPos          resq 1
        d_noOfBytes         resq 1      
        pivotValue          resb 128                ; This gives tha max sortkey size
        LeftAddress         resq 1
        RightAddress        resq 1
    [section .text]

;   Call the recursive SORT
    push 0                                          ; Push the 1st Left
    push r_Ubound                                   ; Push the 1st Right
    Call _QSort
    pop rax
    pop rax


;-----------------------------------------------------------------------
;   Recursive Sort
;-----------------------------------------------------------------------
_QSort:

    mov @left,qword[rsp+16]                         ; get the Left Index
    mov @right,qword[rsp+8]                         ; get the Right Index

;   if  left >= right                   ; Exit when Left is not < Right
    cmp @left,@right
        jnl .Exit

;   pivotValue = RandomNo[ [ left + right / 2 ] ] CALCULATE pivotValue
    mov rax,@left                                   ; Grab the left
    add rax,@right                                  ; add the right
    shr rax,1                                       ; and divide by 2
    $getAddress RSI,rax                          ; get the address of this slot
    $copyBytes RSI,pivotValue,qword[d_noOfBytes]     ; and copy it to pivotValue


;   prevLeftIdx = [moveThem]
    push @left                                      ; preserve the Left index
    push @right                                     ; preserve the Right index
    call _moveThem                                  ; do the moves
    pop @right                                      ; restore the Right index
    pop @left                                       ; restore the Left index                                                        
    mov @prevLeft,rax                               ; and save the returned value

;   -------------------------------------
;   Recursive Call - (left, prevLeft - 1)
;   -------------------------------------
    push @left                                      ; preserve the Left index
    push @right                                     ; preserve the Right index
    push @left                                      ; push the 1st parameter (Left)
    mov rax,@prevLeft                               ; get the previous left
    dec rax                                         ; minus 1
    push rax                                        ; push the 2nd parameter
    Call _QSort                                     ; Call yourself     
    pop rax                                         ; Junk this
    pop rax                                         ; Junk this
    pop @right                                      ; restore the Right index
    pop @left                                       ; restore the Left index

;   ----------------------------------
;   Recursive Call - (prevLeft, right)  
;   ----------------------------------
    push @left                                      ; preserve the Left index
    push @right                                     ; preserve the Right index
    push @prevLeft                                  ; push the 1st parameter (previous left
    push @right                                     ; push the 2nd parameter (Right0
    Call _QSort                                     ; Call yourself
    pop rax                                         ; Junk this
    pop rax                                         ; Junk this
    pop @right                                      ; restore the Right index
    pop @left                                       ; restore the Left index

    .Exit:
_QSort_EXIT:
    ret

_moveThem:

;   repeat.while (left <= right)
    cmp @left,@right                                ; Compare Left to Right Index
        jg .Exit                                    ; exit if Left is > Right

;   repeat.while (Key[left] < pivotValue)
;           left    = left + 1
;   end.repeat
    .start1:$getAddress RSI,@left                    ; Get the Start of the record 
            add RSI,qword[d_startPos]               ; add the requested starting position
            mov qword[LeftAddress],RSI              ; and save it
            mov RDI,pivotValue                      ; Get the start of the pivotValue
            mov rcx,qword[d_noOfBytes]              ; Grab the length (LOOP Ctr)
            cld                                     ; Clear the direction flag - left to right
    repe    cmpsb                                   ; Compare the String
            jnl .cont1                              ; If result not < than exit While Loop                                          
            inc @left                               ; get the next table slot
            jmp .start1                             ; and repeat
    .cont1:

    cmp @left,@right                                ; Compare Left to Right Index
        jg .Exit                                    ; exit if Left is > Right

;   repeat.while (Key[right]> pivotValue)
;       right   = right - 1
;   end.repeat
    .start2:$getAddress RSI,@right                   ; Get the Start of the record 
            add RSI,qword[d_startPos]               ; add the requested starting position
            mov qword[RightAddress],RSI             ; and save it
            mov RDI,pivotValue                      ; Get the start of the pivotValue 
            mov rcx,qword[d_noOfBytes]              ; Grab the length (LOOP Ctr)
            cld                                     ; Clear the direction flag - left to right
    repe    cmpsb                                   ; Compare the string    
            jng .cont2                              ; If result not > than exit While Loop  
            dec @right                              ; get the previous table slot
            jmp .start2                             ; and repeat
    .cont2:

    cmp @left,@right                                ; Compare Left to Right Index
        jg .Exit                                    ; exit if Left is > Right
        jl .DoTheMove                               ; Do the move if less than

    ;   No need to do the move if left = right
        inc @left                                   ; increment the left 
        jmp .Exit                                   ; and head for the exit


;   if left <= right - SWAP left and right data
.DoTheMove:
    mov RSI,qword[LeftAddress]                      ; grab the Left address
    mov RDI,qword[RightAddress]                     ; grab the Right address        
    mov rcx,r_recordLength                          ; and grab the record length
    .start3:cmp rcx,8                               ; Have we still got 8 bytes to move
                jb .start4                          ; If not go and pick up the last few bytes
            mov rax,qword[RSI]                      ; grab the left 8 bytes                                     
            mov rbx,qword[RDI]                      ; grab the right 8 bytes
            xchg rax,rbx                            ; swap them over
            mov qword[RSI],rax                      ; put the right 8 bytes to the left
            mov qword[RDI],rbx                      ; put the left 8 bytes to the right 
            add RSI,8                               ; setup RSI for the next 8 bytes
            add RDI,8                               ; setup RDI for the next 8 bytes                                                    
            sub rcx,8                               ; take 8 away from the counter
            jmp .start3                             ; and loop around

    .start4:cmp rcx,0                               ; Have we still got anything to move
                jna .cont4                          ; If not we are done            
            mov ah,byte[RSI]                        ; grab the left byte                                            
            mov al,byte[RDI]                        ; grab the right byte
            xchg ah,al                              ; swap them over
            mov byte[RSI],ah                        ; put the right byte to the left
            mov byte[RDI],al                        ; put the left byte to the right
            add RSI,1                               ; setup RSI for the next byte
            add RDI,1                               ; setup RDI for the next bytes
            sub rcx,1                               ; take 1 away from the counter
            jmp .start4                             ; and loop around
    .cont4:

    inc @left                                       ; Next Left
    dec @right                                      ; Previous Right
    jmp _moveThem                                   ; Back to the Start

.Exit:  
    mov rax,@left                                   ; Return the Left value

_moveThem_EXIT:
    ret

import java.util.Arrays;

class QuickSort {

    public static void Sort(String[] strArray, int lowerIndex, int higherIndex) 
    {        
        int left = lowerIndex;
        int right= higherIndex;
        int x;
        String pivot, temp;

        pivot = strArray[lowerIndex+(higherIndex-lowerIndex)/2];

        while (left <= right) {

          x = strArray[left].compareTo(pivot);
          while (x < 0) {
            left++;  
            x = strArray[left].compareTo(pivot);
          }

          x = strArray[right].compareTo(pivot);
          while (x > 0) {
            right--;
            x = strArray[right].compareTo(pivot);
          }

          if (left <= right) {
            temp = strArray[left];
            strArray[left] = strArray[right];
            strArray[left] = temp;
            left++;
            right--;
          }
        }

        if (lowerIndex < right)
            Sort(strArray,lowerIndex,right);

        if (left < higherIndex)
            Sort(strArray,left,higherIndex);
    }
}

public class V2_11 {

  public static void main(String[] args) {

    String strArray[] = new String[1000001];       
    String letters[] = {"a","b","c","d","e","f","g","h","i","j",
                        "k","l","m","n","o","p","q","r","s","t",
                        "u","v","w","x","y","z"};
    String word;
    String spaces[] = new String[998];
    int random;

  //---------------
  //Build the Array
  //---------------
    System.out.println("Filling Array");  
    System.out.println(java.time.LocalTime.now());  
    for (int i=0;i < 1000000;i++){ 
      word="";
      for (int j=0;j< 26;j++){ 
        random = (int)(Math.random() * 9 + 0);
        word=word+letters[random];
      }
      strArray[i]=word+spaces;
    }   
    System.out.println(java.time.LocalTime.now());

  //--------------
  //Sort the Array
  //--------------
    System.out.println("Sorting");  
    System.out.println(java.time.LocalTime.now());    
  //---Arrays.sort(strArray);
    QuickSort.Sort(strArray,0, 999999);   
    System.out.println(java.time.LocalTime.now());

  //-------------------
  //Display some Reults
  //-------------------
    for (int i=100;i< 110;i++)
    { 
      System.out.println(strArray[i]);
    }
  }
}

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <stdlib.h>

    struct timeval start, stop;
    double secs = 0;

    char strArray[1000000][27];
    char pivotValue[27]; 
    char temp[27]; 
    int prevLeftIdx;

int moveThem(int left, int right)
{   int x;

    while (left <= right) {

       x = strcmp(strArray[left],pivotValue);
       while (x < 0) {
         left++;  
            x = strcmp(strArray[left],pivotValue);
       }

       x = strcmp(strArray[right],pivotValue);
       while (x > 0) {
         right--;
         x = strcmp(strArray[right],pivotValue);
       }

       if (left <= right) {
         strcpy(temp,strArray[left]);
         strcpy(strArray[left],strArray[right]);
         strcpy(strArray[right],temp);
         left++;
         right--;
       }
    }

    return left;

}

int QuickSort(int left, int right) 
{

    if (left>=right)
        {return 0;}

    strcpy(pivotValue, strArray[(left+right)/2]);
    prevLeftIdx=moveThem(left,right);

    QuickSort(left,prevLeftIdx-1);
    QuickSort(prevLeftIdx,right);      

}


int main()
{

    char letters[26] = "abcdefghijklmnopqrstuvwxyz";
    char c_Null[1] = "\0";
    char word[27];
    int i,j,random;

  //---------------
  //Build the Array
  //---------------
    puts("Filling Array"); 

    gettimeofday(&start, NULL);
    srand(time(NULL));
    for (i=0;i < 1000000;i++){ 
      for (j=0;j< 26;j++){ 
        random = rand() % 24;
        word[j] = letters[random];
      }
      word[j] = *c_Null;
      strcpy(strArray[i],word);
    }   
    gettimeofday(&stop, NULL);
    secs = (double)(stop.tv_usec - start.tv_usec) / 1000000 + (double)(stop.tv_sec - start.tv_sec);
    printf("time taken %f\n",secs);

  //--------------
  //Sort the Array
  //--------------
    puts("Sorting"); 

    gettimeofday(&start, NULL);   
    QuickSort(0, 999999);   
    gettimeofday(&stop, NULL);
    secs = (double)(stop.tv_usec - start.tv_usec) / 1000000 + (double)(stop.tv_sec - start.tv_sec);
    printf("time taken %f\n",secs);


  //-------------------
  //Display some Reults
  //-------------------
   for (i=0;i<10 ;i++)
   { 
     puts(strArray[i]); 
   }

}

RESULTS FOR C program
Filling Array
time taken 0.245878
Sorting
time taken 0.473481
aaaagajqpbutmaxxhgjkauwmwg
aaaakestelpvsrgiemafwtujev
aaaamnvrdueegvfilkvsabdomd
aaaapspaurwurvecikmjibkwfk
aaaauclobckwflwushhnhiafws
aaabaiueabmqlumijuhlrvmgtf
aaabcqehhoeegkhpeondnbsqjb
aaabmwxdrcpfxlhbmggxvjcaoc
aaabplwowigaxmpjipqxkntilc
aaacjsuqkmbwnglltqsspvvtlr

I expect the answer to this will be something like the answer to [C++ code for testing the Collatz conjecture faster than hand-written assembly - why?](https://stackoverflow.com/q/40354978/7509065) - because the hand-written assembler is really bad. — Joseph Sible-Reinstate Monica, Jun 02 '20 at 04:22
[edit] the question to include the code. Use comments to reply, not answers unless you actually have an answer to your own question. In which case yes, post an answer to your own question. — Peter Cordes, Jun 02 '20 at 04:29
Yup, Joseph was correct. You've invented your own inefficient custom calling convention, passing args on the stack instead of registers. And you're using double recursion instead of a loop. And you're comparing strings with slow `repe cmpsb` instead of fast SIMD (SSE2 `pcmpeqb`). Swapping doesn't need `xchg`, just store opposite registers. You are at least swapping in 8-byte chunks before byte-at-a-time. Also inefficient loop structure with a bunch of unnecessary jumps; [Why are loops always compiled into "do...while" style (tail jump)?](https://stackoverflow.com/q/47783926). — Peter Cordes, Jun 02 '20 at 05:09
IDK if there are any algorithmic problems like pivot choice; inefficient implementation of a good algorithm would be a sufficient explanation for a factor of 2 slowdown. — Peter Cordes, Jun 02 '20 at 05:10
Another issue is that Java is sorting references, so it moves only 8 (or 4) bytes to relocate an element, not 1024. — Gene, Jun 02 '20 at 05:12
What are `@left` and `@right`? I think they must be macros for register names, otherwise `cmp @left,@right` wouldn't assemble. This doesn't look fully complete; if I wanted to profile this myself I wouldn't be able to copy/paste and run it, so it's not quite a [mcve]. — Peter Cordes, Jun 02 '20 at 05:12
To answer a few comments. - 1) I have not invented any calling conventions. Parameters are passed into this routine using registers, the recursion is however handle via the stack. 2) SIMD (SSE2 pcmpeqb). - does that not only check for equality ? 3) "xchg" - Yes you are right, thank you. 4) Not sure if the jumps are causing any real slowness. — Roger Tunnicliffe, Jun 02 '20 at 05:34
5) "Sorting References" - Yes, I have a version of the code that only swaps the references but felt this version was clearer to use as an example. Surprisingly swapping the addresses only does not improve the speed, cmpsb is surprisingly fast. — Roger Tunnicliffe, Jun 02 '20 at 05:35
You are in the main `_Qsort` function itself: `mov @right,qword[rsp+8]` loads stack args, which you push before hand. Since it's recursive instead of looping, you pay that cost repeatedly. You're right that the helper functions are just pushing/popping around every `call` instead of keeping values in call-preserved registers that you save/restore once at the start/end of the function, which is bad in a different way. (And unless those regs are RDI,RSI,... or RCX,RDX,.., that's not Windows x64 or x86-64 System V). I'd suggest writing this in C and looking at optimized compiler output. — Peter Cordes, Jun 02 '20 at 05:35
6) "@left" is r10 - "@right" is R11 7) Yes the code is complete (other than the data it acts upon. You WOULD be able to copy it and assemble it. — Roger Tunnicliffe, Jun 02 '20 at 05:42
Are your 1024-byte elements fully randomized, so `repe cmpsb` usually finds a difference in the first couple bytes? Even so, `repe cmpsb` has significant startup overhead on most CPUs; what CPU do you have? And how did you measure that it's cheap? Have a look at how glibc Implements `memcmp`: `pcmpeqb` to find the first difference, then bit-scan and a scalar compare on the byte-position that differed. https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memcmp.S#L73 (AT&T syntax). That's good if it's not rare for the first difference to be many bytes into the array. — Peter Cordes, Jun 02 '20 at 05:42
7) It's missing `%define @left r10` and `r11`, so no, it's not *fully* complete. That's the point I was making with that comment. Also `$getAddress` and `$copyBytes` don't seem to be defined. copy/paste your answer into a file and try running `nasm -f elf64 foo.asm` — Peter Cordes, Jun 02 '20 at 05:43
"I'd suggest writing this in C and looking at optimized compiler output." From my experience that would simply produce a call to clib - not something I really want to delve into. — Roger Tunnicliffe, Jun 02 '20 at 05:45
I meant actually writing this algorithm in C, not calling `qsort()`, obviously. C compilers can turn tail recursion into looping, and are often able to optimize general recursion into tail recursion, and at least make this only singly recursive. And they know how to optimize loops over arrays (e.g. to swap, although gcc/clang won't auto-vectorize search loops. But you can call `memcmp`.) See [How to remove "noise" from GCC/clang assembly output?](https://stackoverflow.com/q/38552116) — Peter Cordes, Jun 02 '20 at 05:48
Condering this is sorting 1,000,000 1024k records (with all the work thats involved in doing that - [I have used a 26byte sortkey]) I am curious as to the no of push/pops involved and as to wether that would amount to an over 100% degradation in performance. I haven't check how deep this algorythm goes but maybe I should check that. — Roger Tunnicliffe, Jun 02 '20 at 05:59
The other consideration I have is clarity. At the moment the code is very easy to understand (It replaced a Shell sort) and I would hope any changes would only make it clearer. — Roger Tunnicliffe, Jun 02 '20 at 06:00
Gene is probably right: you probably can't make Java copy whole objects while sorting, only sort references. If so, Java is doing that much less work than asm. I'm surprised that's only ~2x slower. The number of push/pop probably isn't a huge factor; each call to `_moveThem` probably takes long enough that the push/pops around it are minor; it does plenty of looping. Although with no small-size basecase fallback, it does get called about N/2 times with a size of 2 elements, right? Many QuickSort implementations fall back to an InsertionSort for fewer than 32 or 16 elements. — Peter Cordes, Jun 02 '20 at 06:09
"Gene is probably right: you probably can't make Java copy whole objects while sorting, only sort references." - That's the curious thing. The version I have that simply swaps array element address's is not particularly faster than this one that swaps all the data. Maybe only 30% better. I was surprised — Roger Tunnicliffe, Jun 02 '20 at 06:14
How do you initialize the data you're sorting? If it's all zeroed so you don't need any swapping, then you're spending most of your time just doing compares. Or somewhere in between if it's only somewhat random. Also, ping me on replies with @PeterCordes; you get notified because we're commenting under your post; I'm only seeing your replies because I left this tab open and refreshed manually. — Peter Cordes, Jun 02 '20 at 06:32
@PeterCordes. memcmp is again what looks like a call to clib -> callq 0x5555555545d0 . As far as initialising the data I randomly select letters from a to z and concatenate each letter to a 26byte field. Should be fairly random. — Roger Tunnicliffe, Jun 02 '20 at 06:45
Doesn't your data structure have a length field? Do you set that to `1`, or is that random as well? If it's `1` then lots of elements will compare equal to each other. And yes, `memcmp` is in libc. I suggested using it because gcc won't auto-vectorize a loop unless the trip-count can be calculated before the first iteration. But that's not essential, you can just write a scalar compare loop and let GCC turn it into something other than `repe cmpsb`. — Peter Cordes, Jun 02 '20 at 06:50
Oh wow, your last update with `$copyBytes` is the worst inefficiency yet. [`loop` is slow (on Intel CPUs)](https://stackoverflow.com/questions/35742570/why-is-the-loop-instruction-slow-couldnt-intel-have-implemented-it-efficiently), and `movsb` is also slow. If you're copying more than like 8 bytes with that, use `rep movsb` to copy RCX bytes with efficient microcode. (It has startup overhead but unlike `repe cmpsb` the fast strings / ERMSB microcode *can* use wider loads/stores. [Enhanced REP MOVSB for memcpy](https://stackoverflow.com/q/43343231)) — Peter Cordes, Jun 02 '20 at 06:55
`$getAddress` is also somewhat inefficient; use `imul %1, r_recordLength` or something instead of `mul`, since you don't need the RDX high half of the full result. Or better, increment pointers *by* the record length, instead of multiplying all the time. ([Strength Reduction optimization](https://en.wikipedia.org/wiki/Strength_reduction)) — Peter Cordes, Jun 02 '20 at 06:58
@PeterCordes - $copyBytes IS used to copy very small amounts and only used to grab the pivot point. Although it might not be the most efficient way to do it (and this is all part of my progress in writing more efficient code) I don't think it would account for a 100% increase in sort time. — Roger Tunnicliffe, Jun 02 '20 at 07:01
No, that alone wouldn't. But inefficiencies often add up when out-of-order execution can't hide them. And for what it's doing, it's super bad. You know you have a 128 byte static buffer, so you could just copy the first 32 or 16 bytes unconditionally then loop until you've copied enough bytes. Or just keep a pointer to the pivot element instead of copying its key. — Peter Cordes, Jun 02 '20 at 07:08
Or just keep a pointer to the pivot element instead of copying its key. – @Peter Cordes - the trouble with that is that the pivot element is moved during the sort so you woud have to detect on each move if it was actually the pivot element being moved and save the new location. That's a lot of compares Peter, especially if you had an array that is originally in descending order where every single compare reslults in a move — Roger Tunnicliffe, Jun 02 '20 at 22:58
Oh, I'd forgotten that detail about QuickSort. Yes, you would want to copy the pivot's key. Unless you're just sorting references, in which case swapping other references won't invalidate this one. — Peter Cordes, Jun 03 '20 at 00:08
Unless you're just sorting references, in which case swapping other references won't invalidate this one - @PeterCordes - I made this change on the version I have that does not swap data just addresses and there is no real discernable speed improvement. This is the thing that I am struggling with, YES rep is better than loop, YES registers are better than using the stack BUT these make the smallest of differences while we are looking to half the time of the sort. — Roger Tunnicliffe, Jun 03 '20 at 01:57
Often one big bottleneck (like memory / cache misses) can hide other costs, thanks to out-of-order execution. So it's possible that all the minor improvements aren't adding up to a measurable change if there's still a big bottleneck somewhere. When you profile this with `perf stat`, what average instructions / clock do you get? What what branch mispredict rate? What are the hotspots with `perf record` / `perf report` for the default "cycles" event? — Peter Cordes, Jun 03 '20 at 02:23
Hmm, I wonder if actually swapping data instead of references improves spatial locality significantly. So deeper recursive steps over fewer elements are looking at elements close by in cache, and making hardware prefetching more effective. If you post your optimized version, I could have a look at it to see if there are still any obvious inefficiencies. Including code to generate input data for it, and for the Java version you're comparing against. I can't help but wonder if your Java version is different in some significant way, or if Java's sort algorithm is just better than QSort here. — Peter Cordes, Jun 03 '20 at 02:28
@PeterCordes - gotta head out in a minute Peter (thx for your help) but will get back to you a bit later. Cheers — Roger Tunnicliffe, Jun 03 '20 at 02:31
Another thought: have you checked if Java's sort is parallelized at all? Java will often have CPU time > wall-clock time just from running it's garbage collector and JIT, but repeated parallel sorting should be enough work without JIT or GC that you could tell if it was using multiple CPU cores for large sorts. — Peter Cordes, Jun 03 '20 at 04:54
@PeterCordes - Parallelized.? - This is something I considered and is a tempting argument however I am not sure it is the case. I have included the Java program with the sort coded (as well as Array.Sort) and if you run it you will find the hand coded sort runs in almost identical time to Array.Sort. Is Java smart enough to figure out my coded sort can be parallized. — Roger Tunnicliffe, Jun 03 '20 at 22:48
@PeterCordes - The frustration here is that my own implementation in assembler is "algorthymically" identical to the Java version and although there are a few - less than efficient - pieces of code, I can't see where it can do anything other than what it is doing !! — Roger Tunnicliffe, Jun 03 '20 at 22:48
@PeterCordes - I'm gonna spend some time this morning and code this in C. I've got a better chance at getting it thru a debugger and watching what it does. — Roger Tunnicliffe, Jun 03 '20 at 22:56
I hadn't realized you had implemented QSort manually in Java. Do you have times for that compared to Array.Sort? Your question still doesn't have much detail on what CPU you tested on, or a time for your Java QSort. (Also, looks like you didn't do a warm-up run to prime the JIT, which would make Java look even better.) — Peter Cordes, Jun 03 '20 at 23:12
@PeterCordes - CPU is an I3 --> Array.Sort = .877888 seconds. QuickSort = .917975 seconds. Barely a difference. (You should be able cut and paste that Java and make the same observations) — Roger Tunnicliffe, Jun 03 '20 at 23:31
i3 Nehalem from 2008? i3 Skylake? All that tells me is that it's Intel, not AMD, and probably Sandybridge-family. But not L3 cache size, or what microarchitecture. Without details like that, times are less interesting to me, and don't make me want to test it on my Skylake system. e.g. I have an i7-6700k Skylake, max turbo 4.2GHz, with dual channel DDR4-2666. And apparently I have OpenJDK 1.8.0_242 installed on my desktop. — Peter Cordes, Jun 03 '20 at 23:36
@PeterCordes - is the processor that significant when BOTH programs are running on the same processor ? System:Kernel: 4.15.0-96-generic x86_64 bits: 64 compiler: gcc v: 7.5.0 - CPU: Topology: Dual Core model: Intel Core i3-7100 bits: 64 type: MT MCP arch: Kaby Lake rev: 9 L2 cache: 3072 KiB flags: lm nx pae sse sse2 sse3 sse4_1 sse4_2 ssse3 vmx bogomips: 31296 Speed: 3900 MHz min/max: 800/3900 MHz Core speeds (MHz): 1: 3900 2: 3900 3: 3900 4: 3901 — Roger Tunnicliffe, Jun 03 '20 at 23:40
Yes, it can be. Some microarchitectures have performance potholes that others don't, e.g. Intel's recent [JCC erratum that disables the uop cache for a line where a jcc (or macro-fused JCC) touches a 32-byte boundary](https://stackoverflow.com/a/61016915) only applies to Skylake-derived uarches, like your Kaby Lake. If you'd had an earlier Intel, I wouldn't have had to consider that problem in your asm. (Until that erratum, Intel SnB-family had been remarkably robust to tune for, few gotchas compared to P6 where the front-end was a serious bottleneck :/ ) — Peter Cordes, Jun 03 '20 at 23:49
@PeterCordes - So, I have added the code for the C program and it's results. A bit over .4 of a second, about twice as fast as Java and making my assembler code running in 2 seconds look like like a heap of shyte !!!!!!. As noted previously the only real difference is a call to . I'll chase that one and see what it is doing but looks like something to do with "strcpy-sse2-unaligned" (i think you mentioned that previously[SSE2]) — Roger Tunnicliffe, Jun 04 '20 at 03:43
You'd probably go even faster with `memcpy(dst, src, 27)`, and `memcmp(a,b, 26)`. And yes, glibc uses dynamic linker hooks to dispatch `strcpy` to a hand-written asm implementation with SSE2 or AVX2, depending on the CPU that it's running on. https://code.woboq.org/userspace/glibc/sysdeps/x86_64/multiarch/strcpy-avx2.S.html or https://code.woboq.org/userspace/glibc/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S.html. If you single-step into a call (with GDB), you'll get to the real implementation that it picked. Surprised your CPU only used SSE2 not AVX2; did you statically link? — Peter Cordes, Jun 04 '20 at 03:45
@PeterCordes - Thanks Peter. - and NO not statically linked just used "gcc -g -o test test.c" — Roger Tunnicliffe, Jun 04 '20 at 03:55
@PeterCordes - Yep, memcpy/memcmp gets it down to about .34 seconds. !! — Roger Tunnicliffe, Jun 04 '20 at 04:01
@PeterCordes - I have an asnwer that's at least good enough for me - thx for your efforts. Cheers — Roger Tunnicliffe, Jun 05 '20 at 05:27

score 1 · Answer 1 · answered Jun 05 '20 at 05:26

1

Found a solution to this that at least satisfies me.

As a starting point the quicksort algorithm performs as follows on a 1,000,000 element array of qwords. (Times shown in seconds for 5 runs) .091067, .087505, .087407, .088104, .086983 - So..pretty fast

On a 1,000,000 * 1024byte string array sorting on the 1st 8 bytes times were:- 2.265867, 2.241919, 2.281978, 2.264712, 2.291069 - so..not so hot

On a 1,000,000 * 1024byte string array sorting on the 1st 8 bytes (After code change) times were:- 0.719363, 0.688310, 0.717557, 0.710861, 0.746405 - comparible if not better than Java

The code fix consisted of replacing

     mov rsi,Src Address
     mov rdi,Dst Address
     mov rcx,SortKey Length
repe cmpsb

with

    mov rax,[Src Address]
    bswap rax
    mov rbx,[Dst Address]
    bswap
    cmp rax,rbx

I guess you are making the compares 8bytes at once and it make sense that it would be quicker. This is obviously NOT the best way to do it but without getting side tracked into something I didn't want to get into, it will do well enough.

As a side note, the code needs completing to deal with variable length keys but that is just a case of sitting down and doing the work. I hope this has helped anyone who has stumbled across this thread. (Special thx to @PeterCordes for his efforts)

answered Jun 05 '20 at 05:26

Roger Tunnicliffe

21
3

1

Yeah, I'd believe that [microcode startup overhead for `rep cmpsb`](https://stackoverflow.com/questions/33902068/what-setup-does-rep-do) destroys performance that much. Unlike the other minor things I mentioned (more pushes than necessary), this one really can bottleneck the pipeline. Are you really copying whole 1024 byte array elements around, though? Or is this like your C test case with `char strArray[1000000][27];` where the elements are small enough that copying them probably pays for itself in removing a level of indirection, and in cache locality? – Peter Cordes Jun 05 '20 at 05:34
@PeterCordes - the version that comes in at about .7 seconds just moves the address's, so I did use the full 1024byte record (obviously not relevant when you just move the 64bit address). I did have a look at the code in strcpy-sse2-unaligned but it goes against tthe design principles for this project. ie portability to other processors. The version that just sorts the qwords is pretty jolly quick eh !!! – Roger Tunnicliffe Jun 05 '20 at 06:26
SSE2 is baseline for x86-64. If you're writing in x86-64 asm, you can *always* use SSE2 without checking CPUID to see if it's supported. Only later extensions like SSE3 and later, popcnt, BMI, and so on are not baseline. If you need portability to ISAs other than x86-64, assembly language is probably the wrong choice. :P That said, writing this part in C might be a good idea; it sounds like you're getting very good results from C compilers, comparable to your hand-written asm. Although you keep throwing numbers around without saying exactly what they're for (sorting refs or not), so IDK. – Peter Cordes Jun 05 '20 at 06:29
@PeterCordes - the work that I am doing is for the language I have written. It is not really about getting the best results in regard to performance it's really just about keeping myself amused and learning as I go. These are troubled times and writing code is a very cheap hobby. – Roger Tunnicliffe Jun 05 '20 at 06:45

Assembler quicksort slower than java array.sort?

1 Answers1