Difference between cmp and doing sub then cmp with 0

Question

Below there are two identical assembly code, written to do a quicksort. The only difference is, in the first one a subtraction is done and then the result compared with a zero, and in the second it's just a comparison. For some reason, the first piece of code actually does the sorting right, but the second one doesn't. I've done some research on this: cmp doesn't change the operands, whereas 'sub' does.

First (correct):

## 
##
##  quick_sort(int *a, int l, int r){
##      int i, j, pivot, temp;
##      if(l>=r) return;
##
##      pivot = a[r];
##      i = l;
##      j = r;
##
##          while(i<j){
##          while(i < j && a[i]<pivot) i++;
##          while(i < j && a[j]>=pivot) j--;
##          temp = a[i];
##          a[i] = a[j];
##          a[j] = temp;
##      }
##      a[r] = a[i];
##      a[i] = pivot;
##
##      quick_sort(a, l, i-1);
##      quick_sort(a, i+1, r);
##  }
##

.intel_syntax noprefix

.text
.global quick_sort

########################################
# void quick_sort(int *a, int l, int r)
#
# rdi - int *a
# rsi - int l
# rdx - int r
########################################

quick_sort:
enter 0, 0

cmp rsi, rdx    
jge end_quick_sort  

mov r12, [rdi+rdx*4]    

mov r13, rsi    # i=l
mov r14, rdx    # j=r

partition:
cmp r13, r14    
jge end_partition   

left_loop:      
cmp r13, r14    
jge end_left_loop
mov rax, [rdi+r13*4]                    
sub rax, r12                   #this 
cdqe                           #piece 
cmp rax, 0                     #here
jge end_left_loop   
inc r13     
jmp left_loop
end_left_loop:

right_loop:     
cmp r13, r14    
jge end_right_loop
mov rax, [rdi+r14*4]
sub rax, r12
cdqe
cmp rax, 0      
jl  end_right_loop  
dec r14     
jmp right_loop
end_right_loop:

mov rax, [rdi + r13*4]  
mov r15, [rdi + r14*4]  
mov [rdi + r13*4], r15d
mov [rdi + r14*4], eax

jmp partition       
end_partition:

mov rax, [rdi + r13*4]  # a[r] = a[i]
mov [rdi + rdx*4], eax
mov [rdi + r13*4], r12d # a[i] = pivot



quick_sort_left:    # quick_sort(array,l,i-1)
push    rdx
push    r13
mov rdx, r13
dec rdx
call quick_sort
pop r13
pop rdx

quick_sort_right:   # quick_sort(array,i+1,r)
mov rsi, r13
inc rsi
call quick_sort

end_quick_sort:
xor rax, rax
leave
ret

Second (incorrect):

.intel_syntax noprefix

.text

.global quick_sort

quick_sort:

enter 0,0

cmp rsi, rdx
jge end_qs

mov r12, [rdi + 4*rdx]
mov r13, rsi
mov r14, rdx

main_while:

cmp r13, r14
jge end_main_while

left_while:

cmp r13, r14
jge end_left_while
mov rax, [rdi + 4*r13]
cmp rax, r12                       # changed to just one line (cmp)


jge end_left_while

inc r13
jmp left_while
end_left_while:

right_while:

cmp r13, r14
jge end_right_while

mov rax, [rdi + 4*r14]
cmp rax, r12


jl end_right_while

dec r14
jmp right_while

end_right_while:

mov rax, [rdi + 4*r13]
mov r15, [rdi + 4*r14]
mov [rdi + 4*r13], r15d
mov [rdi + 4*r14], eax

jmp main_while
end_main_while:

mov rax, [rdi + 4*r13]
mov [rdi + 4*rdx], eax
mov [rdi + 4*r13], r12d

qs_left:
push rdx
push r13
mov rdx, r13
dec rdx
call quick_sort
pop r13
pop rdx

qs_right:
mov rsi, r13
inc rsi
call quick_sort

end_qs:
xor rax, rax
leave
ret

You might want to go play "spot the differences" games for a while. Hint: the important thing is the `cdqe` which is a silly workaround for accessing 8 bytes instead of 4. As such, I would call the first version broken too. You should just operate on 4 bytes if that's what you want to do. — Jester, May 28 '18 at 21:37
both are wrong, and the relevant thing confusing your particular spot is the data width, the original array does consist of 32 bit ints, but the code operates with them as with 64 bit values. The `cdqe` will truncate the result of `sub` down to 32 bit (and extend it immediately back to 64b to make the next `cmp` to work). It's quite convoluted way how to extend 32 bit values to 64 bit registers, and generally this is not the "correct" (efficient and wanted) approach, although the first code maybe even works for every input (don't want to spend more time thinking about it) — Ped7g, May 28 '18 at 21:39
So, what is the correct way when dealing with 32-bit integers using 64-bit registers ? To be frank, I'm honestly confused about how 64bit registers treat `ints`, also why and when are word conversions (such as `cdqe`) even used. — monolith937, May 28 '18 at 22:02
`mov r12d, [rdi+rdx*4]`, or `mov eax, [rsi]`, or `cmp eax, r12d` or whatever. If you want 32-bit operand-size, use 32-bit registers in 64-bit mode (except in addressing mode: still use the 64-bit version of a register in addressing modes). [The advantages of using 32bit registers/instructions in x86-64](https://stackoverflow.com/q/38303333) — Peter Cordes, May 28 '18 at 22:12
That certainly works, but most importantly it seems intuitive to me. What concerns me, so I don't make similar mistakes in the future, is why is it different ? To me `cmp rax, r12` and `cmp eax, r12d` look and should do the same, unless I'm overlooking something. — monolith937, May 29 '18 at 07:42

Difference between cmp and doing sub then cmp with 0

0 Answers0