3

Here is the C code, which I compiled with gcc:

/* a and d point to the one-character string literals "a" and "d". */
char *a="a";
char *d="d";
/* The same comparison written three ways: literal vs. literal,
   pointer vs. literal, and pointer vs. pointer. */
printf("%d\n", strcmp("a", "d"));
printf("%d\n", strcmp(a, "d"));
printf("%d\n", strcmp(a, d));

When I compiled with -O the output is

-1
-3
-1

When I compiled without -O, the output is

-1
-3
-3

Why is the output different, and what is the code of strcmp?

solomon_wzs
  • 1,711
  • 5
  • 16
  • 29

4 Answers

5

Why the output is different

Because all that matters is the sign (positive, negative or zero) of the return value. strcmp() is not required to return +1 or -1, nor does it have to return consistent values. I suspect that in the first and third case, the compiler optimizes away the call to strcmp() and puts -1 into the place of the return value. In the second case, I think the function is actually called.

what is the code of strcmp?

Deducing from the fact that it seemingly returns the difference between the character codes of the first differing character, I'd say this is glibc's strcmp():

/* Compare the strings P1 and P2 byte by byte, treating each byte as
   unsigned char.  Returns 0 when the strings are equal; otherwise returns
   c1 - c2 for the first pair of bytes that differ (negative when P1's byte
   is the smaller one, positive when it is the larger one).
   Written with old-style (K&R) parameter declarations.  */
int
 strcmp (p1, p2)
      const char *p1;
      const char *p2;
 {
   register const unsigned char *s1 = (const unsigned char *) p1;
   register const unsigned char *s2 = (const unsigned char *) p2;
   unsigned char c1, c2;

   do
     {
       c1 = (unsigned char) *s1++;
       c2 = (unsigned char) *s2++;
       /* End of the first string: c1 is 0 here, so the result is 0 if the
          second string also ended, or -c2 (negative) if it is longer.  */
       if (c1 == '\0')
     return c1 - c2;
     }
   while (c1 == c2);

   /* The loop exited on the first mismatching pair of bytes.  */
   return c1 - c2;
 }

Edit: @AndreyT doesn't believe me, so here's the assembly GCC 4.2 generated for me (OS X 10.7.5 64-bit Intel, default optimization level - no flags):

    # Compiler-generated code for main() (x86-64, AT&T syntax).
    # Stack slots used below:
    #   -16(%rbp) = a (address of L_.str)
    #   -24(%rbp) = d (address of L_.str1)
    #   -32(%rbp) = address of L_.str1 again, passed to the first strcmp call
    .section    __TEXT,__text,regular,pure_instructions
    .globl  _main
    .align  4, 0x90
_main:
Leh_func_begin1:
    # Prologue: set up the frame pointer and reserve 32 bytes of locals.
    pushq   %rbp
Ltmp0:
    movq    %rsp, %rbp
Ltmp1:
    subq    $32, %rsp
Ltmp2:
    # Initialise a and d with the addresses of the two string constants.
    leaq    L_.str(%rip), %rax
    movq    %rax, -16(%rbp)
    leaq    L_.str1(%rip), %rax
    movq    %rax, -24(%rbp)
    # First printf: the value to print is the immediate -1, moved through
    # %ecx into %esi; %rdi receives the format string L_.str2.
    movl    $-1, %ecx             ; <- THIS!
    xorb    %dl, %dl
    leaq    L_.str2(%rip), %rsi
    movq    %rsi, %rdi
    movl    %ecx, %esi
    # %rax still holds the address of L_.str1; keep it for the next strcmp.
    movq    %rax, -32(%rbp)
    # %al = 0: no vector registers are used by this variadic call.
    movb    %dl, %al
    callq   _printf               ; <- no call to `strcmp()` so far!
    # First strcmp call: %rdi = a, %rsi = saved address of L_.str1.
    movq    -16(%rbp), %rax
    movq    %rax, %rdi
    movq    -32(%rbp), %rsi
    callq   _strcmp               ; <- strcmp()
    # Second printf: prints the value strcmp returned in %eax.
    movl    %eax, %ecx
    xorb    %dl, %dl
    leaq    L_.str2(%rip), %rdi
    movl    %ecx, %esi
    movb    %dl, %al
    callq   _printf               ; <- printf()
    # Second strcmp call: %rdi = a, %rsi = d, both loaded from the stack.
    movq    -16(%rbp), %rax
    movq    -24(%rbp), %rcx
    movq    %rax, %rdi
    movq    %rcx, %rsi
    callq   _strcmp               ; <- strcmp()
    # Third printf: prints the value strcmp returned in %eax.
    movl    %eax, %ecx
    xorb    %dl, %dl
    leaq    L_.str2(%rip), %rdi
    movl    %ecx, %esi
    movb    %dl, %al
    callq   _printf               ; <- printf()
    # Return value 0, staged through two stack slots before reaching %eax.
    movl    $0, -8(%rbp)
    movl    -8(%rbp), %eax
    movl    %eax, -4(%rbp)
    movl    -4(%rbp), %eax
    # Epilogue: release the locals and restore the caller's frame pointer.
    addq    $32, %rsp
    popq    %rbp
    ret
Leh_func_end1:

    # String constants referenced from _main: L_.str and L_.str1 are the two
    # strings being compared; L_.str2 is the printf format string.
    .section    __TEXT,__cstring,cstring_literals
L_.str:
    .asciz   "a"

L_.str1:
    .asciz   "d"

L_.str2:
    .asciz   "%d\n"

    # Contents of the __eh_frame section: a common entry
    # (Leh_frame_common .. Leh_frame_common_end) followed by the entry for
    # _main (_main.eh).  The Lset* symbols are assembly-time label
    # differences used for the length and offset fields.
    # NOTE(review): presumably DWARF call-frame information used for stack
    # unwinding; the raw .byte values are compiler output and are not
    # decoded here.
    .section    __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame0:
Lsection_eh_frame:
Leh_frame_common:
Lset0 = Leh_frame_common_end-Leh_frame_common_begin
    .long   Lset0
Leh_frame_common_begin:
    .long   0
    .byte   1
    .asciz   "zR"
    .byte   1
    .byte   120
    .byte   16
    .byte   1
    .byte   16
    .byte   12
    .byte   7
    .byte   8
    .byte   144
    .byte   1
    .align  3
Leh_frame_common_end:
    .globl  _main.eh
_main.eh:
Lset1 = Leh_frame_end1-Leh_frame_begin1
    .long   Lset1
Leh_frame_begin1:
Lset2 = Leh_frame_begin1-Leh_frame_common
    .long   Lset2
Ltmp3:
    .quad   Leh_func_begin1-Ltmp3
Lset3 = Leh_func_end1-Leh_func_begin1
    .quad   Lset3
    .byte   0
    .byte   4
Lset4 = Ltmp0-Leh_func_begin1
    .long   Lset4
    .byte   14
    .byte   16
    .byte   134
    .byte   2
    .byte   4
Lset5 = Ltmp1-Ltmp0
    .long   Lset5
    .byte   13
    .byte   6
    .align  3
Leh_frame_end1:


.subsections_via_symbols

And the original source code:

#include <stdio.h>
#include <string.h>

/* Prints the result of comparing "a" with "d" three times, passing the
   operands as two literals, as one pointer plus one literal, and as two
   pointers.  Returns 0. */
int main()
{
    const char *a = "a";
    const char *d = "d";
    printf("%d\n", strcmp("a", "d"));   /* literal vs. literal */
    printf("%d\n", strcmp(a, "d"));     /* pointer vs. literal */
    printf("%d\n", strcmp(a, d));       /* pointer vs. pointer */

    return 0;
}

And the output it generated (screenshot for having a better proof):

enter image description here

  • Why then `strcmp(a, d)` is `-1` again? – AnT stands with Russia May 09 '13 at 04:10
  • @AndreyT Different compiler versions, different optimization levels, etc., etc... Don't expect me to reason the unreasonable. –  May 09 '13 at 04:11
  • 1
    Well, `strcmp` is a very simple function. Whatever it does, should be easily reasonable. (And yes, I'm talking about implementation details). I actually suspect that the OP's experiment was somehow broken. – AnT stands with Russia May 09 '13 at 04:14
  • 1
    @AndreyT For me with GCC 4.2 on Mac OS X, this printed `-1 -3 -3` and as I'm looking at the generated assembly, I've been proved right (what a surprise, really). –  May 09 '13 at 04:16
  • 1
    @AndreyT "Why then strcmp(a, d) is -1 again?" -- Because the compiler is allowed to optimize. See my answer. – Jim Balter May 09 '13 at 04:22
  • 1
    @JimBalter Exactly, look at my annotations in the generated assembler - the first call with two string literals has been optimized away. –  May 09 '13 at 04:24
  • @AndreyT @H2CO3 On Linux I find that when I compile with `gcc` and `clang` at zero optimization I get {-1,-1,-1}. Optimization includes a {-3}. Interesting to me that optimization seems to exclusively add {-3} rather than another number--even on various machines and compilers...! – d0rmLife May 09 '13 at 04:29
  • 1
    @d0rmLife Optimization doesn't add -3. Optimization removes the -3 (which is the actual return value of `strcmp()` in this particular case) and replaces it with -1 (a quite arbitrary negative number). –  May 09 '13 at 04:30
  • @H2CO3 ? The opposite occurred on my end--there was no {-3} for optimization to remove (apparently)... unless I am misunderstanding you. Agreed, though, "add" was a poor word choice. It is the *uniqueness* of {-1,-3} that is interesting to me. – d0rmLife May 09 '13 at 04:33
  • @d0rmLife There's nothing interesting about it and it has already been explained: `'a' - 'd' == -3` – Jim Balter May 09 '13 at 04:43
  • @JimBalter It is a two element set, and I can comprehend the ASCII subtraction! It is the -1 that I think is more interesting. It would appear to me that -3 is the natural return... – d0rmLife May 09 '13 at 04:46
  • @d0rmLife "On Linux I find that when I compile with gcc and clang at zero optimization I get {-1,-1,-1}" -- you're almost certainly misreporting your result ... you will not get -1 for the 2nd or 3rd comparison without optimization. – Jim Balter May 09 '13 at 04:48
  • @JimBalter Or the implementation of `strcmp()` is normalized (i. e. it always returns -1, 0, +1). Also, `-O0` does **not** actually mean "no optimizations". GCC (and Clang too) does indeed do some optimizations even with `-O0`. –  May 09 '13 at 04:48
  • @JimBalter No, those are the values I got. – d0rmLife May 09 '13 at 04:49
  • @d0rmLife "It is the -1 that I think is more interesting" -- It has been explained repeatedly ... look at the generated code. it isn't interesting at all ... the compiler authors picked -1 as the value to return when one string is known at compile time to be less than another because it's the most obvious and natural value. – Jim Balter May 09 '13 at 04:49
  • "Or the implementation of strcmp() is normalized" -- No glibc implementation of strcmp that I'm aware of contains the suboptimal code to do that. "some optimizations" -- yes, of course ... they notice that they are comparing constant strings. But to go beyond that requires more than -O0 – Jim Balter May 09 '13 at 04:51
  • @JimBalter Clearly you understand better than I, and I appreciate your time, but I guess I should have been more clear: why do the values **alternate**? It seems bizarre... with optimization I got {-1,-3,-1}. I'm happy to drop it, but it just doesn't make sense to me why it would hop between ASCII subtraction and default return values. – d0rmLife May 09 '13 at 04:51
  • @JimBalter OK, thanks for the info (that was only an assumption). –  May 09 '13 at 04:52
  • 1
    @d0rmLife I doubt that it's a matter of alternation ... the expressions are different; change their order and the results will change with them. In order to know why, on some implementations, `strcmp(a, "d")` returns -3 while the other two return -1, you would have to examine the optimizer. – Jim Balter May 09 '13 at 04:55
  • @JimBalter Great point, thanks--just tested it and you seem to be right. – d0rmLife May 09 '13 at 04:58
  • 1
    @d0rmLife: The reason for `-1` return is because this is how actual `strcmp` is implemented in some GCC libraries (despite the claims of "suboptimality" from our bitter "expert" here). In GCC 4.1.2 on Linux (for one example), the implementation of `strcmp` does not use subtraction to calculate the result. Instead it performs an explicit comparison of characters by using `cmp` instruction. And if characters are different, it loads `eax` register with `1` and `ecx` register with `-1`, and then performs `cmovb ecx, eax` instruction, which will produce the proper result in `eax` as `-1` or `1`. – AnT stands with Russia May 09 '13 at 05:08
  • 1
    @d0rmLife: This `cmovb`-based approach was apparently considered more optimal than the straightforward subtraction. (I wonder whether it really is. But the fact is the fact - it is used in GCC standard library.) More likely it is done that way not for optimization, but rather to produce a "normalized" `-1, 0, +1` result, even though it is not required by the language. As you see, "someone" claimed above that `-1` cannot be obtained without optimization. In reality, it can. – AnT stands with Russia May 09 '13 at 05:10
  • @AndreyT **Terrific** exposition on such a small question. Thanks for taking the time to do the research. What you just said ought to be an answer on this question. I know an answer has been accepted, but that is valuable information to anyone who comes across it. `cmovb` doesn't strike me as the optimal implementation--unless, of course, it is seeking to normalize results (as you suggested). If that is the case, it should handle all such cases rather than only some, as the infamous {-1,-3} set showed us. Perhaps, though, it is selectively optimal. *Good stuff!* – d0rmLife May 09 '13 at 05:39
  • 1
    "bitter" -- That's funny coming from that source. What I said was "No glibc implementation of strcmp that I'm aware of" ... well, I wasn't aware of this, obviously. The new information and correction is welcome, even if it does come from someone very late to the party who made numerous false assertions and was generally a PITA. – Jim Balter May 09 '13 at 06:35
  • `"someone" claimed above that -1 cannot be obtained without optimization.` -- I should have said that the value generated at compile-time, rather than the value from strcmp, cannot be obtained without optimization. Of course if those values are the same then they can be obtained with or without optimization. I do think my sin was the lesser compared to claiming that the only possible way a compiler could produce different results is via randomization. – Jim Balter May 09 '13 at 06:44
  • 1
    BTW, that claim was accompanied by the assertion that the OP probably had an error in their experiment. I said something similar about @d0rmLife. I apologize for that, and I especially apologize for in any way resembling a certain person. – Jim Balter May 09 '13 at 06:50
  • there's no need to check the assembly output. You can just check the [`strcmp` implementation in Apple or BSD](https://stackoverflow.com/a/25015214/995714) directly and easily see that both return the difference between the characters – phuclv Jul 20 '21 at 17:08
4

The C standard allows the implementation to return any negative value. It also allows the implementation to do optimizations of library function calls as long as the result obeys the standard ... thus, implementations can optimize functions like strcmp by generating inline machine instructions instead of calling a function. Extra optimizations are possible when arguments are constants. So the reason the results are different is because the optimizer happens to generate different code for some of the cases. A conforming program is not allowed to care which negative value is returned.

Edit:

On my system at the moment, the output is

-1
-3
-3

Here is the code the compiler generated that produced those results (obtained with gcc -S):

    # Each call's arguments are written to (%esp) and 4(%esp) beforehand.
    # NOTE(review): $LC2 is presumably the "%d\n" format string and $LC1 the
    # "d" literal; 28(%esp) and 24(%esp) presumably hold a and d.  Their
    # definitions are not part of this excerpt.
    #
    # printf #1: the value to print is the immediate -1; strcmp is not called.
    movl    $-1, 4(%esp)
    movl    $LC2, (%esp)
    call    _printf
    # strcmp call #1: arguments are the value at 28(%esp) and $LC1.
    movl    $LC1, 4(%esp)
    movl    28(%esp), %eax
    movl    %eax, (%esp)
    call    _strcmp
    # printf #2: prints the value strcmp returned in %eax.
    movl    %eax, 4(%esp)
    movl    $LC2, (%esp)
    call    _printf
    # strcmp call #2: both arguments are loaded from stack slots.
    movl    24(%esp), %eax
    movl    %eax, 4(%esp)
    movl    28(%esp), %eax
    movl    %eax, (%esp)
    call    _strcmp
    # The result becomes the next call's second argument; the excerpt ends here.
    movl    %eax, 4(%esp)

As you can see, there are only two strcmp calls. The -1 result for the first comparison is produced at compile-time, because the compiler knows that "a" is less than "d". If I use -O, it produces this code:

    # Three printf calls, each passed the immediate -1 as the value to print;
    # no strcmp call appears in this excerpt.
    # NOTE(review): $LC0 is presumably the "%d\n" format string; its
    # definition is not part of this excerpt.
    movl    $-1, 4(%esp)
    movl    $LC0, (%esp)
    call    _printf
    movl    $-1, 4(%esp)
    movl    $LC0, (%esp)
    call    _printf
    movl    $-1, 4(%esp)
    movl    $LC0, (%esp)
    call    _printf
Jim Balter
  • 16,163
  • 3
  • 43
  • 66
2

I'm getting

 -1
 -3
 -1

output for optimized (-O4) build with GCC 4.1.2 on Linux. Here's the code that the compiler generates for main

# Code generated for main().  Only the middle value is computed at run time;
# the first and last printf calls receive the immediate -1.
# NOTE(review): .LC0 is presumably the "%d\n" format string and .LC1/.LC2 the
# two strings being compared; their definitions are not part of this excerpt.
main:
.LFB25:
        subq    $8, %rsp
.LCFI0:
        # printf #1: value to print is the constant -1; %eax is zeroed
        # before the variadic call.
        movl    $-1, %esi
        xorl    %eax, %eax
        movl    $.LC0, %edi
        call    printf
        # Inline comparison: %esi = (first byte at .LC1) - (first byte at
        # .LC2), both zero-extended.  A nonzero difference is the result.
        movzbl  .LC1(%rip), %edx
        movzbl  .LC2(%rip), %eax
        movl    %edx, %esi
        subl    %eax, %esi
        jne     .L2
        # First bytes were equal: use the difference of the second bytes.
        movzbl  .LC1+1(%rip), %esi
        movzbl  .LC2+1(%rip), %eax
        subl    %eax, %esi
.L2:
        # printf #2: prints the difference computed above.
        movl    $.LC0, %edi
        xorl    %eax, %eax
        call    printf
        # printf #3: value to print is again the constant -1.
        movl    $-1, %esi
        movl    $.LC0, %edi
        xorl    %eax, %eax
        call    printf
        # Return 0 and release the 8 bytes reserved on entry.
        xorl    %eax, %eax
        addq    $8, %rsp
        ret

which means that the first and the last comparisons were actually optimized out, while the middle comparison was actually implemented intrinsically through subtraction (which is why it produced -3). I don't see any logic in this selective behavior, so it is probably just a quirk of the optimizer.

BTW, without optimization the same GCC 4.1.2 produces

 -1
 -1
 -1

output because it calls strcmp. strcmp in this standard library is implemented as

# Disassembly of strcmp: %rdi and %rsi point at the current byte of each
# string.  The result in %eax is 0, 1 or -1.
# Loop: load a byte from the first string and compare it with the
# corresponding byte of the second; leave the loop if they differ.
<strcmp>           mov    (%rdi),%al
<strcmp+2>         cmp    (%rsi),%al
<strcmp+4>         jne    <strcmp+19>
# Bytes were equal: advance both pointers and repeat unless the byte just
# compared was 0.
<strcmp+6>         inc    %rdi
<strcmp+9>         inc    %rsi
<strcmp+12>        test   %al,%al
<strcmp+14>        jne    <strcmp>
# Both strings ended together: return 0.
<strcmp+16>        xor    %eax,%eax
<strcmp+18>        retq
# Mismatch: start from 1 and replace it with -1 when the cmp above found the
# first string's byte below the second's (unsigned comparison).
<strcmp+19>        mov    $0x1,%eax
<strcmp+24>        mov    $0xffffffff,%ecx
<strcmp+29>        cmovb  %ecx,%eax
<strcmp+32>        retq

which means that it is intentionally implemented to return -1, 0 or +1, even if it might be seen as suboptimal.

AnT stands with Russia
  • 312,472
  • 42
  • 525
  • 765
  • What this highlights is that there's potentially *three* different `strcmp()` implementations in play: the comparison implemented within the compiler and evaluated entirely at compile-time; the intrinsic function provided by the compiler but evaluated at run-time; and the standard library function evaluated at run-time. Exactly what output you see will depend on what output each of these generates for the test input, *and* which one is used for each comparison (which may depend on optimisation settings). – caf May 09 '13 at 13:08
1

strcmp returns a nonzero value if the strings are not equal; here it returns a value < 0.
A negative result indicates that the second string has the higher value at the first character that does not match. The exact value is unspecified.
The only thing defined is whether the output is:

  • Zero or
  • Positive or
  • Negative
Alok Save
  • 202,538
  • 53
  • 430
  • 533
  • 2
    That misses the point of the question. The question is about why *identical* input strings produce different results. While the results are formally "correct", the question is: why the behavior of `strcmp` is so seemingly non-deterministic? – AnT stands with Russia May 09 '13 at 04:09
  • @AndreyT: Because the behavior is **Unspecified**. The compiler is allowed to return any value and it is not required to tell you the detail. – Alok Save May 09 '13 at 04:10
  • 2
    That is fine, but this is a real-life question. The only reason a real-life compiler would return different results for identical strings is if it is intentionally randomizing the result. This might make sense in some "stress testing" compiler, but not in "normal" compiler. So the question stands. Your purely theoretical answer is correct, but it makes no real-life sense. Or, if you think it does, then show it. – AnT stands with Russia May 09 '13 at 04:12
  • @AndreyT: The answer stands. A user of the language uses constructs provided by the language and is bound by the contract of specifications. If the specification doesn't say what precise value should be returned, the user shouldn't rely on or speculate about it being anything specific. Period. The real-life perspective doesn't make sense because you are not supposed to do this even in real life. If the question is why gcc or a specific compiler does so, it stands beyond the purview of the language and just shouldn't be tagged C at all; tag it to the compiler & I will be happy to remove the answer if that be the case. – Alok Save May 09 '13 at 04:18
  • 1
    @AndreyT "The only reason a real-life compiler would return different results for identical strings is if it is intentionally randomizing the result." -- You are incorrect. See my answer. – Jim Balter May 09 '13 at 04:18
  • 1
    "this is a real-life question" -- the real-life answer is: generate the assembly code and look at it. It can be different for different versions of gcc and different platforms. – Jim Balter May 09 '13 at 04:25
  • 3
    @JimBalter: Precisely, and that exercise is useless because you cannot rely on the value being the same even on the same compiler and on the same platform; technically (*and hence practically*) that value can be just anything, it just has to be `-ve`. – Alok Save May 09 '13 at 04:27
  • Don't you find it interesting that the return value is exclusively an element of {-1,-3}? Perhaps it *can* be anything, but our results seem surprisingly limited, even across various compilers, OSes and optimizations! I think that may be what @AndreyT is curious about, and I must say I am a bit curious, too. Although I hope to avoid the argument...! – d0rmLife May 09 '13 at 04:39
  • @d0rmLife: I am going to repeat the comment I made on now deleted answer. "Knowing Andrey T's contributions here, he is certainly not ignorant rather the exact opposite. The question of *why a compiler shows the behavior?* might be interesting but if this is tagged as C, the most correct answer is indeed: *because the compiler is allowed to and the behavior is Unspecified*.If one is interested in getting answer for behavior specific to a compiler implementation, then the Q is beyond the language specification & the Q should be asked beyond the purview of the language." – Alok Save May 09 '13 at 04:41
  • Perhaps this is better continued in chat. I removed my answer to get rid of all the excessive comment-arguing. – Yuushi May 09 '13 at 04:52
  • @AlokSave You miss the point. Andrey asserted that "The only reason a real-life compiler would return different results for identical strings is if it is intentionally randomizing the result." Such an assertion is both grossly ignorant and arrogantly commits a fallacy of argumentum ad ignorantiam. – Jim Balter May 09 '13 at 05:02
  • "Don't you find it interesting that the return value is exclusively an element of {-1,-3}?" -- Not at all. -1 is the value the compiler generates when it knows at compile time that one string is less than the other -- it could have been any value but -1 is the obvious choice -- and -3 is 'a'- 'd', which is what a natural and/or optimal implementation of strcmp will return. – Jim Balter May 09 '13 at 05:05
  • 1
    @Jim Balter: That's actually false in the general case. The `strcmp` implementation in GCC 4.1.2 on x86 is actually purposely implemented to return `-1, 0, +1` by using the `cmovb` instruction to generate the result (instead of returning the result of subtraction). Why it was done that way is a different question. – AnT stands with Russia May 09 '13 at 05:24
  • ^ I didn't say anything false. Among AnT's numerous ignorant and point-missing comments on this crazy page is his failure to grasp what "is exclusively an element of {-1,-3}" and "and/or" mean. 'a'-'d' is natural, regardless of how "GCC 4.1.2 on x86" or any other implementation works. – Jim Balter Dec 08 '20 at 22:53