3

I am trying to implement simple MD5 for short strings (shorter than 64 bytes). I am using algorithm from Wikipedia.. Everything compiles, but my result for string:

"hello world" 

is:

BB3BB65ED0EE1EE0BB22CB93C3CD5A8F

while it should be:

5EB63BBBE01EEED093CB22BB8F5ACDC3

The full code is here:

program Prog;

uses Classes, SysUtils;

function leftrotate(x, c: Cardinal): Cardinal;
begin
  leftrotate := (x shl c) or (x shr (32-c));
end;

const s: array[0..63] of Cardinal = (
    7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,
    5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,
    4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23,
    6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21 );
K: array[0..63] of Cardinal = (
    $d76aa478, $e8c7b756, $242070db, $c1bdceee,
    $f57c0faf, $4787c62a, $a8304613, $fd469501,
    $698098d8, $8b44f7af, $ffff5bb1, $895cd7be,
    $6b901122, $fd987193, $a679438e, $49b40821,
    $f61e2562, $c040b340, $265e5a51, $e9b6c7aa,
    $d62f105d, $02441453, $d8a1e681, $e7d3fbc8,
    $21e1cde6, $c33707d6, $f4d50d87, $455a14ed,
    $a9e3e905, $fcefa3f8, $676f02d9, $8d2a4c8a,
    $fffa3942, $8771f681, $6d9d6122, $fde5380c,
    $a4beea44, $4bdecfa9, $f6bb4b60, $bebfbc70,
    $289b7ec6, $eaa127fa, $d4ef3085, $04881d05,
    $d9d4d039, $e6db99e5, $1fa27cf8, $c4ac5665,
    $f4292244, $432aff97, $ab9423a7, $fc93a039,
    $655b59c3, $8f0ccc92, $ffeff47d, $85845dd1,
    $6fa87e4f, $fe2ce6e0, $a3014314, $4e0811a1,
    $f7537e82, $bd3af235, $2ad7d2bb, $eb86d391 );

var a0,b0,c0,d0, a,b,c,d, f,g,dTemp: Cardinal;
   Len: Integer;
   Msg: array[0..63] of Char;
   M: array[0..15] of Cardinal absolute Msg; //break chunk into sixteen 32-bit words M[j]
   Str: String;
   i: Integer;
   ff: TFileStream;
   wait: Char;
begin
  a0 := $67452301;
  b0 := $efcdab89;
  c0 := $98badcfe;
  d0 := $10325476;

  Str := 'hello world';
  Len := Length(Str);

  FillChar(Msg, 64, 0);

  for i:=1 to Len do Msg[i-1] := Str[i];

//append "1" bit to message
  Msg[Len] := chr(128);

//append original length in bits mod (2 pow 64) to message
  Msg[63-7] := chr(8*Len);  //Update thanks to @MBo

//Process each 512-bit chunk of message- 1 only have 1 chunk

//TEST dump
//  ff := TFileStream.create('test.txt', fmCreate);
//  ff.write(msg, 64);
//  ff.free;

//Initialize hash value for this chunk:
    A := a0;
    B := b0;
    C := c0;
    D := d0;

//Main loop:
    for i := 0 to 63 do begin

        if (i>=0) and (i<=15) then begin
            F := (B and C) or ((not B) and D);
            g := i;
        end
        else if (i>=16) and (i<=31) then begin
            F := (D and B) or ((not D) and C);
            g := (5*i + 1) mod 16;
        end
        else if (i>=32) and (i<=47) then begin
            F := B xor C xor D;
            g := (3*i + 5) mod 16;
        end
        else if (i>=48) and (i<=63) then begin
            F := C xor (B or (not D));
            g := (7*i) mod 16;
        end;

        dTemp := D;
        D := C;
        C := B;
        B := B + leftrotate((A + F + K[i] + M[g]), s[i]);
        A := dTemp;
    end;

//Add this chunk's hash to result so far:
  a0 := a0 + A;
  b0 := b0 + B;
  c0 := c0 + C;
  d0 := d0 + D;

  //This should give 5EB63BBBE01EEED093CB22BB8F5ACDC3
  Writeln( IntToHex(a0,8) + IntToHex(b0,8) + IntToHex(c0,8)  +IntToHex(d0,8) );

  Readln(wait);
end.

You can try the code online here: http://ideone.com/qdYQ6q

And here's dump of my prepared chunk just before main loop (test.txt):

enter image description here

Jan Doggen
  • 8,799
  • 13
  • 70
  • 144
Tom
  • 2,962
  • 3
  • 39
  • 69
  • Well, surely you should be putting this into a function for re-use. And are you really treating the input as text? You must treat the input as binary. – David Heffernan Dec 08 '14 at 15:32
  • @DavidHeffernan I put it as a program so anyone can test it easily and it can be run on ideaone.com but it will be a function. And yes- it has to uses text (strings) as I will use it to hash emails, logins, passwords- such short strings only. – Tom Dec 08 '14 at 15:44
  • You've got that wrong. Hashing algos operate on binary. First of all choose a text encoding and then encode the text as binary using that encoding. For instance you might choose UTF-8. – David Heffernan Dec 08 '14 at 15:52
  • @DavidHeffernan the binary encoding is there: M: array[0..15] of Cardinal absolute Msg; and the input strings are ANSI. – Tom Dec 08 '14 at 15:57
  • 2
    Now you've got code that can only work with ANSI text. Which is rather limiting. You can't calculate hashes for files, or streams, and so on. You are also reinventing the wheel. There are a lot of good hash implementations out there. – David Heffernan Dec 08 '14 at 16:03
  • @DavidHeffernan: I tried Indy implementation. It makes 100 000 hashes in 0.30 sec. My implementation does that in 0.20 sec. The speed gain is more important for me, in this particular case, than these limitations. So I have reinvented a pretty nice wheel ;) – Tom Dec 08 '14 at 16:16
  • That doesn't sound like a useful performance gain at all. I doubt that hashing is ever a bottleneck. Can you really acquire 100,000 things to hash quickly enough for that performance gain to matter. And I'm sure there are faster hashers around than your code if it really did matter. And it's also quite plausible that your benchmark is erroneous. There's loads of scope for improvement of your code. Your `rol` implementation is poor for a start. – David Heffernan Dec 08 '14 at 16:20
  • @DavidHeffernan I am also hashing memory-generated strings in order to find a collision. Perhaps there are faster implementations but I didn't fine any. – Tom Dec 08 '14 at 16:25
  • See also [MD5 Hashing in Delphi 2009](http://stackoverflow.com/a/392745/576719). – LU RD Dec 08 '14 at 21:15

3 Answers3

5

The last step is wrong:

  a0 := a0 + A;
  b0 := b0 + B;
  c0 := c0 + C;
  d0 := d0 + D;

it should change endianess:

  a0 := Swap32(a0 + A);
  b0 := Swap32(b0 + B);
  c0 := Swap32(c0 + C);
  d0 := Swap32(d0 + D);

function Swap32(ALong: Cardinal): Cardinal; Assembler; 
asm 
  BSWAP eax 
end;

and then it's good.

Tom
  • 2,962
  • 3
  • 39
  • 69
  • 1
    One small note... There is a line in the pseudo-code on the wiki `//Note: All variables are unsigned 32 bit and wrap modulo 2^32 when calculating` so shouldn't those Longint's not be changed to DWord's ? – Rik Dec 08 '14 at 15:43
  • @Rik We have binary operations here mostly and it works fine with Longints. But thank you for you note- I will change my code to use Cardinals just to be sure it's good. – Tom Dec 08 '14 at 15:50
  • A Cardinal is a DWord on 32bit but is it guaranteed it will stay a DWord (unsigned 32bit integer) on other platforms/bitness? I think a DWord will always stay 32 bit. (I'm also not sure when there actually will be 32bit wrapping but to be on the safe side it should always be a 32bit unsigned integer, regardless of system, which a Cardinal does not guarantee, i think) – Rik Dec 08 '14 at 16:13
  • @Rik DWord is not a Delphi datatype. It is a Winapi one. Uint32 under Delphi is either Cardinal or Longword. And I read somewhere these are aliases and never change on other platforms. – Tom Dec 08 '14 at 16:23
  • 2
    It's pretty safe to assume that `Cardinal` will be 32 bit everywhere forever. However, if you are nervous about that, then use `LongWord`. – David Heffernan Dec 08 '14 at 16:44
  • Aaah. Okay. I [just read somewhere](http://www.delphibasics.co.uk/RTL.asp?Name=Cardinal) that Cardinal was not guaranteed to be always 32bit in the future but now i see [on the Delphi site](http://docwiki.embarcadero.com/Libraries/XE4/en/System.Cardinal) it should be. But LongWord is a good alternative too. – Rik Dec 08 '14 at 20:04
  • 1
    Well, instead of DWord or Longword or Cardinal, use the explicit UInt32. I expect that to be mapped to the appropiate type, no matter which platform. I am not so sure that Cardinal will always be 32 bit. In the olden days, it was 16 bit, after all. – Rudy Velthuis Dec 09 '14 at 08:42
  • @RudyVelthuis I was ruling out porting back to 16 bit Delphi – David Heffernan Dec 09 '14 at 15:09
  • Sure, but it is not sure that Cardinal will remain 32 bit. There has been talk about making it 64 bit on some platforms, but this was rejected due to user protests. Not sure if it will always be rejected. Sure is that UInt32 will remain 32 bit. – Rudy Velthuis Dec 09 '14 at 17:52
3

You might want to consider using a third-party implementation instead of creating your own. For example, Indy's TIdHashMessageDigest5 class produces the correct value, eg:

uses
  ..., IdHashMessageDigest;

var
  S: string;
begin
  with TIdHashMessageDigest5.Create do
  try
    S := HashStringAsHex('hello world'); // returns '5EB63BBBE01EEED093CB22BB8F5ACDC3'
  finally
    Free;
  end;
end;
Remy Lebeau
  • 555,201
  • 31
  • 458
  • 770
  • 2
    And likewise... FPC has a unit [MD5](http://wiki.freepascal.org/hash) you can use. (there was a tag freepascal attached to this question) – Rik Dec 08 '14 at 20:08
  • And there is also unit `MessageDigest_5` in Delphi. – LU RD Dec 08 '14 at 21:16
2

What about these steps:

append "0" bit until message length in bits ≡ 448 (mod 512)

(56 bytes, 64+56 and so on) and

append original length in bits mod (2 pow 64) to message

but you appended Len in bytes

P.S. I've checked you last variant with Delphi. I've changed char types to AnsiChar, and result is consistent with expected one. Note that byte swapping is not needed for correct binary result. It may help only for constructing hex string from Int32 values

Int32 are already little-endian on Intel hardware, so BB3BB65E (hex string representation) corresponds to byte sequence 5E B6 3B BB and so on.

MBo
  • 77,366
  • 5
  • 53
  • 86