0

I currently have a really basic MD5 hash algorithm working. (All I can call atm is MD5(const char*)) However, it's going to be limited to small files. i.e 32bit system will only give me up to 4GB or less of a file. Plus why in the world, would I want to load anything that's even close to 1GB into memory. ;) So that brings me to this question...

How can I hash large files? I noticed OpenSSL (Which I will use sometime in the future) use MD5 hash update function, when loading parts of a file into memory. So what exactly happens when 'updating' a MD5 hash? Is there an pseudo code on the internet to do this or examples anywhere?

P.S I'm sorta new to the crypto world. So forgive me if I ask any follow up questions to anyone's answer. Like to trying things out the hard way before I take the easy way. Best way to learn! ;)

My MD5 Header

    #ifndef MD5_H
    #define MD5_H

    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    #include <math.h>
    #include <ctype.h>

    #define MAX_MD5_HASH_LENGTH 32


    typedef union uwb {
        unsigned w;
        unsigned char b[4];
    } WBunion;

    typedef unsigned Digest[4];

    unsigned f0( unsigned abcd[] );

    unsigned f1( unsigned abcd[] );

    unsigned f2( unsigned abcd[] );

    unsigned f3( unsigned abcd[] );

    typedef unsigned (*DgstFctn)(unsigned a[]);

    unsigned *calcKs( unsigned *k);
    unsigned rol( unsigned v, short amt );
    unsigned *md5( const char *msg, int mlen);

    char* convertRawMd5HashToString(unsigned* rawMd5);
    int isValidMd5(const char* md5String);

    #endif

Source File

#include "md5.h"

unsigned *calcKs( unsigned *k)
{
    double s, pwr;
    int i;

    pwr = pow( 2, MAX_MD5_HASH_LENGTH);
    for (i=0; i<64; i++) {
        s = fabs(sin(1+i));
        k[i] = (unsigned)( s * pwr );
    }
    return k;
}

// ROtate v Left by amt bits
unsigned rol( unsigned v, short amt )
{
    unsigned  msk1 = (1<<amt) -1;
    return ((v>>(MAX_MD5_HASH_LENGTH-amt)) & msk1) | ((v<<amt) & ~msk1);
}

unsigned *md5( const char *message, int messageLength) 
{
    static const Digest h0 = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476 };
    static const DgstFctn ff[] = { &f0, &f1, &f2, &f3 };
    static const short M[] = { 1, 5, 3, 7 };
    static const short O[] = { 0, 1, 5, 0 };
    static const short rot0[] = { 7,12,17,22};
    static const short rot1[] = { 5, 9,14,20};
    static const short rot2[] = { 4,11,16,23};
    static const short rot3[] = { 6,10,15,21};
    static const short *rots[] = {rot0, rot1, rot2, rot3 };
    static unsigned kspace[64];
    static unsigned *k;

    static Digest h;
    Digest abcd;
    DgstFctn fctn;
    short m, o, g;
    unsigned f;
    short *rotn;
    union {
        unsigned w[16];
        char     b[64];
    }mm;
    int os = 0;
    int grp, grps, q, p;
    unsigned char *msg2;

    if (k==NULL) k= calcKs(kspace);

    for (q=0; q<4; q++) h[q] = h0[q];   // initialize
    {
        grps  = 1 + (messageLength+8)/64;
        msg2 = malloc( 64*grps);
        memcpy( msg2, message, messageLength);
        msg2[messageLength] = (unsigned char)0x80;  
        q = messageLength + 1;
        while (q < 64*grps){ msg2[q] = 0; q++ ; }
        {
            WBunion u;
            u.w = 8*messageLength;
            q -= 8;
            memcpy(msg2+q, &u.w, 4 );
        }
    }

    for (grp=0; grp<grps; grp++)
    {
        memcpy( mm.b, msg2+os, 64);
        for(q=0;q<4;q++) abcd[q] = h[q];
        for (p = 0; p<4; p++) {
            fctn = ff[p];
            rotn = rots[p];
            m = M[p]; o= O[p];
            for (q=0; q<16; q++) {
                g = (m*q + o) % 16;
                f = abcd[1] + rol( abcd[0]+ fctn(abcd) + k[q+16*p] + mm.w[g], rotn[q%4]);

                abcd[0] = abcd[3];
                abcd[3] = abcd[2];
                abcd[2] = abcd[1];
                abcd[1] = f;
            }
        }
        for (p=0; p<4; p++)
            h[p] += abcd[p];
        os += 64;
    }

    if( msg2 )
        free( msg2 );

    return h;
}

char* convertRawMd5HashToString(unsigned* rawMd5)
{
    static char* outputBuffer[MAX_MD5_HASH_LENGTH];
    memset(outputBuffer, 0, MAX_MD5_HASH_LENGTH);

    int j, k;
    WBunion u;
    for (j=0;j<4; j++){
        u.w = rawMd5[j];
        for (k=0;k<4;k++) sprintf(outputBuffer, "%s%02x", outputBuffer, u.b[k]);
    }

    return outputBuffer;
}

unsigned f0( unsigned abcd[] ){
    return ( abcd[1] & abcd[2]) | (~abcd[1] & abcd[3]); }

unsigned f1( unsigned abcd[] ){
    return ( abcd[3] & abcd[1]) | (~abcd[3] & abcd[2]);}

unsigned f2( unsigned abcd[] ){
    return  abcd[1] ^ abcd[2] ^ abcd[3];}

unsigned f3( unsigned abcd[] ){
    return abcd[2] ^ (abcd[1] |~ abcd[3]);}

int isValidMd5(const char* md5String)
{
    if(strlen(md5String) != MAX_MD5_HASH_LENGTH)
        return 0;

    for (int i = 0; i < MAX_MD5_HASH_LENGTH; ++i) {
        char c = tolower(md5String[i]);
        if((c >= 'a' && c <= 'f') || isdigit(c)) {
            continue;
        } else {
            return 0;
        }
    }

    return 1;
}

I couldn't find the original author, but if anyone knows who written most of this snippet, let me know. Thanks. :)

ajm113
  • 936
  • 1
  • 8
  • 18
  • Are you using a particular language/library? They generally have their own methods and documentation on how this is accomplished... – Jon Clements Aug 18 '16 at 23:13
  • I'm using C, I taken a MD5 generator written for C somewhere on the net, and modified it to be a tiny bit faster. I'll see if I can find the source, if you like and update my question with that info. – ajm113 Aug 18 '16 at 23:15
  • I just went ahead and posted the code. – ajm113 Aug 18 '16 at 23:19
  • 1
    Have you tried breaking the file into chunks/blocks(like chunks of 128 Bytes) and feed them to the MD5 consecutively or parallelly. Maybe you should look into this Topic for more help,[link](http://stackoverflow.com/questions/7015544/calculating-a-hash-code-for-a-large-file-in-parallel) – user4925 Aug 18 '16 at 23:50
  • 2
    @ajm113 Why are you declaring `static char* outputBuffer[MAX_MD5_HASH_LENGTH];` ?? You do not need an *array of pointers to char*, you need an *array of char*. – David C. Rankin Aug 18 '16 at 23:54
  • 1
    Why `memcpy( msg2, message, messageLength);`, that doubles the memory footprint. It is really not good code. – zaph Aug 18 '16 at 23:56
  • 2
    @ajm113 Be cautious using this code. It comes from [**Rosettacode-MD5**](https://rosettacode.org/wiki/MD5#C), it comes with the warning "*Needs review - differences observed for the last 8 characters when compared with openssl implementation*". – David C. Rankin Aug 19 '16 at 00:21
  • @DavidC.Rankin Just noticed that. I must of forgot to remove the * after I converted it. Thanks for the heads up. :) – ajm113 Aug 19 '16 at 01:12
  • @zaph Yeah I noticed some odd ball stuff about the code, and haven't spent enough time looking at it. – ajm113 Aug 19 '16 at 01:13
  • 1
    Minor: Avoid UB, use `unsigned msk1 = (1u< – chux - Reinstate Monica Aug 19 '16 at 01:37

1 Answers1

1

The padding happens in the first for loop, you need to postpone that until the end of the data. Then you can run as much data through the second for loop until you get to the end and then add the padding. That will also allow the code to be changes such that a copy if the data does not need to be made. Split it up into init, update and finalize functions. That should not be to hard to code-up.

Of course a better idea is to use a version that already has the function split into init, update, finalize. Interestingly Apple Common Crypto is open source and written in "C", take a look at it. MD code The code is in CommonDigestPriv.h.

zaph
  • 111,848
  • 21
  • 189
  • 228
  • Very cool, thank you! I think it would be better to simply rewrite the current implementation I have anyway and see some legit better implementations before doing anything further. – ajm113 Aug 19 '16 at 01:16