2

I'm trying to create 8 hand coded fast hash algorithms with as much parallelism (i.e. CPU pipelining) as is reasonable for fixed length keys of size 1..8 bytes. I am creating 8 hash functions in order that the key size is a compile time constant. I am looking at common hash algorithms, and trying to unroll them. At the same time I thought I would just ask around to see if anyone had a link or knew a way to do it.

Our set of characters being hashed is normally just 0..9 A..Z SPC with the occasional binary short or long. I am aware the 1 char hash is likely useless, so I will make it so our code won't compile with that small a hash key. Perfection is not as important as speed. This means, the occasional bucket with 2 times the items is ok if we eliminate enough CPU cost generating the hash. I have extensively profiled our code for years now, so I know how much money it will save our company if I take the next step to make the hashing algorithm better/faster.

I can already do better than the hash function our apps currently use, UlHashOld below (which will likely be changed as well), but I think with some help from SO I can make it a little better. For instance, my hash function for 2 chars cuts the max bucket count in half compared to UlHashOld, but it still has a number of unfilled buckets. If would like to get the two char hash to better spread out its bits when doing alpha numeric keys. I've tested this with the following code:

template <int cb> class CFixLenKeyBase {  
protected:
    char m_ach[cb];
public:
    CFixLenKeyBase(LPCSTR sz) { memcpy(m_ach, sz, cb); }
    UL UlHashOld() {
        UL n = 0;
        for (LPCSTR sz = m_ach, szEnd = sz + cb; sz < szEnd; sz++) 
            n = n ^ (n >> 25) ^ (n << 7) ^ *sz;
        return n;
    }
};

template <int cb> class CFixLenKey : public CFixLenKeyBase<cb> {  
public:
    CFixLenKey(LPCSTR sz) : CFixLenKeyBase(sz) { }
};

template <> class CFixLenKey<1> : public CFixLenKeyBase<1> { 
public:
    CFixLenKey(LPCSTR sz) : CFixLenKeyBase(sz) { }
    UL UlHash(void) const { return *m_ach; }
};

template <> class CFixLenKey<2> : public CFixLenKeyBase<2> { 
public:
    CFixLenKey(LPCSTR sz) : CFixLenKeyBase(sz) { }
    UL UlHash(void) const { 
        auto a = *(WORD*)m_ach; 
        return b = (a >> 6) ^ a;
    }
};

template <> class CFixLenKey<3> : public CFixLenKeyBase<3> { 
public:
    CFixLenKey(LPCSTR sz) : CFixLenKeyBase(sz) { }
    UL UlHash(void) const { 
        UL a = (*(UL*)m_ach)&0xFFFFFF;
        UL b = (a >> 14) ^ (a >> 6);
        UL c = (a >> 11) ^ (a << 5);
        UL d = (a >> 19) ^ (a << 9);
        return d ^ c ^ b;
    }
};

char a[36*36*36*36][4] = {0};

int main(int argc, char* argv[])
{
    const int cBuckets = 256;
    int aHisto[4][cBuckets] = { 0 };
    int aHistoOld[4][cBuckets] = { 0 };
    int aMax[4] = { 0 };
    int cElem = 36*36*36*36;
    memcpy(a[0], "0000", 4);
    for (int iElem=1 ; iElem<cElem ; ++iElem) {
        auto& b = a[iElem];
        memcpy(b,a[iElem-1], 4);
        int i = 3;
        while (1) {
            b[i]++;
            if (b[i] == '9' + 1) {
                b[i] = 'A';
            } else if (b[i] == 'Z' + 1) {
                b[i] = '0';
                --i;
            } else {
                break; 
            }
        }
    }
    int fMask = cBuckets - 1;
    int ah[4] = {0}, ahOld[4] = {0}, aSame[4] = {0},aSameOld[4] = {0};
    for (int iElem=0 ; iElem<cElem ; ++iElem) {
        int h, hOld, n, cb;
        cb = 0;
        CFixLenKey<1> A(a[iElem]);
        if ((h = A.UlHash()) == ah[cb] && memcmp(a+iElem, a+iElem-1,cb))
            aSame[cb]++;
        if ((hOld = A.UlHashOld()) == ahOld[cb] && memcmp(a+iElem, a+iElem-1,cb))
            aSameOld[cb]++;
        ahOld[cb] = hOld; ah[cb]=h;
        aHisto[cb][h&fMask]++;
        aHistoOld[cb][hOld&fMask]++;

        cb = 1;
        CFixLenKey<2> B(a[iElem]);
        if ((h = B.UlHash()) == ah[cb] && memcmp(a+iElem, a+iElem-1,cb))
            aSame[cb]++;
        if ((hOld = B.UlHashOld()) == ahOld[cb] && memcmp(a+iElem, a+iElem-1,cb))
            aSameOld[cb]++;
        ahOld[cb] = hOld; ah[cb]=h;
        aHisto[cb][h&fMask]++;
        aHistoOld[cb][hOld&fMask]++;

        cb = 2;
        CFixLenKey<3> C(a[iElem]);
        if ((h = C.UlHash()) == ah[cb] && memcmp(a+iElem, a+iElem-1,cb))
            aSame[cb]++;
        if ((hOld = C.UlHashOld()) == ahOld[cb] && memcmp(a+iElem, a+iElem-1,cb))
            aSameOld[cb]++;
        ahOld[cb] = hOld; ah[cb]=h;
        aHisto[cb][h&fMask]++;
        aHistoOld[cb][hOld&fMask]++;

    }
    for (int cb=0 ; cb<4 ; ++cb) {
        int nMin=aHisto[cb][0], nMax = nMin;
        for (int i=0 ; i<fMask ; ++i) {
            if (aHisto[cb][i] < nMin)
                nMin = aHisto[cb][i];
            if (aHisto[cb][i] > nMax)
                nMax = aHisto[cb][i];
        }
        int nMinOld=aHistoOld[cb][0], nMaxOld = nMinOld;
        for (int i=0 ; i<fMask ; ++i) {
            if (aHistoOld[cb][i] < nMinOld)
                nMinOld = aHistoOld[cb][i];
            if (aHistoOld[cb][i] > nMaxOld)
                nMaxOld = aHistoOld[cb][i];
        }

        printf("%d %03d %03d %03d %03d Same: %04d %04d\n", cb, nMin, nMax, nMinOld, nMaxOld, aSame[cb], aSameOld[cb]);
        //for (int i=0 ; i<fMask ; ++i) {
        //  printf("%03d ", aHisto[cb][i]);
        //  if ((i&31) == 31) 
        //      printf("\n");
        //}
        printf("\n");
    }

    return 0;
}
johnnycrash
  • 5,184
  • 5
  • 34
  • 58
  • Have you tried taking the 8 bytes input as a 64-bit unsigned integer (with zero padding if necessary) and then modulo by a prime number? – André Sassi Oct 04 '14 at 21:50

1 Answers1

0

I wrote about hashing here and here.

The first answer takes up generic issues usually encountered when experimenting with hashing, it may give you a head start.

The second deals with (among other things) fine-tuning the hash algorithm to the input at hand. Even if you do not know exactly what it will look like you could implement statistics-gathering in your live/production function (# of rehashes when adding a hash entry, for example) and have a parallell thread which tests for alternate combinations of index size and rehash constants, doing a dummy hash of all input values and comparing the resulting number of rehashes. This way it is possible to continuously fine-tune your algorithm even after it has been put into production.

My hashing has a twist in that the bucket area is not separate from the primary index area. Not the "this is the way you should implement hashing" but a method which I find superior: the logic and inside loop becomes much simpler i e faster which is what hashing is all about.

Community
  • 1
  • 1
Olof Forshell
  • 3,169
  • 22
  • 28