1

I found this Hash function written in Java and with some help from stackoverflow converted it to C. The problem is it gives a different hash value each time it runs on the same word.

Here's the original function:

long sfold(String s, int M) 
{
  int intLength = s.length() / 4;
  long sum = 0;
  for (int j = 0; j < intLength; j++) 
  {
     char c[] = s.substring(j * 4, (j * 4) + 4).toCharArray();
     long mult = 1;
     for (int k = 0; k < c.length; k++) 
     {
        sum += c[k] * mult;
        mult *= 256;
     }
  }

  char c[] = s.substring(intLength * 4).toCharArray();
  long mult = 1;
  for (int k = 0; k < c.length; k++) 
  {
     sum += c[k] * mult;
     mult *= 256;
  }

  return(Math.abs(sum) % M);
 }

And here's how we rewrote it:

include <stdlib.h>
include <stdio.h>
include <math.h>
include <string.h>

long sfold(char * s, int M);

int main(void)
{
    char *  s = "test";
    int M;
    long x;
    M = 525;
    x = sfold(s,M);

    printf("%ld\n",x);
}   
long sfold(char * s, int M)
{
    int intLength = strlen(s) / 4;
    long sum = 0;

    for (int j = 0; j < intLength; j++) 
    {
       char c[4];
       memcpy(c, s + 4 * j, 4);

       //char c[] = s.substring(j * 4, (j * 4) + 4).toCharArray();
       long mult = 1;

       for (int k = 0; k < strlen(c); k++) 
       {
           sum += c[k] * mult;
           mult *= 256;
       }
    }

    char c[intLength];
    memcpy(c,s,intLength);
    //char c[] = s.substring(intLength * 4).toCharArray();
    long mult = 1;

    for (int k = 0; k < strlen(c); k++) 
    {
       sum += c[k] * mult;
       mult *= 256;
    }

    return(abs(sum) % M);
}

Shouldn't this give the same value each time we run the program? Anyone see what's wrong?

DCR
  • 14,737
  • 12
  • 52
  • 115

2 Answers2

3

All that string copying is really silly. What's the point of copying if all you need is the character value?

Here's how it might look in C:

long sfold(char* s, unsigned long M) {
   unsigned long mult = 1, sum = 0;
   while (*s) {
      sum += (uint8_t)(*s++) * mult;
      mult *= 256;
      if (!mult) mult = 1;
   }
   return sum % M;
}

But it's a terrible hash algorithm. You'd be better off with a simple modular hash (which is also not great, but it's not as bad):

/* This could be any small prime */
static const unsigned long mult = 31;
long sfold(char* s, unsigned long M) {
   /* Avoid having the hash of the empty string be 0 */
   unsigned long sum = 0xBEA00D1FUL;
   while (*s)
      sum += (uint8_t)(*s++) * mult;
   return sum % M;
}
rici
  • 234,347
  • 28
  • 237
  • 341
1

I think I took care of most of the bugs for you. I made it C99 compliant, mainly out of habit. The major problem was using strlen(c): c is a character array, not a string (which is a character array terminated with the null '\0' character). You'll need to rewrite your function so that if calloc()/malloc() fails, the function terminates with an error. Or you can go back to variable length arrays like you were using before if your compiler supports it. There are likely better hash functions in other posts on StackOverflow, but this at least helps you getting yours working in a deterministic manner without invoking undefined behavior.

Code Listing


/******************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#define BUF_SIZE  (4)

/******************************************************************************/
long sfold(const char* s, int M);

/******************************************************************************/
int main(void) {
   const char*  s = "test string";
   int M;
   long x;
   M = 525;
   x = sfold(s,M);

   printf("String:%s - Hash:%ld\n", s, x);
}

/******************************************************************************/
long sfold(const char* s, int M) {
   int intLength = strlen(s) / 4;
   char* c = calloc(intLength, sizeof(char));   /* Warning, test if c==NULL, this
                                                 * call can fail.
                                                 */
   long sum = 0;
   int j, k;

   for (j=0; j<intLength; j++) {
      char c[BUF_SIZE];
      memcpy(c, s + BUF_SIZE * j, BUF_SIZE);

      //char c[] = s.substring(j * 4, (j * 4) + 4).toCharArray();
      long mult = 1;

      for (k=0; k<BUF_SIZE; k++) {
         sum += c[k] * mult;
         mult *= 256;
      }
   }

   memcpy(c, s, intLength);
   //char c[] = s.substring(intLength * 4).toCharArray();
   long mult = 1;

   for (k=0; k<BUF_SIZE; k++) {
      sum += c[k] * mult;
      mult *= 256;
   }

   free(c);
   return(abs(sum) % M);
}

Sample Output


for i in $(seq 1 5); do echo $i; ./a.out; done
1
String:test string - Hash:384
2
String:test string - Hash:384
3
String:test string - Hash:384
4
String:test string - Hash:384
5
String:test string - Hash:384
Community
  • 1
  • 1
Cloud
  • 18,753
  • 15
  • 79
  • 153
  • Thank you. When I run valgrind I get: valgrind ./test2 ==345== Command: ./test2 ==345== ==345== Invalid read of size 1 ==345== at 0x80486F2: sfold (test2.c:50) ==345== by 0x80485B7: main (test2.c:18) ==345== Address 0x423c02a is 0 bytes after a block of size 2 alloc'd ==345== at 0x402C109: calloc (in /usr/lib/valgrind/vgpreload_memcheck-x86-linux.so) ==345== by 0x804862E: sfold (test2.c:26) ==345== by 0x80485B7: main (test2.c:18) ==345== ==345== ==345== ERROR SUMMARY: 2 errors from 1 contexts (suppressed: 0 from 0) – DCR Feb 25 '15 at 17:48
  • @DCR also, please take into consideration the points rici made in his/her answer: they are very valid points. – Cloud Feb 25 '15 at 17:49
  • Also note that this hash function (like your attempt) does not do the same as the original hash function, which first processes all 4-byte chunks and then treats the remaining bytes in an extra loop. This code has the substring wrong. It passes over the 4-byte chunks twice, once with a 4.-byte stride and then in a single byte stride. The present code will give the same hash value for all words with fewer than 4 letters, for example, namely 0. – M Oehm Feb 25 '15 at 18:05