4

This is a follow-up question to this question.

Since I solved part of it and I have the feeling that the remaining problem is unrelated to the first one, I decided to separate the two parts into two questions.

I have implemented an associated array using POSIX hcreate/hsearch as suggested here.

For the sake of completeness, here is all the code except for the main() function from the previous question:

#include <inttypes.h> /* intptr_t             */
#include <search.h>   /* hcreate(), hsearch() */
#include <stdio.h>    /* perror()             */
#include <stdlib.h>   /* exit()               */
#include <string.h>   /* strcpy()             */

void exit_with_error(const char* error_message){
  perror(error_message);
  exit(EXIT_FAILURE);
}
int fetch(const char* key, intptr_t* value){
  ENTRY e,*p;
  e.key=(char*)key;
  p=hsearch(e, FIND);
  if(!p) return 0;
  *value=(intptr_t)p->data;
  return 1;
}

void store(const char *key, intptr_t value){
  ENTRY e,*p;
  e.key=(char*)key;
  p = hsearch(e, ENTER);
  if(!p) exit_with_error("hash full");
  p->data = (void *)value;
}

and here is a slightly extended main() function:

int main(){
  char a[4]="foo";
  char b[4]="bar";
  char c[4]="baz";
  char t[4]="";
  char y='\0';
  const char l[6]={'a','b','f','o','r','z'};
  intptr_t x=NULL;
  size_t i=0,j=0;

  if(!hcreate(50)) exit_with_error("no hash");

  strcpy(t,b);
  store(t,0);
  if(fetch(t,&x)) printf("stored %s-->%d\n",t,(int)x);
  else printf("%s not stored\n",t);

  for(i=0;i<3;++i){
    y=t[i];
    for(j=0;j<6;++j){
      if(l[j]==y) continue;
      t[i]=l[j];
      if(fetch(t,&x)) store(t,-1);
      else store(t,1);
      if(fetch(t,&x)) printf("stored %s-->%d\n",t,(int)x);
      else printf("%s not stored\n",t);
    }   
    t[i]=y;
  }

  strcpy(t,a); if(fetch(t,&x)) printf("read %s-->%d\n",t,(int)x); else printf("%s not found\n",t);
  strcpy(t,b); if(fetch(t,&x)) printf("read %s-->%d\n",t,(int)x); else printf("%s not found\n",t);
  strcpy(t,c); if(fetch(t,&x)) printf("read %s-->%d\n",t,(int)x); else printf("%s not found\n",t);

  exit(EXIT_SUCCESS);
}

As you can see, I take the string bar and associate it with 0. Then, I take all words that differ in exactly one letter from bar and associate them to 1, if they were not added before, or -1 otherwise. Finally, I lookup foo, bar & baz resulting in the expected values:

stored bar-->0
stored aar-->1
stored far-->1
stored oar-->1
stored rar-->1
stored zar-->1
stored bbr-->-1
stored bfr-->1
stored bor-->1
stored brr-->1
stored bzr-->1
stored baa-->1
stored bab-->1
stored baf-->1
stored bao-->1
stored baz-->1
foo not found
read bar-->0
read baz-->1

Now I slightly modify the function replacing foo, bar & baz by CCCTTCTTATCG & CCCTTCATTGCG:

int main(){
  char a[13]="CCCTTCTTATCG"
            /*|||||| |  ||*/
  char b[13]="CCCTTCATTGCG";
  char t[13]="";
  char y='\0';
  const char l[4]={'A','C','G','T'};
  intptr_t x=NULL;
  size_t i=0,j=0;

  if(!hcreate(150)) exit_with_error("no hash");

  strcpy(t,a);
  store(t,0); 
  if(fetch(t,&x)) printf("stored %s-->%d\n",t,(int)x);
  else printf("%s not stored\n",t);

  for(i=0;i<12;++i){
    y=t[i];
    for(j=0;j<4;++j){
      if(l[j]==y) continue;
      t[i]=l[j];
      if(fetch(t,&x)) store(t,-1);
      else store(t,1);
      if(fetch(t,&x)) printf("stored %s-->%d\n",t,(int)x);
      else printf("%s not stored\n",t);
    }   
    t[i]=y;
  }

  strcpy(t,a); if(fetch(t,&x)) printf("read %s-->%d\n",t,(int)x); else printf("%s not found\n",t);
  strcpy(t,b); if(fetch(t,&x)) printf("read %s-->%d\n",t,(int)x); else printf("%s not found\n",t);

  exit(EXIT_SUCCESS);
}

For clarity here is the corresponding diff:

29,32c29,32
<   char a[4]="foo";
<   char b[4]="bar";
<   char c[4]="baz";
<   char t[4]="";
---
>   char a[13]="CCCTTCTTATCG";
>             /*|||||| |  ||*/
>   char b[13]="CCCTTCATTGCG";
>   char t[13]="";
34c34
<   const char l[6]={'a','b','f','o','r','z'};
---
>   const char l[4]={'A','C','G','T'};
38c38
<   if(!hcreate(50)) exit_with_error("no hash");
---
>   if(!hcreate(150)) exit_with_error("no hash");
40c40
<   strcpy(t,b);
---
>   strcpy(t,a);
45c45
<   for(i=0;i<3;++i){
---
>   for(i=0;i<12;++i){
47c47
<     for(j=0;j<6;++j){
---
>     for(j=0;j<4;++j){
60d59
<   strcpy(t,c); if(fetch(t,&x)) printf("read %s-->%d\n",t,(int)x); else printf("%s not found\n",t);

As you can see, CCCTTCATTGCG has 3 edits from CCCTTCTTATCG (like foo from bar) and thus should not be found in the hash table.

However, this is the corresponding output:

stored CCCTTCTTATCG-->0
stored ACCTTCTTATCG-->1
stored GCCTTCTTATCG-->1
stored TCCTTCTTATCG-->1
stored CACTTCTTATCG-->1
stored CGCTTCTTATCG-->1
stored CTCTTCTTATCG-->1
stored CCATTCTTATCG-->1
stored CCGTTCTTATCG-->1
stored CCTTTCTTATCG-->1
stored CCCATCTTATCG-->1
stored CCCCTCTTATCG-->1
stored CCCGTCTTATCG-->1
stored CCCTACTTATCG-->1
stored CCCTCCTTATCG-->1
stored CCCTGCTTATCG-->1
stored CCCTTATTATCG-->1
stored CCCTTGTTATCG-->1
stored CCCTTTTTATCG-->1
stored CCCTTCATATCG-->1
stored CCCTTCCTATCG-->1
stored CCCTTCGTATCG-->1
stored CCCTTCTAATCG-->1
stored CCCTTCTCATCG-->1
stored CCCTTCTGATCG-->1
stored CCCTTCTTCTCG-->-1
stored CCCTTCTTGTCG-->-1
stored CCCTTCTTTTCG-->-1
stored CCCTTCTTAACG-->-1
stored CCCTTCTTACCG-->-1
stored CCCTTCTTAGCG-->-1
stored CCCTTCTTATAG-->-1
stored CCCTTCTTATGG-->-1
stored CCCTTCTTATTG-->-1
stored CCCTTCTTATCA-->-1
stored CCCTTCTTATCC-->-1
stored CCCTTCTTATCT-->-1
read CCCTTCTTATCG-->-1
read CCCTTCATTGCG-->1

As you can see, CCCTTCTTATCG is associated with -1 even though it was never stored.

This is also true for all strings that got -1 assigned on storage since they were already in the hash when they were about to be inserted.

This is also true for bbr in the smaller example above.

How come hsearch returns 1 for these keys even though they have never been inserted?

mschilli
  • 1,884
  • 1
  • 26
  • 56
  • 2
    Home come the question is about `hsearch()` and that function is not posted here? – chux - Reinstate Monica Jan 12 '16 at 20:18
  • 2
    The hcreate(), hdestroy(), and hsearch() functions first appeared in AT&T System V UNIX. At least that's what the man page says. – user3386109 Jan 12 '16 at 20:39
  • 2
    My guess: `hsearch` keeps the provided key string in the hash table without making a copy, so you can't just keep reusing the same buffer to build keys and insert them. Modifying the buffer after you've used it to do a store causes the hash table to become internally inconsistent. –  Jan 12 '16 at 21:18
  • 1
    With POSIX, it is [`int main()`](http://stackoverflow.com/questions/204476/), not `void main()`. – Jonathan Leffler Jan 12 '16 at 21:22
  • the posted code contains several 'magic' numbers. for instance: 3, 4, 6, 50. 'magic' numbers make the code more difficult to understand, debug, maintain. Suggest using #define's or an enum to give those numbers meaningful names then use those meaningful names throughout the code. – user3629249 Jan 12 '16 at 21:28
  • For ease of understanding of the code by us humans, please follow the axiom: only one statement per line and (at most) one variable declaration per statement. – user3629249 Jan 12 '16 at 21:29
  • this statement is not correct: `for(i<0;i<3;++i)` The first parameter is the initializer, not a comparison. Suggest: `for( i=0; i<3; ++i )` – user3629249 Jan 12 '16 at 21:32
  • regarding these two lines: `if(fetch(t,&x)) store(t,-1); else store(t,1);` the function: `store()` is expecting the second parameter to be a pointer to an integer, not an integer. So the parameters are not correct. – user3629249 Jan 12 '16 at 21:50
  • @chux: `hsearch()` is called in both `store()` & `fetch()` defined above. – mschilli Jan 13 '16 at 07:17
  • @user3629249: Thx for spotting the `for(i<0;....` typo. I fixed it. – mschilli Jan 13 '16 at 07:20
  • @JonathanLeffler: I just used `void` in the MWE. Usually I have `int` (as well as `argc` and `argv`). But since this is tagged [tag:posix], I changed it. – mschilli Jan 13 '16 at 07:24
  • @user3629249: regarding the parameter types: `intptr_t` is not a pointer, it's an `int` big enough to store any pointer, afaik. So while `fetch()` asks for a pointer `intptr_t*` to return the value, `store()` doesn't. Note that for `fetch()` I also always pass `&x`, which is exactly the type asked for. – mschilli Jan 13 '16 at 07:29

1 Answers1

6

The man page indicates that the key should be a string allocated with malloc. Here are a couple relevant quotes from the man page

The hdestroy() function calls free(3) for each comparison key in the search table but not the data item associated with the key.

and

The comparison key (passed to hsearch() as item.key) must be allocated using malloc(3) if action is ENTER and hdestroy() is called.

So calling hsearch with action ENTER stores a pointer in the hash, and that pointer is expected to point to a string that can be later freed. In your code, you have one single string buffer, and every time you add an entry to the hash table, you pass the same pointer (which points to your string buffer). That's going to make a mess of the hash table.

Fixing the problem is simple. In the store function, make a copy of the key with strdup before adding the entry to the table. In other words, replace

e.key=(char*)key;

with

e.key=strdup(key);
user3386109
  • 34,287
  • 7
  • 49
  • 68
  • He never calls `hdestroy()`, so I don't see how this is relevant. – Barmar Jan 12 '16 at 21:07
  • 1
    @Barmar Because the hash table doesn't make copies of the string. It just stores the pointer. If all the pointers point to the same buffer, then all of the strings in the hash table are whatever is currently in the buffer. To put it another way, you aren't allowed to make changes to the contents of the buffer after using the buffer as a key. – user3386109 Jan 12 '16 at 21:09
  • 2
    Interestingly, that text doesn't appear in the POSIX spec http://pubs.opengroup.org/onlinepubs/9699919799/ however, the example code is careful not to use the same pointers for each of the keys. Rather than using `malloc`, it has a long `char` array and reads each key into a different location in the array. – Barmar Jan 12 '16 at 21:18
  • 1
    It seems like the documentation should say somewhere that the key strings should not be modified after you store them. – Barmar Jan 12 '16 at 21:20
  • 1
    @Barmar Yes, the documentation should be very clear about that. It's implied (if ability to `free` is assumed to indicate ownership), but it should be explicitly stated. – user3386109 Jan 12 '16 at 21:25
  • The POSIX spec doesn't even mention that it has to be allocated using `malloc` if `hdestroy` is called. – Barmar Jan 12 '16 at 21:27
  • @Barmar In that case, POSIX may have mandated that the implementation make a copy of the key. Which creates compatibility issues with the older implementations. Also, if POSIX *does* require that the `key` be internally copied, then the documentation should explicitly state that the caller retains ownership of the `key`. – user3386109 Jan 12 '16 at 21:32
  • In fact, it says _The `hcreate()` and `hsearch()` functions may use `malloc()` to allocate space._ `hcreate` obviously needs to do it to create the hash table itself, but the only reason `hsearch` would need to is so it can copy the keys. – Barmar Jan 12 '16 at 21:37
  • @Barmar `hsearch` will need to allocate memory to handle collisions, e.g. managing a linked-list of entries for a given hash value . And/or it may need to allocate memory to expand (i.e. rehash) the table. – user3386109 Jan 12 '16 at 21:45
  • @user3386109 afik, by default `hsearch` uses open addressing & doesn't support expanding the table. – mschilli Jan 13 '16 at 06:49
  • 1
    This was really not clear from the documentation for me. I makes sense though. And most importantly it works. – mschilli Jan 13 '16 at 07:30
  • @Barmar: Regarding `hdestroy()` the [source](http://rosettacode.org/wiki/Associative_arrays/Creation/C#To_fetch_or_store) for the implementation explicitly warns not to use `hdestroy()` declaring it broken. Can you comment on that? – mschilli Jan 13 '16 at 07:32
  • @mschilli It's just obeying the rule stated in the documentation quoted above: you can only call `hdestroy()` if the keys were allocated dynamically. Since the keys are string literals, they can't be freed. – Barmar Jan 13 '16 at 16:06
  • 1
    It is worth pointing out that the stipulation for a key to be allocated using `malloc()` is *not* POSIX, and I could only find this in the BSD manpages. And in fact, the man pages on my Linux machine explicitly state that `hdestroy()` does *not* attempt to free the buffer pointed to by `key`. – josh798 Jun 09 '22 at 12:50