2

I worked on this hashtable on my windows machine and everything was gravy. I just transferred it to my linux machine and now my cout in the print function is getting completely garbled. In test1 you can see it outputting all of the i values, and in test2 you can see it only outputting i when i = 6. Is it something I'm doing in the processFile function?

int main ( ) {

    Hash hashTable;
    cout << setprecision ( 10 );
    cout << "Test 1 - print empty table" << endl;
    hashTable.print ();
    cout << "-------------------------------------------------------------"
    << endl;
    cout << "Test 2 - processing input file" << endl;
    hashTable.processFile ("test1.txt");
    hashTable.print ();
}

void Hash::print ()
{
    for(int i=0;i<HASH_TABLE_SIZE;i++)
    {
        cout << i << ":\t";
        for(list<string>::iterator it=hashTable[i].begin(); it != hashTable[i].end();++it)
            cout << *it << ", ";
        cout << endl;
    }
}

void Hash::processFile (string filename)
{
    ifstream fp(filename.c_str());
    if (!fp.is_open()) {
        cout << "can't open file " << endl;
    }
    string word;
    while (getline(fp, word)) {

        int key = hf(word);
        if(!hashTable[key].empty())
            collisions++;

        hashTable[key].push_back(word);
        int x = hashTable[key].size();
        if(x > longestList)
            longestList = x;

        double totalLength = 0;
        for(int i=0;i<HASH_TABLE_SIZE;i++)
            totalLength+=hashTable[i].size();
        double newAvgLength = totalLength/HASH_TABLE_SIZE;
        avgLength = (newAvgLength + avgLength) / 2.0;

    }
    fp.close();
}

output:

Test 1 - print empty table
0:  
1:  
2:  
3:  
4:  
5:  
6:  
7:  
8:  
9:  

Test 2 - processing input file
, ttttt ooooo
, rrrrr ppppp
,   iiiii
, nnnnn heath
,   jjjjj
, harps hello
6:  uuuuu, 
, qqqqq mmmmm
,   lllll
, sssss kkkkk

Expected output:

Test 1 - print empty table
0:
1:
2:
3:
4:
5:
6:
7:
8:
9:


Test 2 - processing input file
0:
1:
2:      kkkkk, lllll, ppppp, sssss,
3:      hello, heath, harps,
4:
5:      happy, jjjjj, nnnnn, ooooo,
6:      iiiii, uuuuu,
7:      ttttt,
8:      mmmmm, rrrrr,
9:      qqqqq,

Hashing Function

Hash::hf (string ins) {
return MurmurHash2(ins.c_str(),ins.size(),11); 
}
unsigned int Hash::MurmurHash2 (const void *key, int len, unsigned int seed )
{
// 'm' and 'r' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
const unsigned int m = 0x5bd1e995;
const int r = 24;

// Initialize the hash to a 'random' value

unsigned int h = seed ^ len;

// Mix 4 bytes at a time into the hash

const unsigned char * data = (const unsigned char *)key;

while(len >= 4)
{
    unsigned int k = *(unsigned int *)data;

    k *= m; 
    k ^= k >> r; 
    k *= m; 

    h *= m; 
    h ^= k;

    data += 4;
    len -= 4;
}

// Handle the last few bytes of the input array

switch(len)
{
case 3: h ^= data[2] << 16;
case 2: h ^= data[1] << 8;
case 1: h ^= data[0];
        h *= m;
};

// Do a few final mixes of the hash to ensure the last few
// bytes are well-incorporated.

h ^= h >> 13;
h *= m;
h ^= h >> 15;

return h % HASH_TABLE_SIZE;
}

Hash Definition

#ifndef __HASH_H
#define __HASH_H

#include <string>
#include <list>
#include <iostream>
#include <sstream>
#include <vector>

using namespace std;

class Hash {

public:
void remove (string word);
void print ();
void processFile (string filename);
bool search (string word);
void output (string filename);
void printStats ();

private:
list<string> hashTable [HASH_TABLE_SIZE];
int collisions;
int longestList;
double avgLength;

private:
int hf (string word);
unsigned int MurmurHash2 (const void *key, int len, unsigned int seed);
public:
Hash(){collisions = 0; longestList = 0; avgLength = 0;}

};

#endif
CodeManiak
  • 1,903
  • 4
  • 19
  • 32
  • 1
    I see nothing in the `print` function which should cause such a problem. But can you provide the *complete* code instead (i.e. a [SSCCE](http://sscce.org/))? Including the `Hash` definition. There may be something in the code you don't show. – Some programmer dude Nov 06 '13 at 06:43
  • 1
    Oh, and run in a debugger and step through line by line, while checking values of variables. If you can't do that, add extra debug output, for example in the `processFile` function, to print all variables. – Some programmer dude Nov 06 '13 at 07:44
  • 2
    The definition of `hf()` would likely shed a ton of light on this issue, particularly how it relates to `HASH_TABLE_SIZE` (it better return a value from `0.. HASH_TABLE_SIZE-1`). And of course the member definition of `hashTable` is equally important. Post *both*. – WhozCraig Nov 06 '13 at 08:42
  • @WhozCraig I added the requested definitions. Hopefully this helps make it more clear! – CodeManiak Nov 06 '13 at 17:29

1 Answers1

2

Your input file is corrupted. Specifically, it uses Windows-style line endings instead of UNIX-style line endings.

To confirm this run, on your Linux computer, file /tmp/xx (where /tmp/xx is replaced by the name of your file). The output will say "ASCII text, with CRLF line terminators".

Robᵩ
  • 163,533
  • 20
  • 239
  • 308
  • Fixed this issue by recreating my input files. Thanks! – CodeManiak Nov 06 '13 at 17:34
  • 1
    With a variety of tools. If you have `vim`, edit the file, issue `:set fileformat=unix`, `:w`. Maybe you have `dos2unix`. Or you could `tr -d '\r' < /tmp/xx > /tmp/yy`. – Robᵩ Nov 06 '13 at 17:36
  • Alternatively, you could modify your program along the lines of [this question](http://stackoverflow.com/questions/216823/whats-the-best-way-to-trim-stdstring). – Robᵩ Nov 06 '13 at 17:38