Edit: So i noticed my article.txt file has characters which were not ASCII .So i changed the MAX_CHAR_COUNT to 256 and now I am getting segmentation fault error
I am writing a C++ ( new to it )solution to find the occurrences of words in file words.txt in a huge file which cannot fit in memory with a suffix tree.
I am reading 4 lines at a time from stream (4 is just an example, i will benchmark the number of lines later after profiling ).
When i run the programme with small file , it works fine .But when I run the programme with actual file , i am facing error :
.....
building suffix tree
text is : Template Template talk Help Help talk Category
free end -432 , 71
free node
free end -1 , 71
*** Error in `./needleInHaystack': free(): invalid pointer: 0x0000000000606140 ***
======= Backtrace: =========
/lib64/libc.so.6(+0x81679)[0x7ff83d338679]
./needleInHaystack[0x4017c9]
./needleInHaystack[0x401775]
./needleInHaystack[0x401775]
./needleInHaystack[0x401775]
./needleInHaystack[0x401775]
./needleInHaystack[0x401775]
./needleInHaystack[0x401775]
./needleInHaystack[0x401d88]
/lib64/libc.so.6(__libc_start_main+0xf5)[0x7ff83d2d9505]
./needleInHaystack[0x4010d9]
======= Memory map: ========
00400000-00406000 r-xp 00000000 00:51 16651445 /usr/local/rms/needleInHaystack
00605000-00606000 r--p 00005000 00:51 16651445 /usr/local/rms/needleInHaystack
00606000-00607000 rw-p 00006000 00:51 16651445 /usr/local/rms/needleInHaystack
01f6d000-01faf000 rw-p 00000000 00:00 0 [heap]
7ff838000000-7ff838021000 rw-p 00000000 00:00 0
7ff838021000-7ff83c000000 ---p 00000000 00:00 0
7ff83d2b7000-7ff83d47a000 r-xp 00000000 08:01 3547654 /usr/lib64/libc-2.17.so
7ff83d47a000-7ff83d67a000 ---p 001c3000 08:01 3547654 /usr/lib64/libc-2.17.so
7ff83d67a000-7ff83d67e000 r--p 001c3000 08:01 3547654 /usr/lib64/libc-2.17.so
7ff83d67e000-7ff83d680000 rw-p 001c7000 08:01 3547654 /usr/lib64/libc-2.17.so
7ff83d680000-7ff83d685000 rw-p 00000000 00:00 0
7ff83d685000-7ff83d69a000 r-xp 00000000 08:01 3547686 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
7ff83d69a000-7ff83d899000 ---p 00015000 08:01 3547686 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
7ff83d899000-7ff83d89a000 r--p 00014000 08:01 3547686 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
7ff83d89a000-7ff83d89b000 rw-p 00015000 08:01 3547686 /usr/lib64/libgcc_s-4.8.5-20150702.so.1
7ff83d89b000-7ff83d99c000 r-xp 00000000 08:01 3547702 /usr/lib64/libm-2.17.so
7ff83d99c000-7ff83db9b000 ---p 00101000 08:01 3547702 /usr/lib64/libm-2.17.so
7ff83db9b000-7ff83db9c000 r--p 00100000 08:01 3547702 /usr/lib64/libm-2.17.so
7ff83db9c000-7ff83db9d000 rw-p 00101000 08:01 3547702 /usr/lib64/libm-2.17.so
7ff83db9d000-7ff83dc86000 r-xp 00000000 08:01 3547770 /usr/lib64/libstdc++.so.6.0.19
7ff83dc86000-7ff83de85000 ---p 000e9000 08:01 3547770 /usr/lib64/libstdc++.so.6.0.19
7ff83de85000-7ff83de8d000 r--p 000e8000 08:01 3547770 /usr/lib64/libstdc++.so.6.0.19
7ff83de8d000-7ff83de8f000 rw-p 000f0000 08:01 3547770 /usr/lib64/libstdc++.so.6.0.19
7ff83de8f000-7ff83dea4000 rw-p 00000000 00:00 0
7ff83dea4000-7ff83dec6000 r-xp 00000000 08:01 3547624 /usr/lib64/ld-2.17.so
7ff83e0b7000-7ff83e0bc000 rw-p 00000000 00:00 0
7ff83e0c2000-7ff83e0c5000 rw-p 00000000 00:00 0
7ff83e0c5000-7ff83e0c6000 r--p 00021000 08:01 3547624 /usr/lib64/ld-2.17.so
7ff83e0c6000-7ff83e0c7000 rw-p 00022000 08:01 3547624 /usr/lib64/ld-2.17.so
7ff83e0c7000-7ff83e0c8000 rw-p 00000000 00:00 0
7ffdfed2f000-7ffdfed50000 rw-p 00000000 00:00 0 [stack]
7ffdfedc9000-7ffdfedcb000 r--p 00000000 00:00 0 [vvar]
7ffdfedcb000-7ffdfedcd000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
Aborted
Following is my code and for now I am only trying the occurances of one word by hardcoding which is string : "make" :
#define MAX_CHAR 256
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include<cstring>
#include <unordered_map>
#include <fstream>
using namespace std;
struct SuffixTreeNode {
struct SuffixTreeNode *children[MAX_CHAR];
struct SuffixTreeNode *suffixLink;
int start;
int *end;
int suffixIndex;
};
typedef struct SuffixTreeNode Node;
string text;
Node *root = NULL; //Pointer to root node
Node *lastNewNode = NULL;
Node *activeNode = NULL;
int activeEdge = -1;
int activeLength = 0;
int remainingSuffixCount = 0;
int leafEnd = -1;
int *rootEnd = NULL;
int *splitEnd = NULL;
int size = -1; //Length of input string
Node *newNode(int start, int *end)
{
Node *node =(Node*) malloc(sizeof(Node));
int i;
for (i = 0; i < MAX_CHAR; i++)
node->children[i] = NULL;
node->suffixLink = root;
node->start = start;
node->end = end;
node->suffixIndex = -1;
return node;
}
int edgeLength(Node *n) {
if(n == root)
return 0;
return *(n->end) - (n->start) + 1;
}
int walkDown(Node *currNode)
{
if (activeLength >= edgeLength(currNode))
{
activeEdge += edgeLength(currNode);
activeLength -= edgeLength(currNode);
activeNode = currNode;
return 1;
}
return 0;
}
void extendSuffixTree(int pos)
{
leafEnd = pos;
remainingSuffixCount++;
lastNewNode = NULL;
while(remainingSuffixCount > 0) {
if (activeLength == 0)
activeEdge = pos;
if (activeNode->children[text[activeEdge]] == NULL)
{
activeNode->children[text[activeEdge]] =
newNode(pos, &leafEnd);
if (lastNewNode != NULL)
{
lastNewNode->suffixLink = activeNode;
lastNewNode = NULL;
}
}
else
{
Node *next = activeNode->children[text[activeEdge]];
if (walkDown(next)){
continue;
}
if (text[next->start + activeLength] == text[pos])
{
if(lastNewNode != NULL && activeNode != root)
{
lastNewNode->suffixLink = activeNode;
lastNewNode = NULL;
}
activeLength++;
break;
}
splitEnd = (int*) malloc(sizeof(int));
*splitEnd = next->start + activeLength - 1;
Node *split = newNode(next->start, splitEnd);
activeNode->children[text[activeEdge]] = split;
split->children[text[pos]] = newNode(pos, &leafEnd);
next->start += activeLength;
split->children[text[next->start]] = next;
if (lastNewNode != NULL)
{
lastNewNode->suffixLink = split;
}
lastNewNode = split;
}
remainingSuffixCount--;
if (activeNode == root && activeLength > 0)
{
activeLength--;
activeEdge = pos - remainingSuffixCount + 1;
}
else if (activeNode != root)
{
activeNode = activeNode->suffixLink;
}
}
}
void print(int i, int j)
{
int k;
for (k=i; k<=j; k++)
printf("%c", text[k]);
}
void setSuffixIndexByDFS(Node *n, int labelHeight)
{
if (n == NULL) return;
int leaf = 1;
int i;
for (i = 0; i < MAX_CHAR; i++)
{
if (n->children[i] != NULL)
{
leaf = 0;
setSuffixIndexByDFS(n->children[i], labelHeight +
edgeLength(n->children[i]));
}
}
if (leaf == 1)
{
n->suffixIndex = size - labelHeight;
}
}
void freeSuffixTreeByPostOrder(Node *n)
{
if (n == NULL)
return;
int i;
for (i = 0; i < MAX_CHAR; i++)
{
if (n->children[i] != NULL)
{
freeSuffixTreeByPostOrder(n->children[i]);
}
}
printf("free end %d , %d \n", n->suffixIndex, *(n->end));
if (n->suffixIndex == -1)
free(n->end);
printf("free node \n");
free(n);
}
void buildSuffixTree()
{
printf("building suffix tree \n");
size = text.length();
int i;
rootEnd = (int*) malloc(sizeof(int));
*rootEnd = - 1;
root = newNode(-1, rootEnd);
activeNode = root;
for (i=0; i<size; i++)
extendSuffixTree(i);
int labelHeight = 0;
setSuffixIndexByDFS(root, labelHeight);
}
int traverseEdge(const char *str, int idx, int start, int end)
{
int k = 0;
for(k=start; k<=end && str[idx] != '\0'; k++, idx++)
{
if(text[k] != str[idx])
return -1; // mo match
}
if(str[idx] == '\0')
return 1; // match
return 0; // more characters yet to match
}
int doTraversalToCountLeaf(Node *n)
{
if(n == NULL)
return 0;
if(n->suffixIndex > -1)
{
return 1;
}
int count = 0;
int i = 0;
for (i = 0; i < MAX_CHAR; i++)
{
if(n->children[i] != NULL)
{
count += doTraversalToCountLeaf(n->children[i]);
}
}
return count;
}
int countLeaf(Node *n)
{
if(n == NULL)
return 0;
return doTraversalToCountLeaf(n);
}
// returns count in the current buffer
int doTraversal(Node *n,const char* str, int idx)
{
if(n == NULL)
{
return -1; // no match
}
int res = -1;
if(n->start != -1)
{
res = traverseEdge(str, idx, n->start, *(n->end));
if(res == -1) //no match
return -1;
if(res == 1) //match
{
if(n->suffixIndex > -1)
return 1;
else
return countLeaf(n);
}
}
idx = idx + edgeLength(n);
if(n->children[str[idx]] != NULL)
return doTraversal(n->children[str[idx]], str, idx);
else
return -1; // no match
}
int checkForSubString(const char* str)
{
int res = doTraversal(root, str, 0);
if(res != 1)
return 0;
return res;
}
int main(int argc, char *argv[])
{
unordered_map<string, int> map;
// read file
ifstream words;
string allLines[1000];
words.open("words.txt");
int i =0;
for (string line; getline(words, line);)
{
allLines[i] = line;
map[line] = 0;
}
allLines[0] = "make";
map[allLines[0]] = 0;
ifstream article;
string artlines;
article.open ("article.txt");
i=0;
for (string line; getline(article, line);)
{
if(line.length() == 0 ){
continue;
}
if(i == 4){
i=0;
text = artlines;
buildSuffixTree();
printf("text is : %s\n" , text.c_str() );
map[allLines[0]]+= checkForSubString(allLines[0].c_str());
artlines = line;
freeSuffixTreeByPostOrder(root);
}else{
printf("counter I is %d\n",i);
artlines += line;
++i;
}
}
printf("out of loop \n");
if(i != 4 && i != 0){
text = artlines;
//buildSuffixTree();
//map[allLines[0]]+= checkForSubString(allLines[0].c_str());
freeSuffixTreeByPostOrder(root);
}
printf("make count : %d",map[allLines[0]]);
return 0;
}