Problem Description:
References: Fun With Strings
Based on the problem description, a naive approach to find sum of length of LCP for all possible substrings (for a given string) is as follows :
#include <cstring>
#include <iostream>
using std::cout;
using std::cin;
using std::endl;
using std::string;
int lcp(string str1, string str2)
{
string result;
int n1 = str1.length(), n2 = str2.length();
// Compare str1 and str2
for (int i=0, j=0; i<=n1-1 && j<=n2-1; i++,j++)
{
if (str1[i] != str2[j])
break;
result.push_back(str1[i]);
}
return (result.length());
}
int main()
{
string s;
cin>>s;
int sum = 0;
for(int i = 0; i < s.length(); i++)
for(int j = i; j < s.length(); j++)
for(int k = 0; k < s.length(); k++)
for(int l = k; l < s.length(); l++)
sum += lcp(s.substr(i,j - i + 1),s.substr(k,l - k + 1));
cout<<sum<<endl;
return 0;
}
Based on further reading and research on LCP's, I found this document which specifies a way to efficiently find a LCP using an Advanced Data Structure called Tries. I implemented a Trie and a Compressed Trie (Suffix Tree) as follows:
#include <iostream>
#include <cstring>
using std::cout;
using std::cin;
using std::endl;
using std::string;
const int ALPHA_SIZE = 26;
struct TrieNode
{
struct TrieNode *children[ALPHA_SIZE];
string label;
bool isEndOfWord;
};
typedef struct TrieNode Trie;
Trie *getNode(void)
{
Trie *parent = new Trie;
parent->isEndOfWord = false;
parent->label = "";
for(int i = 0; i <ALPHA_SIZE; i++)
parent->children[i] = NULL;
return parent;
}
void insert(Trie *root, string key)
{
Trie *temp = root;
for(int i = 0; i < key.length(); i++)
{
int index = key[i] - 'a';
if(!temp->children[index])
{
temp->children[index] = getNode();
temp->children[index]->label = key[i];
}
temp = temp->children[index];
temp->isEndOfWord = false;
}
temp->isEndOfWord = true;
}
int countChildren(Trie *node, int *index)
{
int count = 0;
for(int i = 0; i < ALPHA_SIZE; i++)
{
if(node->children[i] != NULL)
{
count++;
*index = i;
}
}
return count;
}
void display(Trie *root)
{
Trie *temp = root;
for(int i = 0; i < ALPHA_SIZE; i++)
{
if(temp->children[i] != NULL)
{
cout<<temp->label<<"->"<<temp->children[i]->label<<endl;
if(!temp->isEndOfWord)
display(temp->children[i]);
}
}
}
void compress(Trie *root)
{
Trie *temp = root;
int index = 0;
for(int i = 0; i < ALPHA_SIZE; i++)
{
if(temp->children[i])
{
Trie *child = temp->children[i];
if(!child->isEndOfWord)
{
if(countChildren(child,&index) >= 2)
{
compress(child);
}
else if(countChildren(child,&index) == 1)
{
while(countChildren(child,&index) < 2 and countChildren(child,&index) > 0)
{
Trie *sub_child = child->children[index];
child->label = child->label + sub_child->label;
child->isEndOfWord = sub_child->isEndOfWord;
memcpy(child->children,sub_child->children,sizeof(sub_child->children));
delete(sub_child);
}
compress(child);
}
}
}
}
}
bool search(Trie *root, string key)
{
Trie *temp = root;
for(int i = 0; i < key.length(); i++)
{
int index = key[i] - 'a';
if(!temp->children[index])
return false;
temp = temp->children[index];
}
return (temp != NULL && temp->isEndOfWord);
}
int main()
{
string input;
cin>>input;
Trie *root = getNode();
for(int i = 0; i < input.length(); i++)
for(int j = i; j < input.length(); j++)
{
cout<<"Substring : "<<input.substr(i,j - i + 1)<<endl;
insert(root, input.substr(i,j - i + 1));
}
cout<<"DISPLAY"<<endl;
display(root);
compress(root);
cout<<"AFTER COMPRESSION"<<endl;
display(root);
return 0;
}
My question, is how do I proceed to find the length of the LCP. I can get the LCP by getting the label field at the branching node, but how do I count the length of LCP's for all possible substrings ?
One way I thought of was to some how use the branching node, its label field which holds the LCP, and the branching node's children to find sum of all LCP's length (Lowest Common Ancestor ?). But I am still confused. How do I proceed further ?
Note: It is also possible that my approach to this problem is wrong, so please suggest other methods too for this problem (considering time and space complexity).
Link to similar unanswered questions:
- sum of LCP of all pairs of substrings of a given string
- Longest common prefix length of all substrings and a string
References for Code and Theory:
Update1:
Based on @Adarsh Anurag's answer, I have come up with following implementation with the help of trie data structure,
#include <iostream>
#include <cstring>
#include <stack>
using std::cout;
using std::cin;
using std::endl;
using std::string;
using std::stack;
const int ALPHA_SIZE = 26;
int sum = 0;
stack <int> lcp;
struct TrieNode
{
struct TrieNode *children[ALPHA_SIZE];
string label;
int count;
};
typedef struct TrieNode Trie;
Trie *getNode(void)
{
Trie *parent = new Trie;
parent->count = 0;
parent->label = "";
for(int i = 0; i <ALPHA_SIZE; i++)
parent->children[i] = NULL;
return parent;
}
void insert(Trie *root, string key)
{
Trie *temp = root;
for(int i = 0; i < key.length(); i++)
{
int index = key[i] - 'a';
if(!temp->children[index])
{
temp->children[index] = getNode();
temp->children[index]->label = key[i];
}
temp = temp->children[index];
}
temp->count++;
}
int countChildren(Trie *node, int *index)
{
int count = 0;
for(int i = 0; i < ALPHA_SIZE; i++)
{
if(node->children[i] != NULL)
{
count++;
*index = i;
}
}
return count;
}
void display(Trie *root)
{
Trie *temp = root;
int index = 0;
for(int i = 0; i < ALPHA_SIZE; i++)
{
if(temp->children[i] != NULL)
{
cout<<temp->label<<"->"<<temp->children[i]->label<<endl;
cout<<"CountOfChildren:"<<countChildren(temp,&index)<<endl;
cout<<"Counter:"<<temp->children[i]->count<<endl;
display(temp->children[i]);
}
}
}
void lcp_sum(Trie *root,int counter,string lcp_label)
{
Trie *temp = root;
int index = 0;
for(int i = 0; i < ALPHA_SIZE; i++)
{
if(temp->children[i])
{
Trie *child = temp->children[i];
if(lcp.empty())
{
lcp_label = child->label;
counter = 0;
lcp.push(child->count*lcp_label.length());
sum += lcp.top();
counter += 1;
}
else
{
lcp_label = lcp_label + child->label;
stack <int> temp = lcp;
while(!temp.empty())
{
sum = sum + 2 * temp.top() * child->count;
temp.pop();
}
lcp.push(child->count*lcp_label.length());
sum += lcp.top();
counter += 1;
}
if(countChildren(child,&index) > 1)
{
lcp_sum(child,0,lcp_label);
}
else if (countChildren(child,&index) == 1)
lcp_sum(child,counter,lcp_label);
else
{
while(counter-- && !lcp.empty())
lcp.pop();
}
}
}
}
int main()
{
string input;
cin>>input;
Trie *root = getNode();
for(int i = 0; i < input.length(); i++)
for(int j = i; j < input.length(); j++)
{
cout<<"Substring : "<<input.substr(i,j - i + 1)<<endl;
insert(root, input.substr(i,j - i + 1));
display(root);
}
cout<<"DISPLAY"<<endl;
display(root);
cout<<"COUNT"<<endl;
lcp_sum(root,0,"");
cout<<sum<<endl;
return 0;
}
From the Trie structure, I have removed the variable isEndOfWord
and instead replaced it with a counter
. This variable keeps track of duplicate substrings which should help in counting LCP's for strings with duplicate characters. However, the above implementation works only for strings with distinct characters. I have tried implementing the method suggested by @Adarsh for duplicate characters but does not satisfy any test case.
Update2:
Based on further updated answer from @Adarsh and "trial and error" with different testcases, I seem to have progressed a bit for duplicate characters, however it still does not work as expected. Here is the implementation with comments,
// LCP : Longest Common Prefix
// DFS : Depth First Search
#include <iostream>
#include <cstring>
#include <stack>
#include <queue>
using std::cout;
using std::cin;
using std::endl;
using std::string;
using std::stack;
using std::queue;
const int ALPHA_SIZE = 26;
int sum = 0; // Global variable for LCP sum
stack <int> lcp; //Keeps track of current LCP
// Trie Data Structure Implementation (See References Section)
struct TrieNode
{
struct TrieNode *children[ALPHA_SIZE]; // Search space can be further reduced by keeping track of required indicies
string label;
int count; // Keeps track of repeat substrings
};
typedef struct TrieNode Trie;
Trie *getNode(void)
{
Trie *parent = new Trie;
parent->count = 0;
parent->label = ""; // Root Label at level 0 is an empty string
for(int i = 0; i <ALPHA_SIZE; i++)
parent->children[i] = NULL;
return parent;
}
void insert(Trie *root, string key)
{
Trie *temp = root;
for(int i = 0; i < key.length(); i++)
{
int index = key[i] - 'a'; // Lowercase alphabets only
if(!temp->children[index])
{
temp->children[index] = getNode();
temp->children[index]->label = key[i]; // Label represents the character being inserted into the node
}
temp = temp->children[index];
}
temp->count++;
}
// Returns the count of child nodes for a given node
int countChildren(Trie *node, int *index)
{
int count = 0;
for(int i = 0; i < ALPHA_SIZE; i++)
{
if(node->children[i] != NULL)
{
count++;
*index = i; //Not required for this problem, used in compressed trie implementation
}
}
return count;
}
// Displays the Trie in DFS manner
void display(Trie *root)
{
Trie *temp = root;
int index = 0;
for(int i = 0; i < ALPHA_SIZE; i++)
{
if(temp->children[i] != NULL)
{
cout<<temp->label<<"->"<<temp->children[i]->label<<endl; // Display in this format : Root->Child
cout<<"CountOfChildren:"<<countChildren(temp,&index)<<endl; // Count of Child nodes for Root
cout<<"Counter:"<<temp->children[i]->count<<endl; // Count of repeat substrings for a given node
display(temp->children[i]);
}
}
}
/* COMPRESSED TRIE IMPLEMENTATION
void compress(Trie *root)
{
Trie *temp = root;
int index = 0;
for(int i = 0; i < ALPHA_SIZE; i++)
{
if(temp->children[i])
{
Trie *child = temp->children[i];
//if(!child->isEndOfWord)
{
if(countChildren(child,&index) >= 2)
{
compress(child);
}
else if(countChildren(child,&index) == 1)
{
while(countChildren(child,&index) < 2 and countChildren(child,&index) > 0)
{
Trie *sub_child = child->children[index];
child->label = child->label + sub_child->label;
//child->isEndOfWord = sub_child->isEndOfWord;
memcpy(child->children,sub_child->children,sizeof(sub_child->children));
delete(sub_child);
}
compress(child);
}
}
}
}
}
*/
// Calculate LCP Sum recursively
void lcp_sum(Trie *root,int *counter,string lcp_label,queue <int> *s_count)
{
Trie *temp = root;
int index = 0;
// Traverse through this root's children array, to find child nodes
for(int i = 0; i < ALPHA_SIZE; i++)
{
// If child nodes found, then ...
if(temp->children[i] != NULL)
{
Trie *child = temp->children[i];
// Check if LCP stack is empty
if(lcp.empty())
{
lcp_label = child->label; // Set LCP label as Child's label
*counter = 0; // To make sure counter is not -1 during recursion
/*
* To include LCP of repeat substrings, multiply the count variable with current LCP Label's length
* Push this to a stack called lcp
*/
lcp.push(child->count*lcp_label.length());
// Add LCP for (a,a)
sum += lcp.top() * child->count; // Formula to calculate sum for repeat substrings : (child->count) ^ 2 * LCP Label's Length
*counter += 1; // Increment counter, this is used further to pop elements from the stack lcp, when a branching node is encountered
}
else
{
lcp_label = lcp_label + child->label; // If not empty, then add Child's label to LCP label
stack <int> temp = lcp; // Temporary Stack
/*
To calculate LCP for different combinations of substrings,
2 -> accounts for (a,b) and (b,a)
temp->top() -> For previous substrings and their combinations with the current substring
child->count() -> For any repeat substrings for current node/substring
*/
while(!temp.empty())
{
sum = sum + 2 * temp.top() * child->count;
temp.pop();
}
// Similar to above explanation for if block
lcp.push(child->count*lcp_label.length());
sum += lcp.top() * child->count;
*counter += 1;
}
// If a branching node is encountered
if(countChildren(child,&index) > 1)
{
int lc = 0; // dummy variable
queue <int> ss_count; // queue to keep track of substrings (counter) from the child node of the branching node
lcp_sum(child,&lc,lcp_label,&ss_count); // Recursively calculate LCP for child node
// This part is experimental, does not work for all testcases
// Used to calculate the LCP count for substrings between child nodes of the branching node
if(countChildren(child,&index) == 2)
{
int counter_queue = ss_count.front();
ss_count.pop();
while(counter_queue--)
{
sum = sum + 2 * ss_count.front() * lcp_label.length();
ss_count.pop();
}
}
else
{
// Unclear, what happens if children is > 3
// Should one take combination of each child node with one another ?
while(!ss_count.empty())
{
sum = sum + 2 * ss_count.front() * lcp_label.length();
ss_count.pop();
}
}
lcp_label = temp->label; // Set LCP label back to Root's Label
// Empty the stack till counter is 0, so as to restore it's state when it first entered the child node from the branching node
while(*counter)
{
lcp.pop();
*counter -=1;
}
continue; // Continue to next child of the branching node
}
else if (countChildren(child,&index) == 1)
{
// If count of children is 1, then recursively calculate LCP for further child node
lcp_sum(child,counter,lcp_label,s_count);
}
else
{
// If count of child nodes is 0, then push the counter to the queue for that node
s_count->push(*counter);
// Empty the stack till counter is 0, so as to restore it's state when it first entered the child node from the branching node
while(*counter)
{
lcp.pop();
*counter -=1;
}
lcp_label = temp->label; // Set LCP label back to Root's Label
}
}
}
}
/* SEARCHING A TRIE
bool search(Trie *root, string key)
{
Trie *temp = root;
for(int i = 0; i < key.length(); i++)
{
int index = key[i] - 'a';
if(!temp->children[index])
return false;
temp = temp->children[index];
}
return (temp != NULL );//&& temp->isEndOfWord);
}
*/
int main()
{
int t;
cin>>t; // Number of testcases
while(t--)
{
string input;
int len;
cin>>len>>input; // Get input length and input string
Trie *root = getNode();
for(int i = 0; i < len; i++)
for(int j = i; j < len; j++)
insert(root, input.substr(i,j - i + 1)); // Insert all possible substrings into Trie for the given input
/*
cout<<"DISPLAY"<<endl;
display(root);
*/
//LCP COUNT
int counter = 0; //dummy variable
queue <int> q; //dummy variable
lcp_sum(root,&counter,"",&q);
cout<<sum<<endl;
sum = 0;
/*
compress(root);
cout<<"AFTER COMPRESSION"<<endl;
display(root);
*/
}
return 0;
}
Also, here are some sample test cases (expected outputs),
1. Input : 2 2 ab 3 zzz
Output : 6 46
2. Input : 3 1 a 5 afhce 8 ahsfeaa
Output : 1 105 592
3. Input : 2 15 aabbcceeddeeffa 3 bab
Output : 7100 26
The above implementation fails for testcase 2 and 3 (partial output). Please suggest a way to solve this. Any other approach to this problem is also fine.