I have a big text file (2GB) that contains couple of books. I want to create a (**char)
that contains each word of the whole text file. But firstly i pass all the text file data in a HUGE string
, THEN making the **char
variable
the problem is that it takes TOO long(hours) for the getline()
loop to end.I ran it for 30 mins and the program read 500.000 lines. The whole file is 43.000.000 lines
int main (){
ifstream book;
string sbook,str;
book.open("gutenberg.txt"); // the huge file
cout<<"Reading the file ....."<<endl;
while(!book.eof()){
getline(book,sbook);//passing the line as a string to sbook
if(str.empty()){
str= sbook;
}
else
str= str + " " + sbook;//apend sbook to another string until the file closes
}//I never managed to get out of this loop
cout<<"Done reading the file."<<endl;
cout<<"Removal....."<<endl;
removal(str);//removes all puncuations and makes each upperccase letter to a lowercase
cout<<"done removal"<<endl;
cout<<"Removing doublewhitespaces...."<<endl;
int whitespaces=removedoublewhitespace(str);//removes excess whitespaces leaving only one whitespace within each word
//and returns the number of all the whitespaces
cout<<"doublewhitespaces removed."<<endl;
cout<<"initiating leksis....."<<endl;
char **leksis=new char*[whitespaces+1];//whitespase+1 is how many words are left in the file
for(int i=0;i<whitespaces+1;i++){
leksis[i]= new char[30];
}
cout<<"done initiating leksis."<<endl;
int y=0,j=0;
cout<<"constructing leksis,finding plithos...."<<endl;
for(int i=0;i<str.length();i++){
if(isspace(str[i])){;
y++;
j=0;
leksis[y][j]=' ';
j++;
}
else{
leksis[y][j]=str[i];
j++;
}
}
cout<<"Done constructing leksis,finding plithos...."<<endl;
removal()
function
void removal(string &s) {
for (int i = 0, len = s.size(); i < len; i++)
{
char c=s[i];
if(isupper(s[i])){
s[i]=tolower(s[i]);
}
int flag=ispunct(s[i]);
if (flag){
s.erase(i--, 1);
len = s.size();
}
}
}
removedoublewhitespace()
function :
int removedoublewhitespace(string &str){
int wcnt=0;
for(int i=str.size()-1; i >= 0; i-- )
{
if(str[i]==' '&&str[i]==str[i-1]) //added equal sign
{
str.erase( str.begin() + i );
}
}
for(int i=0;i<str.size();i++){
if(isspace(str[i])){
wcnt++;
}
}
return wcnt;
}