Most common words in txt File

Question

I've made a program that outputs you the most common word in txt file. And does anybody know how to optimize it that it would work for bigger files and faster.

The output image: https://i.stack.imgur.com/fVBh0.png

Here is the code.

#include <iostream>
#include <string>
#include <fstream>
#include <cstdlib>
#include <vector>
#include <algorithm>
#include <math.h>

using namespace std;

int main()
{
    ifstream in("file.txt");

    if(!in){
        cerr << "Could not open file.txt.";
        return EXIT_FAILURE;
    }

    string str, str2, strn, tab[10000], tab2[10000];
    int i, k, j, n, l, tabl;
    char c = 179;
    vector<int> tabs;
    vector<string> stringi;

    while(getline(in, str2)){
        str += str2;
        str += ' ';
    }
    k = 0;
    for(i = 0; i < str.length(); i++){
        if(str[i] != ' ' && str[i] != '.' && str[i] != '\t' && str[i] != ','
           && str[i] != ';' && str[i] != ':' && str[i] != '}' && str[i] != '{'){
            tab[k] += tolower(str[i]);
        }else{
            k++;
        }
        if(str[i] == '.' || str[i] == '\t' || str[i] == ',' || str[i] == ';'
        || str[i] == ':' || str[i] == '}' || str[i] == '{') {
            k--;
        }
    }
    tabl = k;

    k = 0;
    for(i = 0; i < tabl; i++){
        for(j = 0; j < tabl; j++){
            if(tab[i] == tab[j]){
                k++;
            }
        }
        tabs.push_back(k);
        k = 0;
    }
    for(i = 0; i < tabl; i++){
        for(j = 0; j < tabl-1; j++){
            if(tab[j] < tab[j+1]){
                n = tabs.at(j);
                tabs.at(j) = tabs.at(j+1);
                tabs.at(j+1) = n;
                strn = tab[j];
                tab[j] = tab[j+1];
                tab[j+1] = strn;
            }
        }
    }
    for(i = 0; i < tabl; i++){
        for(j = 0; j < tabl-1; j++){
            if(tabs.at(j) < tabs.at(j+1)){
                n = tabs.at(j);
                tabs.at(j) = tabs.at(j+1);
                tabs.at(j+1) = n;
                strn = tab[j];
                tab[j] = tab[j+1];
                tab[j+1] = strn;
            }
        }
    }
    tab2[0] = tab[0];
    for(i = 0; i < tabl; i++){
        if(tab[i] != tab[i+1]){
            tab2[i] = tab[i+1];
        }
    }
    k = 1;
    l++;
    for(i = 0; i < tabl; i++){
        if(!tab2[i].empty()){
            l++;
        }
    }
    cout << "------------------------------------" << endl;
    cout << "|--->TABLE OF MOST COMMON WORDS<---|" << endl;
    cout << "------------------------------------" << endl;
    for(i = 0; i < tabl; i++){
        if(!tab2[i].empty() && k <= 20 ){
            cout << c << k++ << "." << '\t' << c << tab2[i] << '\t' << c << "*" <<
            tabs.at(i+1)
            << '\t'  << c << roundf(((float)tabs.at(i+1)*100/l)*100)/100 << "%" <<
            endl;
        }
    }
    cout << "------------------------------------" << endl ;
    cout << "|----->Dif. strings: " << '\t' << l << "<-------|" << endl ;
    cout << "------------------------------------" << endl;

    return 0;
}

This question appears to be off-topic because it is about code optimization, and better asked at [Code Review Stack Exchange](http://codereview.stackexchange.com/). — Jongware, Oct 12 '14 at 23:07
Please do not insert plain text output as an image. It is clearer if you copy it into your post. — Jongware, Oct 12 '14 at 23:08
Please make sure your code at least compiles before posting: `tab.at(i+1)` — user657267, Oct 12 '14 at 23:55

dgnuff · Accepted Answer · 2014-10-13T00:43:38.550

Since this doesn't attempt to read the entire file in one piece, the only upper limit on file size is how long you're willing to wait for your output.

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <map>
#include <algorithm>

char convert(char arg)
{
    if (arg == '.' || arg == '\t' || arg == ',' || arg == ';'
            || arg == ':' || arg == '}' || arg == '{')
    {
        return ' ';     // convert delimiters to spaces
    }
    return(tolower(arg));  // convert everything else to lower case
}

int main()
{
    std::ifstream in("c:\\etc\\foo.txt");

    // This can be replaced with std::unordered_map if you are willing to sacrifice
    // lexical sorting of the output for speed
    std::map<std::string, int> counts;

    while (!in.eof())
    {
        std::string str;
        // Get the next space delimited word from the file
        in >> str;

        if (str == "")
        {
            // Reject empty strings, which can occur as we read the file
            continue;
        }

        // Convert to lower case, and convert our delimiter set to spaces
        std::transform(str.begin(), str.end(), str.begin(), convert);
        std::stringstream in1(str);

        while (!in1.eof())
        {
            std::string word;
            in1 >> word;
            if (word == "")
            {
                // reject empty words, which are also possible at this point
                continue;
            }
            // Use the map to count occurrences of the word
            auto it = counts.find(word);
            if (it == counts.end())
            {
                counts[word] = 1;
            }
            else
            {
                it->second++;
            }
        }
    }

    // Output the results here

    return 0;
}

Most common words in txt File

1 Answers1