1

I am writing a program that scans a hard drive for files based on the extensions provided. The command line I am currently testing is "-c+r D://", which looks for C and C++ files recursively, starting at the root of the D: drive. However, it is very slow, and I'm not sure what I could do to speed it up.

#include <cstdlib>
#include <filesystem>
#include <iostream>
#include <map>
#include <string>
#include <system_error>
#include <vector>

using namespace std;
using namespace std::experimental::filesystem;


// File-scope option flags, switched on by check_conditions().
// Language families to collect:
bool cFiles = false;
bool cPlusFiles = false;
bool javaFiles = false;
bool cSharpFiles = false;
bool webProgrammingFiles = false;

// Behaviour switches:
bool bool_summary = false;
bool bool_recursive = false;
bool bool_reverse = false;
bool bool_sort = false;
bool bool_verbose = false;

// Set when the usage text is requested.
bool show_help = false;

// Regular-expression text; starts out empty.
// NOTE(review): presumably meant for the 'x' switch, which is not parsed yet - confirm.
string regex;

// Key: file extension (with its leading dot). Value: size in bytes of every
// matching file found so far.
map<string, vector<long long>> extensionMap;

// Empty size list that is copied in as the initial value of each map entry.
vector<long long> fileSizes;

// Translates a switch string such as "-c+r" into the file-scope option flags.
//
// The character at index 0 is never examined (in the visible caller it is the
// leading '-'); every later character that names a known switch turns the
// matching flag on. Unknown characters are ignored, and flags are never
// turned off here.
void check_conditions(string conditions) {
    struct Option {
        char letter;  // switch character as typed by the user
        bool* flag;   // file-scope flag that the character enables
    };

    // 'x' is accepted but sets nothing yet, so - like any unknown letter -
    // it has no row in this table.
    Option const options[] = {
        {'c', &cFiles},
        {'+', &cPlusFiles},
        {'j', &javaFiles},
        {'#', &cSharpFiles},
        {'w', &webProgrammingFiles},
        {'s', &bool_summary},
        {'r', &bool_recursive},
        {'R', &bool_reverse},
        {'S', &bool_sort},
        {'v', &bool_verbose},
        {'h', &show_help},
    };

    for (string::size_type pos = 1; pos < conditions.size(); ++pos) {
        char const letter = conditions[pos];
        for (Option const& option : options) {
            if (option.letter == letter) {
                *option.flag = true;
                break;
            }
        }
    }
}

// Scan a single folder (no recursion).
//
// Prints one line per entry of 'f': the path, a " [dir]" marker when the
// entry is a folder, and the entry's extension. For every non-folder entry
// whose extension is a key of extensionMap, the file size in bytes is
// appended to that key's vector.
//
// Throws filesystem_error if 'f' cannot be opened for iteration. An entry
// whose size cannot be read is listed but not recorded.
void scan(path const& f) {
    for (directory_iterator d(f), e; d != e; ++d) {
        // Ask the entry for its status once and reuse the answer below.
        bool const isDir = is_directory(d->status());
        path const ext = d->path().extension();

        // '\n' rather than endl: flushing the stream for every entry is
        // needless work on large folders.
        cout << d->path() << (isDir ? " [dir]" : "") << " ext=" << ext << '\n';

        if (isDir)
            continue;  // a folder named like "backup.c" has no file size to record

        // One lookup; the iterator is reused for the insertion.
        auto const tracked = extensionMap.find(ext.string());
        if (tracked == extensionMap.end())
            continue;  // extension was not requested

        // Non-throwing overload: one unreadable file must not abort the scan.
        std::error_code sizeError;
        auto const bytes = file_size(d->path(), sizeError);
        if (!sizeError)
            tracked->second.push_back(static_cast<long long>(bytes));
    }
}

// Scan folder 'f' and all of its sub folders.
//
// For every non-folder entry whose extension is a key of extensionMap, the
// file size in bytes is appended to that key's vector.
//
// Throws filesystem_error if 'f' itself cannot be opened. Errors met later
// are handled without throwing: a file whose size cannot be read is skipped,
// and a failure to advance the iterator ends the scan with a message on cerr,
// keeping everything collected so far.
void rscan(path const& f) {
    cout << "\n";

    recursive_directory_iterator d(f);
    recursive_directory_iterator const e;  // default-constructed == end marker
    while (d != e) {
        string const extension = d->path().extension().string();

        // Debug trace kept from the original; '\n' avoids a flush per hit.
        if (extension == ".cpp")
            cout << "CPP!!!!" << '\n';

        // One lookup; the iterator is reused for the insertion.
        auto const tracked = extensionMap.find(extension);

        // Use the entry's own status(), as scan() does, instead of querying
        // the path a second time.
        if (tracked != extensionMap.end() && !is_directory(d->status())) {
            std::error_code sizeError;
            auto const bytes = file_size(d->path(), sizeError);
            if (!sizeError)
                tracked->second.push_back(static_cast<long long>(bytes));
        }

        // Non-throwing advance: an unreadable sub folder must not terminate
        // the program and throw away the results gathered so far.
        std::error_code stepError;
        d.increment(stepError);
        if (stepError) {
            cerr << "scan stopped early: " << stepError.message() << '\n';
            break;
        }
    }
}


int main(int argc, char* argv[])
{
    cout.imbue(locale(""));
    //if (argc )
    string conditions = "-c+r";
    string directory = "";
    path finalDirectory;
    cout << canonical(finalDirectory);

    cout << endl;
    //string directory = argv[2];

    cout << "fileusage (c) 2016-7, Hur Romanchik" << endl;

    cout << "argc is " << argc << endl;

    for (int i = 0; i < argc; ++i)
    {
        cout << argv[i] << endl;
    }

    if (argc > 1) {

        if (conditions.at(0) == '-' && argc > 2)
        {
            check_conditions(conditions);
            directory = argv[2];
        }
        else
        {
            directory = argv[1];
        }


        cout << "Directory is " << directory << endl;


        if (show_help == true) //show help and exit if user asks for help
        {
            cout << "Usage: fileusage [-hrRsSvc+#jw(x regularexpression)] [folder]" << endl;
            return 0; //exit
        }

        if (cFiles == true)
        {
            cout << "C files: " << directory << endl;
            extensionMap.insert(pair<string, vector<long long>>(".c", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".h", fileSizes));
        }
        if (cPlusFiles == true)
        {
            cout << "C++ files: " << directory << endl;
            extensionMap.insert(pair<string, vector<long long>>(".cc", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".cp", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".cpp", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".cxx", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".c++", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".hpp", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".hxx", fileSizes));
        }
        if (cSharpFiles == true)
        {
            cout << "C# files: " << directory << endl;
            extensionMap.insert(pair<string, vector<long long>>(".cs", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".vb", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".jsl", fileSizes));
        }
        if (javaFiles == true)
        {
            cout << "Java files: " << directory << endl;
            extensionMap.insert(pair<string, vector<long long>>(".class", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".jar", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".java", fileSizes));
        }
        if (webProgrammingFiles == true)
        {
            cout << "Web Programming Files";
            extensionMap.insert(pair<string, vector<long long>>(".htm", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".html", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".html5", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".js", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".jse", fileSizes));
            extensionMap.insert(pair<string, vector<long long>>(".jsc", fileSizes));
        }
        cout << "Creatiiong cpmp" << endl;
    }
    else
    {
        finalDirectory = current_path();//sets current path as search
    }
    if (directory.empty() == false)
        finalDirectory = directory;

    if (bool_recursive == true)
        rscan(finalDirectory);
    else
        scan(finalDirectory);

    cout << "scan complete" << endl;

    for (map<string, vector<long long>>::const_iterator it = extensionMap.begin(); it != extensionMap.end(); ++it)
    {
        //cout << it->first << endl;
        //cout << extensionMap[it->first].size();
        cout << "# of " << it->first << " files is " << extensionMap[it->first].size();
        for (vector<long long>::size_type i = 0; i < extensionMap[it->first].size(); ++i)
        {
            cout << it->first << " file is " << extensionMap[it->first].at(i) << " bytes" << endl;

        }
    }


    system("pause");
}
  • 1
    one normally addresses performance issues by figuring out where the bottlenecks are. – jdigital Apr 14 '17 at 05:13
  • And the bottleneck is probably disk I/O - so, look at filesystem drivers + options – sehe Apr 14 '17 at 09:30
  • @Hur how does it compare to the performance of a more simple take like https://stackoverflow.com/questions/7377733/directory-recursion-and-symlinks/7377992#7377992 – sehe Apr 14 '17 at 10:10
  • I probably should've provided more info. This is for a college project. The prof has given us his final exe, and when using his, it's much faster. I'm relatively new to maps and was wondering if perhaps I am accessing them or adding to them inefficiently. – Hur Romanchik Apr 14 '17 at 20:03
  • When your program is running, is it using 100% (of a single core of the CPU), or is it using very little CPU? If the former, then your usage of maps and strings is probably slowing things down; if the latter - which is more likely - it is I/O bound. Does the is_directory() call actually make a system call? If so, this will slow things greatly - doing an I/O for each file. Using the Windows API FindFirstFile/FindNextFile would be much more efficient. (Unlike Linux, the Windows dir scan functions return the equivalent info to the stat() function - use it or lose performance.) – joeking May 16 '17 at 21:00

0 Answers0