
I have a large data set that I want to process (120 million records). My program currently uses Google dense hash, but it still takes 29 hours to finish and uses 8.5 GiB of the 64 GiB of RAM on my server.

Do you have any suggestions? I'm new to C++. If I want to replace the vector with something faster, what would that be?

#include <string>
#include <algorithm>
#include <tr1/unordered_map>
#include <iterator>
#include <sstream>
#include <cstring>
#include <iomanip>
#include <fstream>
#include <vector>
#include <iterator>
#include <time.h>
#include <iostream>
#include <iostream>
#include <sparsehash/dense_hash_map>
#include <stdio.h>
#include <string.h>
using google::dense_hash_map;  
using std::tr1::hash; 
using namespace std;
using std::string;

bool ProcessInput(const string& inChar, vector<string> *invector);
void Processmax( dense_hash_map < string, int>* ins, vector<int> *inc, vector<string> *outs, vector<int> *outc);

int main()
{
time_t start, stop;
time(&start);
ofstream finall;
vector<int> usrsc,artc,tmusrc,tmart2c,atrsc,tmartc; // usrsc: distinct articles per user; artc: distinct users per article; atrsc: total reviews per article; tm* hold top-N results
vector<string> tmart,tmusr,tmart2;                  // top-N names
vector< vector<string> > usrlist,artlist;           // usrlist[i]: articles seen for user i; artlist[i]: users seen for article i
string x1,x2;                                       // one (article, user) pair per input line
ifstream ifTraceFile;
bool f,f2;
dense_hash_map < string, int > a;                   // article -> row index in artlist
dense_hash_map < string, int > u;                   // user -> row index in usrlist
a.set_empty_key(string());
u.set_empty_key(string());

int kl=0;
ifTraceFile.open ("data2.tr", std::ifstream::in);
while (ifTraceFile.good ())
{
    ifTraceFile>>x1>> x2;


    if (kl==0)
    {
        a.insert(make_pair(x1,0));
        u.insert(make_pair(x2,0));
        usrlist.push_back((vector<string>()));
        usrlist[0].push_back(x1);
        artlist.push_back((vector<string>()));
        artlist[0].push_back(x2);
        usrsc.push_back(1);
        artc.push_back(1);
        atrsc.push_back(1);

    }
    else
    {

        dense_hash_map < string, int>::iterator itn;
        itn=a.find(x1);
        if (itn == a.end())
        {
            a.insert(make_pair(x1,(artlist.size())));
            artlist.push_back((vector<string>()));
            artlist[(artlist.size()-1)].push_back(x2);
            artc.push_back(1);
            atrsc.push_back(1);
        }
        else
        {
            f=ProcessInput(x2, &artlist[itn->second]);
            if(f)
            {
                artlist[itn->second].push_back(x2);
                atrsc[itn->second]+=1;
                artc[itn->second]+=1;
            }
            else
                atrsc[itn->second]+=1;

        }


         dense_hash_map < string, int>::iterator its;
        its=u.find(x2);
        if (its == u.end())
        {
            u.insert(make_pair(x2,(usrlist.size())));
            usrlist.push_back((vector<string>()));
            usrlist[(usrlist.size()-1)].push_back(x1);
            usrsc.push_back(1);

        }
        else
        {
            f2=ProcessInput(x1, &usrlist[its->second]);

            if(f2)
            {
                usrlist[its->second].push_back(x1);
                usrsc[its->second]+=1;

            }

        }

    }

    kl++;
}
ifTraceFile.close();
Processmax(&a, &artc, &tmart, &tmartc);
Processmax(&a, &atrsc, &tmart2 ,&tmart2c);
Processmax(&u, &usrsc ,&tmusr, &tmusrc);
int width=15;
cout <<"article has Max. review by users Top 1: "<<tmart.at(0)<<'\t'<<tmartc.at(0)<<endl;
cout <<"article has Max. review by users Top 2: "<<tmart.at(1)<<'\t'<<tmartc.at(1)<<endl;
cout <<"article has Max. review by users Top 3: "<<tmart.at(2)<<'\t'<<tmartc.at(2)<<endl;
cout <<endl;
cout <<"article has Max. review Top 1: "<<tmart2.at(0)<<'\t'<<tmart2c.at(0)<<endl;
cout <<"article has Max. review Top 2: "<<tmart2.at(1)<<'\t'<<tmart2c.at(1)<<endl;
cout <<"article has Max. review Top 3: "<<tmart2.at(2)<<'\t'<<tmart2c.at(2)<<endl;
cout <<endl;
cout <<"user who edited most articles Top 1: "<<tmusr.at(0)<<'\t'<<tmusrc.at(0)<<endl;
cout <<"user who edited most articles Top 2: "<<tmusr.at(1)<<'\t'<<tmusrc.at(1)<<endl;
cout <<"user who edited most articles Top 3: "<<tmusr.at(2)<<'\t'<<tmusrc.at(2)<<endl;

finall.open ("results");
finall << "Q1 results:"<<endl;;
finall <<"article has Max. review Top 1: "<<setw(width)<<tmart2.at(0)<<setw(width)<<tmart2c.at(0)<<endl;
finall <<"article has Max. review Top 2: "<<setw(width)<<tmart2.at(1)<<setw(width)<<tmart2c.at(1)<<endl;
finall <<"article has Max. review Top 3: "<<setw(width)<<tmart2.at(2)<<setw(width)<<tmart2c.at(2)<<endl;
finall<<endl;

finall<<"article has Max. review by users Top 1: "<<setw(width)<<tmart.at(0)<<setw(width)<<tmartc.at(0)<<endl;
finall <<"article has Max. review by users Top 2: "<<setw(width)<<tmart.at(1)<<setw(width)<<tmartc.at(1)<<endl;
finall <<"article has Max. review by users Top 3: "<<setw(width)<<tmart.at(2)<<setw(width)<<tmartc.at(2)<<endl;
finall<<endl;
finall <<"user edited most articles Top 1: "<<setw(width)<<tmusr.at(0)<<setw(width-5)<<tmusrc.at(0)<<endl;
finall <<"user edited most articles Top 2: "<<setw(width)<<tmusr.at(1)<<setw(width-5)<<tmusrc.at(1)<<endl;
finall <<"user edited most articles Top 3: "<<setw(width)<<tmusr.at(2)<<setw(width-5)<<tmusrc.at(2)<<endl;
finall.close ();
time(&stop);
cout<<"Finished in about "<< difftime(stop, start)<< " seconds"<<endl;

return 0;
}

// Pulls the four largest counts from *inc; for each one, scans *ins for the key whose
// mapped index matches that position and appends the (name, count) pair to *outs / *outc.
void Processmax(  dense_hash_map< string,int >* ins, vector<int> *inc, vector<string> *outs, vector<int> *outc)
{
int index=0;
int l=0;
 dense_hash_map < string, int>:: iterator iti;
string value;
while(l!=4)
{
    vector<int>::iterator it=max_element(inc->begin(), inc->end());
    index = distance(inc->begin(), it);

    for (iti = ins->begin(); iti != ins->end(); ++iti)
    {
        if (iti->second == index)
        {
            value = iti->first;
            break;
        }
    }
    outs->push_back(value);
    outc->push_back(inc->at(index));
    inc->at(index)=0;
    l++;
  }
}

// Linear search: returns true if inChar is not already in *invector (i.e. it still needs to be added).
bool ProcessInput(const string& inChar, vector<string> *invector)
{
 bool index=true;
 vector<string>::iterator it=find(invector->begin(), invector->end(), inChar);
 if (it!=invector->end())
    index=false;

 return index;
}
  • The sure way to improve speed in an application is to find out which part of it is the bottleneck. I suggest you profile this code and find out what part of it takes the most time. – Borgleader Sep 24 '12 at 02:39
  • Second that. And with a 29 hour sample set, you'll have *more* than enough data to chew through (and expect it to take longer than 29 hours). – WhozCraig Sep 24 '12 at 02:54
  • How big is each record? (What is the average number of bytes per record in the input file?) 8.5 GiB / 120 M records suggests about 70 bytes per record, ignoring overhead; with overhead, quite a bit less per record for the data. – Jonathan Leffler Sep 24 '12 at 05:32
  • You `#include` a number of headers twice: `<iostream>`, `<iterator>`; and it seems odd to include both `<cstring>` and `<string.h>`. If this is C++, you should probably avoid `<stdio.h>`. Why not use `<ctime>` instead of `<time.h>`? Etc. – Jonathan Leffler Sep 24 '12 at 14:05
  • The main point of the program is: 1st, find the user who has made the most edits on articles (the count is needed); 2nd, find the article that has been edited by the most users (the count is needed). So far I only read two attributes. I'm hoping to find something faster for this code. – user1683302 Sep 24 '12 at 17:03
  • Continuing the previous comment: `dense_hash_map<string, int> a; vector< vector<string> > usrlist, artlist;` I'm using the hash map's int value as an index into the vectors, so I'm looking for something that is faster than a vector and is integrated into the map. – user1683302 Sep 24 '12 at 17:05
  • Not sure who downvoted this question, or why. – Viet Sep 26 '12 at 03:57

3 Answers

3

Judging by the data you are printing, you are trying to list just the top three or so users in each of a number of categories. Rather than storing all the data, you only need to store the current top three users in each category. When the new record arrives, you determine whether it replaces any of the top three items in any category, and if so arrange for the new data to replace the appropriate old data. If the new record is 'uninteresting', you ignore it. Make the number of interesting users a parameter of the calculations; solve for the general case of the top N, and then set N to 3.

This limits your storage to a few KiB maximum. You also have much smaller data structures to be manipulating, so they'll be radically faster. Your processing time should drop to about the time taken to read that size of file, which isn't 29 hours.
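
As a rough illustration of this idea (a sketch only, with a made-up `TopN` helper, not the answerer's exact design), the following keeps just the N largest (count, name) pairs seen so far; note that it only helps once a count is available for each candidate:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Hypothetical helper: keeps only the N largest (count, name) pairs offered to it.
    class TopN {
    public:
        explicit TopN(std::size_t n) : n_(n) {}

        // Offer a candidate; it survives only if it beats the current smallest entry.
        void offer(const std::string& name, int count) {
            entries_.push_back(std::make_pair(count, name));
            std::sort(entries_.rbegin(), entries_.rend());   // largest count first
            if (entries_.size() > n_)
                entries_.pop_back();                          // drop the smallest
        }

        const std::vector<std::pair<int, std::string> >& entries() const { return entries_; }

    private:
        std::size_t n_;
        std::vector<std::pair<int, std::string> > entries_;  // sorted, largest first
    };

    int main() {
        TopN top(3);
        top.offer("alice", 12);
        top.offer("bob", 40);
        top.offer("carol", 7);
        top.offer("dave", 25);
        for (std::size_t i = 0; i < top.entries().size(); ++i)
            std::cout << top.entries()[i].second << '\t' << top.entries()[i].first << '\n';
        return 0;
    }

With N fixed at 3, each `offer` call sorts at most four elements, so the per-candidate cost stays tiny no matter how many records are read.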

Jonathan Leffler
  • Thanks for the advice. I know I have included extra headers; I'll fix that. I can't tell which user made the biggest contribution until I've read all the data. My sample is just 1000 records, but my real data is 120 million records, and I got the result after 29 hours. I'm only collecting two attributes now, but in the future I'll have to collect five. I think the bottleneck is the vector search and its integration with the map. I want to replace the vector with something that is faster to search, like a map of unordered_set, or something integrated like a map of lists. Thanks. – user1683302 Sep 24 '12 at 16:17
  • If you're accumulating the data as you go, then it is harder. Have you considered using a DBMS? They should make mincemeat out of a mere 120 million rows, given a sensible schema. Basically, I think you need to change your algorithm radically if you're going to get your answer quicker. Think outside the box; do something very different. – Jonathan Leffler Sep 24 '12 at 16:39
  • I also need to keep a record of who edited which article and vice versa, so I have to check whether the name or article already exists for that user, and that is what takes a long time. – user1683302 Sep 24 '12 at 17:21
  • The 120 million rows is the database size, but most of them are repeated rows that I don't need to keep. If a row is repeated I ignore it, so what I have to keep is maybe 10 million or fewer; it only takes 16 GiB, and I'm using 2 tables rather than 1. – user1683302 Sep 24 '12 at 18:29
  • I checked the map sizes for the 10 million record input: the first table had 20000 entries and the second had 45000, so it seems a lot of records are repeated (I ignore any repeated record). Can you help me find a replacement for the 2D vector that allows fast lookup? – user1683302 Sep 24 '12 at 23:03
2

You may want to follow a few simple steps:

  1. Take a subset of the data (say 1/100 or 1/1000)
  2. Run your program on that sample data under a profiler
  3. Find the bottleneck and optimize it (see the timing sketch below)
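
If a profiler is not at hand, a crude alternative is to time the suspect code regions directly. A minimal sketch (the timed region is just a placeholder comment, and `std::clock` measures CPU time rather than wall-clock time):

    #include <ctime>
    #include <iostream>

    int main() {
        std::clock_t t0 = std::clock();

        // ... run the suspect section here, e.g. the loop that calls ProcessInput() ...

        std::clock_t t1 = std::clock();
        std::cout << "section took "
                  << double(t1 - t0) / CLOCKS_PER_SEC << " CPU seconds\n";
        return 0;
    }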

Some links to read:

http://www.sgi.com/tech/stl/complexity.html

Quick and dirty way to profile your code

vector vs. list in STL

Viet
  • Don't make the sample data too small, or it won't reflect cache misses. – bitmask Sep 24 '12 at 03:24
  • Thanks for the links. I tested my program on 50000 records, and I found that Google dense hash is fast and is not affected by size the way the vector is. – user1683302 Sep 24 '12 at 16:19
  • I'm new to programming, but what I did was run the program in my IDE (NetBeans), pause it, and check the call stack about 10 times; the vector find algorithm was always on the call stack. I hope I understood the call stack correctly. I tested it on 10 million records. – user1683302 Sep 24 '12 at 16:42
  • What I'm doing is using the hash map value as an index into a 2D vector, for example: "username1 (map): (vector) article1, article2, article2". The vector accepts indexing, so I use the username as the key and its value (an int) as a pointer to a vector row: vector[hash_map value]. Can I do something faster, like a map with a 2D map rather than a vector? Thanks. – user1683302 Sep 24 '12 at 17:51
  • I used timestamps to measure the maximum waiting times, and here are the results: the maximum vector search time was 0.0018 (my largest vector has 5000 elements), while the maximum hash map lookup took only 0.00017 for 20000 elements. So it seems the vector search is what takes a long time. Can I replace it with a map or something else hash-based? I'm only using insert and lookup operations. – user1683302 Sep 24 '12 at 18:42
1

Thank you for your help. I can now get the results in only 10 minutes!

    unordered_map< string, unordered_set<string> > a;
    unordered_map< string, unordered_set<string> > u;
    unordered_map< string, int > artc, usrc, artac;
    .....
    ....
    if (true)
    {
        a[x1].insert(x2);
        u[x2].insert(x1);
        artc[x1] = a[x1].size();
        usrc[x2] = u[x2].size();
        artac[x1]++;
    }

For my workload, `unordered_map` is 100% faster than Google dense hash and takes 30% less RAM.
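
For context, here is a self-contained sketch of this approach; the file name `data2.tr`, the two-column "article user" input format, and the top-3 extraction are assumptions carried over from the original program rather than the exact code that produced the 10-minute result:

    #include <algorithm>
    #include <cstddef>
    #include <fstream>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <unordered_set>
    #include <utility>
    #include <vector>

    // Print the three largest counts in a map of name -> count.
    static void printTop3(const char* title,
                          const std::unordered_map<std::string, int>& counts) {
        std::vector<std::pair<int, std::string> > v;
        for (std::unordered_map<std::string, int>::const_iterator it = counts.begin();
             it != counts.end(); ++it)
            v.push_back(std::make_pair(it->second, it->first));
        std::sort(v.rbegin(), v.rend());                     // largest count first
        std::cout << title << '\n';
        for (std::size_t i = 0; i < v.size() && i < 3; ++i)
            std::cout << "  " << v[i].second << '\t' << v[i].first << '\n';
    }

    int main() {
        std::unordered_map<std::string, std::unordered_set<std::string> > a; // article -> distinct users
        std::unordered_map<std::string, std::unordered_set<std::string> > u; // user -> distinct articles
        std::unordered_map<std::string, int> artc, usrc, artac;

        std::ifstream in("data2.tr");
        std::string x1, x2;
        while (in >> x1 >> x2) {                             // stops cleanly at end of file
            a[x1].insert(x2);                                // set insert ignores duplicates
            u[x2].insert(x1);
            artc[x1] = static_cast<int>(a[x1].size());       // distinct users per article
            usrc[x2] = static_cast<int>(u[x2].size());       // distinct articles per user
            ++artac[x1];                                     // total reviews per article
        }

        printTop3("articles reviewed by the most distinct users:", artc);
        printTop3("articles with the most reviews:", artac);
        printTop3("users who edited the most articles:", usrc);
        return 0;
    }

Reading with `while (in >> x1 >> x2)` also avoids the extra pass at end-of-file that the original `while (ifTraceFile.good())` loop performs.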

  • And you can still get more speed out of this if your keys are rather large, by changing the hash functor (a sketch of plugging in a custom functor follows these comments). See [this question](http://stackoverflow.com/q/8372579/500104) for the reason. – Xeo Sep 26 '12 at 01:21
  • +1, glad you found the answer. So std::unordered_set is better suited to your needs than Google dense hash. Note taken =) – Viet Sep 26 '12 at 03:59
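
To illustrate the custom-hash suggestion above: a replacement hash functor is supplied as the third template argument of `unordered_map`. The `CheapStringHash` functor below is made up for illustration (it mixes only the length and the first few characters); whether it is actually faster, or collision-safe enough, depends entirely on the key set:

    #include <cstddef>
    #include <string>
    #include <unordered_map>

    // Hypothetical cheaper hash: mixes the string length with its first
    // few characters instead of hashing every byte.
    struct CheapStringHash {
        std::size_t operator()(const std::string& s) const {
            std::size_t h = s.size();
            std::size_t n = s.size() < 8 ? s.size() : 8;
            for (std::size_t i = 0; i < n; ++i)
                h = h * 131 + static_cast<unsigned char>(s[i]);
            return h;
        }
    };

    int main() {
        // The custom functor goes in as the third template argument.
        std::unordered_map<std::string, int, CheapStringHash> counts;
        counts["some fairly long article title"] = 1;
        return 0;
    }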