Introduction
I have a vector `entities` containing 44 million names. I want to split it into 4 parts and process each part in parallel. Class `Freebase` contains the function `loadData()`, which is used to split the vector and call the function `multiThread` in order to do the processing.
- `loadEntities()` reads a text file containing the names. I didn't put its implementation in the class because it's not important.
- `loadData()` splits the vector `entities`, which was initialized in the constructor, into 4 parts and adds a thread for every part to the `vector<thread> threads` as follows:
threads.push_back(thread(&Freebase::multiThread, this, i, i + right, ref(data)));
- `multiThread` is the function where I process the files.
- `i` and `i + right` are the indices used in the for loop of `multiThread` to loop through `entities`.
- `returnValues` is a subfunction of `multiThread` and is used to call an external function.
Problem
cout <<"Entity " << entities[i] << endl;
is showing the following results:
- Entity m.0rzf6wv (ok)
- Entity m.0rzf70 (ok)
- Entity m.068s4h9 m.0n_k8bz (WRONG)
- Entity Entity m.068s5_1 (WRONG)
The last 2 outputs are wrong. The output should be `Entity name`, not `entity entity name` nor `entity name name`.
This is causing a segmentation fault when the input is sent to the function `returnValues`. How can I solve it?
Source Code
#ifndef FREEBASE_H
#define FREEBASE_H
/**
 * Holds the configuration strings passed to the constructor plus the list of
 * entity names, and processes those entities with worker threads.
 *
 * NOTE(review): the constructor definition also calls loadEntities(); its
 * declaration is not visible here. If it is meant to be a member of this
 * class, it must be declared below as well.
 */
class Freebase
{
public:
    /// Stores the four configuration strings (server URL, entities, XML file,
    /// tf-idf database — in that order).
    Freebase(const std::string &, const std::string &, const std::string &, const std::string &);

    /// Splits `entities` into ranges and processes each range on its own thread.
    void loadData();

private:
    std::string _serverURL;
    std::string _entities;
    std::string _xmlFile;
    // Initialised by the constructor's member-initialiser list, so it has to be
    // declared here; kept after _xmlFile to match the initialisation order.
    std::string _tfidfDatabase;

    /// Worker run by each thread: handles entities in the index range [first, second).
    void multiThread(int,int, std::vector<std::pair<std::string, std::string>> &);

    /// Runs a SPARQL query and returns the parsed result rows. Defined as a
    /// member in the implementation, so it must be declared in the class.
    std::vector<std::pair<std::string, std::string>> returnValues(const std::string & query);

    //private data members
    std::vector<std::string> entities;
};
#endif
#include "Freebase.h"

#include <mutex>
#include <utility>

#include "queries/SparqlQuery.h"
/**
 * Copies the four configuration strings into the corresponding members and
 * then fills `entities` with the result of loadEntities().
 */
Freebase::Freebase(const string & url, const string & e, const string & xmlFile, const string & tfidfDatabase)
    : _serverURL(url),
      _entities(e),
      _xmlFile(xmlFile),
      _tfidfDatabase(tfidfDatabase)
{
    // Done in the body (not the initialiser list) so every configuration
    // member is already set when loadEntities() runs.
    this->entities = loadEntities();
}
void Freebase::multiThread(int start, int end, vector<pair<string,string>> & data)
{
string basekb = "PREFIX basekb:<http://rdf.basekb.com/ns/> ";
for(int i = start; i < end; i++)
{
cout <<"Entity " << entities[i] << endl;
vector<pair<string, string>> description = returnValues(basekb + "select ?description where {"+ entities[i] +" basekb:common.topic.description ?description. FILTER (lang(?description) = 'en') }");
string desc = "";
for(auto &d: description)
{
desc += d.first + " ";
}
data.push_back(make_pair(entities[i], desc));
}
}
void Freebase::loadData()
{
vector<pair<string, string>> data;
vector<thread> threads;
int Size = entities.size();
//split database into 4 parts
int p = 4;
int right = round((double)Size / (double)p);
int left = Size % p;
float totalduration = 0;
vector<pair<int, int>> coordinates;
int counter = 0;
for(int i = 0; i < Size; i += right)
{
if(i < Size - right)
{
threads.push_back(thread(&Freebase::multiThread, this, i, i + right, ref(data)));
}
else
{
threads.push_back(thread(&Freebase::multiThread, this, i, Size, ref(data)));
}
}//end outer for
for(auto &t : threads)
{
t.join();
}
}
/**
 * Sends `query` to `_serverURL` through SparqlQuery and converts the textual
 * response into a list of string pairs.
 *
 * The first line of the response is discarded (presumably a column header —
 * confirm against SparqlQuery's output format). On every following line all
 * double-quote characters are removed and the line is split on tab characters:
 *   - exactly two fields  -> (field0, field1)
 *   - any other count     -> (whole line, "")
 *
 * @param query SPARQL query text
 * @return one pair per response line after the first
 */
vector<pair<string, string>> Freebase::returnValues(const string & query)
{
    SparqlQuery request(query, _serverURL);
    string response = request.retrieveInformations();

    istringstream stream(response);
    string row;

    // Drop the first line of the response.
    getline(stream, row);

    vector<pair<string, string>> rows;
    while(getline(stream, row))
    {
        // Strip every double quote before splitting.
        row.erase(remove(row.begin(), row.end(), '\"'), row.end());

        vector<string> fields;
        boost::split(fields, row, boost::is_any_of("\t"));

        if(fields.size() != 2)
        {
            // Not a two-column row: keep the whole line as the first element.
            rows.push_back(make_pair(row, ""));
            continue;
        }

        rows.push_back(make_pair(fields[0], fields[1]));
    }
    return rows;
}