While I was using async module and bookshelf.js orm, duplicate records (person roles) are being added to the database even though I check before adding them, I couldn't figure out why.
In a nutshell, here's what I'm trying to achieve.
- Visit some urls (in parallel, it's a web scraper, it visits 10 urls each time thanks to http.globalAgent.maxSockets), fetch
person id
,person name
androles
. - If a person exists in the database (I check it using the person id, it's the same id that I fetch from the a url), add him/her. If the record exists, don't do anything.
- Get the names of the roles of this person and query the database. If the role doesn't exist in the database, add it first, get its id, then add it to the pivot table (many-to-many relationship since a person can have more than one roles) If the role exists, just add it to the pivot table with related person_id.
I'm using async
, request
, cheerio
and bookshelf.js
.
app.get('/async', function (req, res) {
var urlArr = [];
for (var i = 10; i < 100; i++) {
urlArr.push("http://www.example.com/person/" + i + "/personname.html");
}
async.each(urlArr, function (url, callback) {
request({
url: url,
headers: {
'User-Agent': req.headers['user-agent'],
'Content-Type': 'application/json; charset=utf-8'
}
}, function (err, resp, body) {
if (err) {
console.log(err);
} else {
if (cheerio.load(body)) {
var $ = cheerio.load(body);
var links = $('#container');
var name = links.find('span[itemprop="name"]').html(); // name
if (name == null) {
console.log("null returned, do nothing");
} else {
name = entities.decodeHTML(name);
var r = url.substring(33, 35);
person.where('id', r).fetch({require: true}).then(function (p) {
// person exists, don't do anything
}).catch(function () {
// person doesn't exist, proceed
var numberOfRoles = links.find('span[itemprop="roletitle"]').length; // number of roles
new person({id: r, name: name}).save(null, {method: 'insert'}).then(function (returnval) {
var rolesArr = [];
for (var b = 0; b < numberOfRoles; b++) {
rolesArr.push(b);
}
async.each(rolesArr, function (roles, callback) {
var personTypes = $('span[itemprop="roletitle"]').eq(roles).text();
var personTypes = entities.decodeHTML(personTypes);
personRole.where('role', personTypes).fetch().then(function (data1) {
// role exists, add the role_id and person_id to the pivot table
new personPersonRole({
person_id: r,
personrole_id: data1.id
}).save().then(function (data2) {
console.log("role has been added");
});
}).catch(function () {
// role doesn't exist, add it first
new personRole({
role: personTypes
}).save().then(function (data3) {
console.log("new added role_id is : " + data3.id);
// add person_id and role_id to the pivot table
new personPersonRole({
person_id: r,
personrole_id: data3.id
}).save();
});
});
callback();
});
});
});
}
}
else {
console.log("can't open");
}
}
});
callback();
},
function (err) {
if (err) {
console.log("err");
} else {
console.log("success");
}
});
});