- I am trying to load 50000 items from the database with text in them, tag them and update the tags
- I am using pg-promise and pg-query-stream for this purpoes
- I was able to get the streaming part working properly but updating has become problematic with so many update statements
Here is my existing code
const QueryStream = require('pg-query-stream')
const JSONStream = require('JSONStream')
function prepareText(title, content, summary) {
let description
if (content && content.length) {
description = content
} else if (summary && summary.length) {
description = summary
} else {
description = ''
}
return title.toLowerCase() + ' ' + description.toLowerCase()
}
async function tagAll({ db, logger, tagger }) {
// you can also use pgp.as.format(query, values, options)
// to format queries properly, via pg-promise;
const qs = new QueryStream(
'SELECT feed_item_id,title,summary,content FROM feed_items ORDER BY pubdate DESC, feed_item_id DESC'
)
try {
const result = await db.stream(qs, (s) => {
// initiate streaming into the console:
s.pipe(JSONStream.stringify())
s.on('data', async (item) => {
try {
s.pause()
// eslint-disable-next-line camelcase
const { feed_item_id, title, summary, content } = item
// Process text to be tagged
const text = prepareText(title, summary, content)
const tags = tagger.tag(text)
// Update tags per post
await db.query(
'UPDATE feed_items SET tags=$1 WHERE feed_item_id=$2',
// eslint-disable-next-line camelcase
[tags, feed_item_id]
)
} catch (error) {
logger.error(error)
} finally {
s.resume()
}
})
})
logger.info(
'Total rows processed:',
result.processed,
'Duration in milliseconds:',
result.duration
)
} catch (error) {
logger.error(error)
}
}
module.exports = tagAll
- The db object is the one from pg-promise whereas the tagger simply extracts an array of tags from text contained in the variable tags
- Too many update statements are executing from what I can see in the diagnostics, is there a way to batch them?