0

I am creating a web scraper that scrapes all of the movies coming out over the next year from this site (https://www.imdb.com/movies-coming-soon/). It loops through an array of links, one for each month of the next year. It's working, but the only problem is that the results are not returned in order because of Node.js's asynchronous behavior. How do I get it to loop through the array and return the data in order?

I've tried to write a callback function, but I don't know where it would go.

const request = require('request')
const cheerio = require('cheerio')

// Site-relative paths of the monthly "coming soon" pages, listed in chronological order.
const movieArray = [ '/movies-coming-soon/2019-09/',
'/movies-coming-soon/2019-10/',
'/movies-coming-soon/2019-11/',
'/movies-coming-soon/2019-12/',
'/movies-coming-soon/2020-01/',
'/movies-coming-soon/2020-02/',
'/movies-coming-soon/2020-03/',
'/movies-coming-soon/2020-04/',
'/movies-coming-soon/2020-05/',
'/movies-coming-soon/2020-06/',
'/movies-coming-soon/2020-07/',
'/movies-coming-soon/2020-08/' ]
// Every request is started immediately, without waiting for the previous one.
// Each callback runs whenever its own response arrives, so the logged output
// follows arrival order rather than the order of movieArray.
for (let i = 0; i < movieArray.length; i++) {
    request.get('https://www.imdb.com' + movieArray[i] , (err, res, body) => {
        // Failed requests and non-200 responses are skipped without any output.
        if (!err && res.statusCode == 200) {
            console.log(res.request.href)
            const $ = cheerio.load(body)
            //console.log(next)
            // Log the text of every <h4> element on the page; this inner i
            // shadows the loop index of the enclosing for loop.
            $('h4').each((i, v) => {
                const date = $(v).text()
                console.log(date)
            })               
        }
    })
}

I'm expecting the data to be returned in order, instead of in an order based on how quickly each response comes back due to Node's asynchronous behavior.

Kendall Kelly
  • 121
  • 1
  • 10

2 Answers

1

It's a classic async issue in a for loop, as explained at https://lavrton.com/javascript-loops-how-to-handle-async-await-6252dd3c795/. Below is a solution:

// const request = require('request')
const request = require('request-promise');
const cheerio = require('cheerio');

// Site-relative paths of the monthly "coming soon" pages, in chronological
// order; each one is appended to the IMDb origin when it is requested.
const movieArray = [
  '/movies-coming-soon/2019-09/',
  '/movies-coming-soon/2019-10/',
  '/movies-coming-soon/2019-11/',
  '/movies-coming-soon/2019-12/',
  '/movies-coming-soon/2020-01/',
  '/movies-coming-soon/2020-02/',
  '/movies-coming-soon/2020-03/',
  '/movies-coming-soon/2020-04/',
  '/movies-coming-soon/2020-05/',
  '/movies-coming-soon/2020-06/',
  '/movies-coming-soon/2020-07/',
  '/movies-coming-soon/2020-08/',
];

/**
 * Processes the given paths strictly one after another: the next page is not
 * requested until getMovie has finished with the current one, so the output
 * appears in the same order as the array. Logs 'Done' after the last entry.
 *
 * @param {string[]} array - Paths to hand to getMovie, in the desired order.
 * @returns {Promise<void>} Resolves after every entry has been processed.
 */
async function processMovieArray(array) {
  let position = 0;
  while (position < array.length) {
    await getMovie(array[position]);
    position += 1;
  }
  console.log('Done');
}

/**
 * Fetches one "coming soon" page and logs the text of every <h4> element on it
 * (presumably the release-date headings -- confirm against the page markup).
 *
 * @param {string} item - Site-relative path, e.g. '/movies-coming-soon/2019-09/'.
 * @returns {Promise<void>} Resolves once the headings have been logged;
 *   rejects if the request itself fails.
 */
async function getMovie(item) {
  const options = {
    method: 'GET',
    uri: `https://www.imdb.com${item}`,
  };
  // request-promise resolves with the response body (a string) by default.
  // The full response object is only returned when the option
  // resolveWithFullResponse: true is set, so reading `.body` off the resolved
  // value would hand `undefined` to cheerio.
  const body = await request(options);
  const $ = cheerio.load(body);
  $('h4').each((index, heading) => {
    console.log($(heading).text());
  });
}

// Start the sequential scrape. A failed request rejects the returned promise,
// so report it here instead of leaving the rejection unhandled.
processMovieArray(movieArray).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Chris Chen
  • 1,228
  • 9
  • 14
  • I'm getting an error saying ``` const $ = await cheerio.load(body) ^^^^^ SyntaxError: await is only valid in async function``` – Kendall Kelly Sep 12 '19 at 02:38
  • My bad as cheerio.load() is synchronous... And request is not asynchronous as well. Code is updated or you can follow @jfriend00 's solution – Chris Chen Sep 12 '19 at 06:22
  • I thought the regular request library is async. How would you go about doing it with that library rather than request-promise? I'm not really that familiar with promises – Kendall Kelly Sep 13 '19 at 02:25
  • here is a related discussion with similar use case about promise and where you may find their code useful https://stackoverflow.com/questions/47341603/async-await-with-request-promise-returns-undefined – Chris Chen Sep 13 '19 at 03:08
0

The low tech way that deviates the least from your current code is to just use the index of your for loop to populate an array. Since let in the for loop will make a separate variable for i for each iteration of the for loop, we can use that index inside the async callback to reference the desired spot in a results array. Then, you also use a cntr to know when you've finished with all the results:

const request = require('request');
const cheerio = require('cheerio');

// Fallback for runtimes that lack Array.prototype.flat: concatenates the
// elements one level deep into a new array. Does nothing when flat() exists.
if (!Array.prototype.flat) {
    Array.prototype.flat = function() {
        let flattened = [];
        this.forEach((entry) => {
            flattened = flattened.concat(entry);
        });
        return flattened;
    };
}


// Site-relative paths of the monthly "coming soon" pages, in chronological
// order; the index of each entry is reused as its slot in the results array.
const movieArray = [ '/movies-coming-soon/2019-09/',
'/movies-coming-soon/2019-10/',
'/movies-coming-soon/2019-11/',
'/movies-coming-soon/2019-12/',
'/movies-coming-soon/2020-01/',
'/movies-coming-soon/2020-02/',
'/movies-coming-soon/2020-03/',
'/movies-coming-soon/2020-04/',
'/movies-coming-soon/2020-05/',
'/movies-coming-soon/2020-06/',
'/movies-coming-soon/2020-07/',
'/movies-coming-soon/2020-08/' ];

// Each response stores its texts at the index of the request that produced it,
// so the final order matches movieArray no matter which response arrives first.
const results = [];
// Number of requests that have completed, whether they succeeded or not.
let cntr = 0;
for (let i = 0; i < movieArray.length; i++) {
    request.get('https://www.imdb.com' + movieArray[i], (err, res, body) => {
        ++cntr;
        if (!err && res.statusCode === 200) {
            console.log(res.request.href);
            const $ = cheerio.load(body);
            const textArray = [];
            // The inner index is named idx so it cannot be confused with the
            // loop's i, which selects the slot in results below.
            $('h4').each((idx, v) => {
                textArray.push($(v).text());
            });
            results[i] = textArray;
        }
        if (cntr === movieArray.length) {
            // all results are done now; a failed request leaves its slot
            // empty, and flat() skips empty slots
            const allResults = results.flat();
            console.log(allResults);
        }
    });
}

A bit more elegant way is to switch over to promises and let the promise infrastructure keep everything in order for you:

const rp = require('request-promise');
const cheerio = require('cheerio');

// Fallback for runtimes that lack Array.prototype.flat: concatenates the
// elements one level deep into a new array. Does nothing when flat() exists.
if (!Array.prototype.flat) {
    Array.prototype.flat = function() {
        let flattened = [];
        this.forEach((entry) => {
            flattened = flattened.concat(entry);
        });
        return flattened;
    };
}

// Site-relative paths of the monthly "coming soon" pages, in chronological
// order; each one is appended to the IMDb origin when it is requested.
const movieArray = [ '/movies-coming-soon/2019-09/',
'/movies-coming-soon/2019-10/',
'/movies-coming-soon/2019-11/',
'/movies-coming-soon/2019-12/',
'/movies-coming-soon/2020-01/',
'/movies-coming-soon/2020-02/',
'/movies-coming-soon/2020-03/',
'/movies-coming-soon/2020-04/',
'/movies-coming-soon/2020-05/',
'/movies-coming-soon/2020-06/',
'/movies-coming-soon/2020-07/',
'/movies-coming-soon/2020-08/' ];

// Fetch every monthly page in parallel. Promise.all resolves with its results
// in the same order as movieArray, regardless of which response arrives first.
Promise.all(movieArray.map(path => {
    return rp('https://www.imdb.com' + path).then(body => {
        const $ = cheerio.load(body);
        const textArray = [];
        $('h4').each((i, v) => {
            textArray.push($(v).text());
        });
        return textArray;
    }).catch(err => {
        // Report which url failed and why, then resolve with undefined so the
        // remaining pages are still collected instead of aborting Promise.all
        console.error('Failed to fetch ' + path + ':', (err && err.message) || err);
        return undefined;
    });
})).then(results => {
    // flatten the two level array and remove empty items (including the
    // undefined placeholders left by failed requests)
    const allResults = results.flat().filter(item => !!item);
    console.log(allResults);
}).catch(err => {
    console.error(err);
});

FYI, I tested the 2nd version in nodejs version 10.16.0 and it works.

jfriend00
  • 683,504
  • 96
  • 985
  • 979