1

I'm attempting to scrape a webpage in Node.js.

// Shared `request` instance that gives up after three redirects.
const request = require('request').defaults({ maxRedirects: 3 });

const url = "https://webapp4.asu.edu/catalog/classlist?k=81684&t=2177&e=all&hon=F&promod=F";

// The query string is already embedded in `url`, so no `qs: propertiesObject`
// option is passed along with it.
request({ url }, (err, response, body) => {
  if (err) {
    console.log(err);
    return;
  }
  console.log(`Get response: ${response.statusCode}`);
});

I'm hitting the maximum number of redirects for some reason. I can make the same GET request in Postman, and I can visit the page in a browser just fine. What could be causing the redirects?

Ryan Shocker
  • 693
  • 1
  • 7
  • 22
  • Why are you passing the same query params in propertiesObject that are found in your url? Most of the time you will have a url like "https://webapp4.asu.edu/catalog/classlist" and then you will pass a query object that is something like { k:'81684', t:'2177', e:"all", hon:"F", promod:"F" }. – user2263572 Apr 01 '17 at 22:59
  • Sorry, that was a typo. I didn't have that in the request. – Ryan Shocker Apr 01 '17 at 23:01
  • And have you tried removing ".defaults({maxRedirects:3});" and does that change the outcome? I believe the request default is 10, and depending on the page you are trying to view 3 may be too low. – user2263572 Apr 01 '17 at 23:03
  • Yes, I have tried that as well – Ryan Shocker Apr 01 '17 at 23:05
  • Would it have anything to do with the referrer? I notice on the request they have a policy Referrer Policy:no-referrer-when-downgrade – Ryan Shocker Apr 01 '17 at 23:06
  • Try using cookieJar from `cookielib` https://stackoverflow.com/a/6930182/11686526 – Minaro Feb 03 '22 at 18:16

1 Answer

2

Best Practice: You should always check for a robots.txt file before scraping a webpage. I was unable to locate one for this specific site, but if you do come across a site that does not allow scraping, you should follow the rules it sets out.

That being said, it seems as if your scraper is getting stuck in an infinite redirect loop due to the lack of headers on the outgoing request.

Something like the below will get you a response, but you will need to determine what parsing needs to be done to extract information from it.

const request = require('request');

// Request options that mimic a desktop Chrome page load. The bare request
// appears to get stuck in a redirect loop when it carries no headers, so a
// browser-like header set is sent along with it.
const options = {
  url: 'https://webapp4.asu.edu/catalog/classlist?k=81684&t=2177&e=all&hon=F&promod=F',
  // Ask `request` to decode gzip/deflate response bodies; without this flag the
  // compressed bytes are passed to the callback untouched.
  gzip: true,
  headers: {
    // The "method", "path" and "scheme" entries copied from the browser's
    // network panel are HTTP/2 pseudo-headers, not real header fields, and the
    // copied path did not match `url`; they are intentionally not sent.
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    // Only advertise encodings `request` is able to decode (no sdch/br).
    "accept-encoding": "gzip, deflate",
    "accept-language": "en-US,en;q=0.8",
    "cache-control": "no-cache",
    // NOTE(review): this is a session cookie captured from a browser visit; it
    // presumably expires, so replace it with a fresh one (or try `jar: true`)
    // if the redirect loop comes back — confirm against the live site.
    "cookie": "JSESSIONID=javaprod19~413DF4150236B1466C8ECB85EB796C06.catalog19; onlineCampusSelection=C; __cfduid=d5e9cb96f2485f7500fec2116ee8f23381491087061; __utma=59190898.1874896314.1491088625.1491088625.1491088625.1; __utmb=59190898.2.10.1491088625; __utmc=59190898; __utmz=59190898.1491088625.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=137925942.2000995260.1491087063.1491087063.1491088718.2; __utmb=137925942.2.10.1491088718; __utmc=137925942; __utmz=137925942.1491088718.2.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ADRUM=s=1491089349546&r=https%3A%2F%2Fwebapp4.asu.edu%2Fcatalog%2Fclasslist%3F-1275642430",
    "pragma": "no-cache",
    "referer": "https://webapp4.asu.edu/catalog/",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
  },
};

/**
 * Prints the response body, or the error when the request failed.
 *
 * @param {?Error} error - Set when the request could not be completed.
 * @param {object} response - Response object supplied by `request`; undefined on error.
 * @param {string} body - Response body (same value as `response.body`).
 */
function callback(error, response, body) {
  // On failure `response` is undefined, so report the error and stop rather
  // than dereferencing it.
  if (error) {
    console.error(error);
    return;
  }

  console.log(body);
}

request(options, callback);
user2263572
  • 5,435
  • 5
  • 35
  • 57