Crawl through JavaScript redirect

Question

I am writing a spider program in Java and have run into some trouble handling URL redirection. I have encountered two kinds of URL redirection so far. The first kind uses an HTTP 3xx response code, which I can handle by following this answer.

The second kind, however, is one where the server returns HTTP response code 200 with a page that contains only some JavaScript code, like this:

<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<script>
// Client-side redirect: sends the browser to the mobile or the desktop entry
// page depending on the user-agent string. The server itself answers with
// HTTP 200, so the redirect only happens when this script is executed.
function detectmob() { 
    // NOTE: `u` is assigned but never read in the snippet as shown.
    var u=(document.URL);
    // Android (and other user agents elided in the original snippet) go to the
    // mobile page; everything else goes to the desktop page.
    if( navigator.userAgent.match(/Android/i) || some other browser...){
        window.location.href="web/mobile/index.php";
    } else {
        window.location.href="web/desktop/index.php";
    }
}

detectmob();
</script>
</head>
<body></body></html>

If the original URL is http://example.com, then it will automatically redirect to http://example.com/web/desktop/index.php if I am using a desktop web browser with JavaScript enabled.

However, my spider checks HttpURLConnection#getResponseCode() to see whether it has reached the final URL (HTTP response code 200), and uses URLConnection#getHeaderField() to read the Location field if an HTTP 3xx response code is received. The following is the relevant code snippet from my spider:

/**
 * Follows HTTP-level redirects starting at {@code originalUrl} and returns the
 * last URL reached.
 *
 * <p>Only redirects signalled by a 3xx status code with a {@code Location}
 * header are followed; redirects performed by JavaScript in a 200 response are
 * not detected here.
 *
 * @param originalUrl absolute URL to start from
 * @return the URL of the first non-redirect response, or the last URL reached
 *         if an I/O error occurs or the redirect limit is hit
 */
public String getFinalUrl(String originalUrl) {
        // Upper bound on hops so that a redirect loop (A -> B -> A) terminates
        // instead of recursing until the stack overflows.
        final int maxRedirects = 20;
        String currentUrl = originalUrl;
        try {
            for (int hop = 0; hop < maxRedirects; hop++) {
                URL url = new URL(currentUrl);
                HttpURLConnection hCon = (HttpURLConnection) url.openConnection();
                hCon.setInstanceFollowRedirects(false);
                try {
                    int status = hCon.getResponseCode();
                    // 307 and 308 have no named constant on HttpURLConnection.
                    boolean redirect = status == HttpURLConnection.HTTP_MOVED_PERM
                            || status == HttpURLConnection.HTTP_MOVED_TEMP
                            || status == HttpURLConnection.HTTP_SEE_OTHER
                            || status == 307
                            || status == 308;
                    String location = hCon.getHeaderField("Location");
                    if (!redirect || location == null || location.isEmpty()) {
                        // Not a redirect, or a redirect without a usable target.
                        return currentUrl;
                    }
                    // Location may be relative; resolve it against the current URL.
                    currentUrl = new URL(url, location).toString();
                    System.out.println("redirected url: " + currentUrl);
                } finally {
                    hCon.disconnect();
                }
            }
            System.err.println("Too many redirects, stopping at: " + currentUrl);
        } catch (IOException ex) {
            System.err.println(ex.toString());
        }

        return currentUrl;
    }

So fetching the above page yields HTTP response code 200, and my spider simply assumes there is no further redirection and starts parsing the page, which contains no text content.

I have googled this issue a bit and apparently javax.script is somehow related, but I have no idea how to make it work. How can I program my spider so that it is able to get the correct URL?

score 0 · Answer 1 · answered May 25 '17 at 05:03

Here is a solution that uses Apache HttpClient to handle the response code redirects, Jsoup to extract javascript from html, then regular expressions to get the redirect String from a couple of ways redirects can be performed in javascript.

package com.yourpackage;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.google.common.base.Joiner;
import com.google.common.net.HttpHeaders;

/**
 * Fetches a page and follows a JavaScript-based redirect if the page contains one.
 *
 * <p>HTTP-level (3xx) redirects are left to Apache HttpClient. JavaScript
 * redirects are detected with regular expressions, not by executing the
 * script, so only literal string targets of {@code window.location.replace(...)}
 * and {@code window.location.href = ...} are recognised. When a script contains
 * several candidates (e.g. in different branches), the first one wins. A single
 * JavaScript hop is followed.
 */
public class CrawlHelper {

  /** Scheme prefix assumed when a URL is supplied without one. */
  private static final String DEFAULT_SCHEME_PREFIX = "http://";

  /** Value sent in the User-Agent request header. */
  private static final String USER_AGENT_VALUE = "Mozilla/5.0 (compatible; CrawlHelper/1.0)";

  /**
   * Matches {@code window.location.replace("target")} with single or double
   * quotes. The target is captured up to the next quote, so several quoted
   * strings in the same script do not bleed into the match.
   */
  private static final Pattern LOCATION_REPLACE =
      Pattern.compile("window\\.location\\.replace\\(\\s*[\"']([^\"']+)[\"']\\s*\\)");

  /**
   * Matches {@code window.location.href = "target"} with single or double
   * quotes. A comparison ({@code ==}) does not match because a quote must
   * follow the single {@code =}.
   */
  private static final Pattern HREF_ASSIGN =
      Pattern.compile("window\\.location\\.href\\s*=\\s*[\"']([^\"']+)[\"']");

  /**
   * Gets the end contents of a URL. The status code is not checked here because
   * org.apache.http.client.HttpClient effectively handles the 301 redirects.
   *
   * <p>Javascript is extracted using Jsoup and scanned for a redirect target;
   * the first script that yields one decides where to go next.
   *
   * @param urlString Url. {@code http://} will be prepended if no http/https
   *        scheme is present.
   * @return Result after all redirects, including a javascript redirect; the
   *         originally fetched page when no javascript redirect is found.
   * @throws IOException if the URL is blank or invalid, or a request fails
   */
  public String getResult(final String urlString) throws IOException {
    String html = getTextFromUrl(urlString);
    Document doc = Jsoup.parse(html);
    for (Element script : doc.select("script")) {
      String target = getTargetLocationFromScript(urlString, script.html());
      // Blank means "this script does not redirect"; keep looking.
      if (!StringUtil.isBlank(target)) {
        return getTextFromUrl(target);
      }
    }
    return html;
  }

  /**
   * Finds a redirect target in a script and resolves it against a base URL.
   *
   * @param urlString Base the target is resolved against when the target is
   *        not already an absolute http/https URL. If the base carries no
   *        scheme, the result carries none either.
   * @param js Javascript to scan.
   * @return Resolved target that matches window.location.replace or
   *         window.location.href assignments, or an empty string if the script
   *         contains no such redirect.
   * @throws IOException if the target cannot be resolved into a valid URL
   */
  String getTargetLocationFromScript(String urlString, String js) throws IOException {
    String target = getTargetLocationFromScript(js);
    // Nothing found: report "no redirect" rather than echoing the base URL,
    // which would make the caller re-fetch the page it already has.
    if (StringUtil.isBlank(target) || hasHttpScheme(target)) {
      return target;
    }
    // java.net.URL needs a scheme to resolve against; add one temporarily and
    // strip it again so a scheme-less base yields a scheme-less result.
    boolean schemeAdded = !hasHttpScheme(urlString);
    String base = schemeAdded ? DEFAULT_SCHEME_PREFIX + urlString : urlString;
    String resolved;
    try {
      resolved = new URL(new URL(base), target).toString();
    } catch (MalformedURLException mue) {
      throw new IOException(mue);
    }
    if (schemeAdded && resolved.startsWith(DEFAULT_SCHEME_PREFIX)) {
      return resolved.substring(DEFAULT_SCHEME_PREFIX.length());
    }
    return resolved;
  }

  /**
   * Finds the raw (unresolved) redirect target in a script.
   *
   * @param js Javascript to scan; {@code null} is treated as empty.
   * @return The first literal target of window.location.replace, otherwise the
   *         first literal target assigned to window.location.href, otherwise
   *         an empty string.
   * @throws IOException never thrown by the current implementation; kept for
   *         signature compatibility
   */
  String getTargetLocationFromScript(String js) throws IOException {
    if (js == null) {
      return "";
    }
    String target = getTargetLocationFromLocationReplace(js);
    if (StringUtil.isBlank(target)) {
      target = getTargetLocationFromHrefAssign(js);
    }
    return target;
  }

  /** Returns the first target assigned to window.location.href, or "". */
  private String getTargetLocationFromHrefAssign(String js) {
    return findTargetFrom(HREF_ASSIGN, js);
  }

  /** Returns the first target passed to window.location.replace, or "". */
  private String getTargetLocationFromLocationReplace(String js) throws IOException {
    return findTargetFrom(LOCATION_REPLACE, js);
  }

  /** Returns the first non-blank capture of group 1 of the pattern in js, or "". */
  private String findTargetFrom(Pattern pattern, String js) {
    Matcher m = pattern.matcher(js);
    while (m.find()) {
      String potentialURL = m.group(1);
      if (!StringUtil.isBlank(potentialURL)) {
        return potentialURL;
      }
    }
    return "";
  }

  /** Returns true if the value starts with http:// or https:// (any case). */
  private static boolean hasHttpScheme(String value) {
    return value.regionMatches(true, 0, "http://", 0, 7)
        || value.regionMatches(true, 0, "https://", 0, 8);
  }

  /**
   * Performs a GET request and returns the response body as text.
   *
   * @param urlString URL to fetch; a scheme is added if missing.
   * @return The response body with line breaks preserved, or an empty string
   *         if the response has no body.
   * @throws IOException if the URL is blank or invalid, or the request fails
   */
  private String getTextFromUrl(String urlString) throws IOException {
    if (StringUtil.isBlank(urlString)) {
      throw new IOException("Supplied URL value is empty.");
    }
    String httpUrlString = prependHTTPifNecessary(urlString);
    HttpGet request = new HttpGet(httpUrlString);
    // HttpHeaders.USER_AGENT is the header *name*; the value is supplied separately.
    request.addHeader(HttpHeaders.USER_AGENT, USER_AGENT_VALUE);
    // Both the client and the response hold connections and must be closed.
    try (CloseableHttpClient client = HttpClientBuilder.create().build();
        CloseableHttpResponse response = client.execute(request)) {
      HttpEntity entity = response.getEntity();
      if (entity == null) {
        return "";
      }
      // Uses the charset declared by the response, falling back to UTF-8.
      return EntityUtils.toString(entity, StandardCharsets.UTF_8);
    }
  }

  /**
   * Ensures the URL carries an http/https scheme and is syntactically valid.
   *
   * @throws IOException if the resulting URL is malformed
   */
  private String prependHTTPifNecessary(String urlString) throws IOException {
    String candidate = hasHttpScheme(urlString) ? urlString : DEFAULT_SCHEME_PREFIX + urlString;
    return validateURL(candidate);
  }

  /**
   * Checks that the string parses as a URL.
   *
   * @return the unchanged input
   * @throws IOException wrapping the MalformedURLException if it does not parse
   */
  private String validateURL(String urlString) throws IOException {
    try {
      new URL(urlString);
    } catch (MalformedURLException mue) {
      throw new IOException(mue);
    }
    return urlString;
  }
}

TDD... modify/enhance to match various scenarios:

package com.yourpackage;

import java.io.IOException;

import org.junit.Assert;
import org.junit.Test;

/**
 * Tests for {@link CrawlHelper}: redirect-target extraction from script text,
 * and an end-to-end crawl against a placeholder site.
 */
public class CrawlHelperTest {

  /** JUnit creates a fresh test instance, and thus a fresh helper, per test method. */
  private final CrawlHelper helper = new CrawlHelper();

  /** Extraction of the target from location.replace (with base) and href assignment (raw). */
  @Test
  public void testRegex() throws IOException {
    String replaceScript = "function goHome() { window.location.replace(\"/s/index.html\")}";
    String resolved = helper.getTargetLocationFromScript("somesite.com", replaceScript);
    Assert.assertEquals("somesite.com/s/index.html", resolved);

    String hrefScript = "window.location.href=\"web/mobile/index.php\";";
    String raw = helper.getTargetLocationFromScript(hrefScript);
    Assert.assertEquals("web/mobile/index.php", raw);
  }

  /** End-to-end crawl; replace the host and expected content with real values. */
  @Test
  public void testCrawl() throws IOException {
    String page = helper.getResult("somesite.com");
    Assert.assertTrue(page.contains("someExpectedContent"));
  }

}

Crawl through JavaScript redirect

1 Answer