I'm trying to parse the web page paadalvarigal.com to get the URLs of all the results on the page (highlighted in green colour). But when I parse the page using Jsoup, I'm not getting all of the divs when I print the doc object. The URLs and titles in the div with id "hits" are also replaced with "{{{URL}}}" and "{{{Title}}}" in the doc object that I print to the console. Also, out of the six divs with class name "hit" in the actual page,
I'm getting only one div with class "hit" in the parsed page.
I have also tried setting maxBodySize() to 0 to get the entire web page, but I still get the same problem. Please guide me on what's going wrong.
package com.balaji.parse;
import org.jsoup.Jsoup;
import org.jsoup.Connection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Small jsoup experiment with two demos: parsing an in-memory HTML string
 * and downloading a live search-results page, printing what jsoup produced
 * to standard output in both cases.
 */
public class ParseHTML {

    /** Address of the search-results page fetched by {@link #parseFromHTML()}. */
    private static final String URL = "http://www.paadalvarigal.com/search/?q=naanum%20rowdythan";

    /**
     * Program entry point; runs the live-page demo.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        // parseFromString() is the offline demo; it is intentionally not invoked here.
        parseFromHTML();
    }

    /**
     * Parses a fixed HTML snippet twice -- once as a full document and once
     * as a body fragment -- and prints the results.
     */
    private static void parseFromString() {
        String markup = "<html><head><title>First parse</title></head><body><p>Parsed HTML into a doc.</p></body></html>";

        Document fullDocument = Jsoup.parse(markup);
        System.out.println(fullDocument.head());
        System.out.println(fullDocument.title());
        System.out.println(fullDocument.body());

        // Body-fragment parsing: the input is treated as body content and
        // jsoup supplies the surrounding html/body shell itself.
        System.out.println("Parsing only Body");
        Document bodyOnly = Jsoup.parseBodyFragment(markup);
        System.out.println(bodyOnly);
    }

    /**
     * Downloads {@link #URL} with a browser-like request and prints the
     * head element, the title, and then the whole parsed document.
     *
     * <p>Any exception raised while connecting or parsing is caught and its
     * stack trace is printed; nothing is rethrown.
     *
     * <p>NOTE(review): what gets printed is the markup as delivered by the
     * server. If the site builds its result list with client-side script,
     * template placeholders such as <code>{{{URL}}}</code> would be expected
     * to remain unexpanded here -- confirm against the jsoup documentation
     * on JavaScript support.
     */
    private static void parseFromHTML() {
        try {
            Connection request = Jsoup.connect(URL)
                    .timeout(5000) // milliseconds
                    .header("Accept-Encoding", "gzip, deflate")
                    .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0")
                    .maxBodySize(0); // 0 removes the response-size cap

            Document page = request.get();
            System.out.println(page.head());
            System.out.println(page.title());
            System.out.println(page);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
P.S.: I'm new to jsoup and I'm trying to learn the library for personal projects.