I am trying to create a web scraper program that takes tables from a website and converts them into ".csv" files.
I'm using Jsoup to pull the data down into a document and have it read from document.html() doc.html() below. The reader as it stands picks up 18 tables at my test site but no table data tags.
Do you have any idea what could be going wrong?
ArrayList<Data_Log> container = new ArrayList<Data_Log>();
ArrayList<ListData_Log> containerList = new ArrayList<ListData_Log>();
ArrayList<String> tableNames = new ArrayList<String>();// Stores native names of tables
ArrayList<Double> meanStorage = new ArrayList<Double>();// Stores data mean per table
ArrayList<String> processlog = new ArrayList<String>();// Keeps a record of all actions taken per iteration
ArrayList<Double> modeStorage = new ArrayList<Double>();
Calendar cal;
private static final long serialVersionUID = -8174362940798098542L;
public void takeData() throws IOException {
if (testModeActive == true) {
System.out.println("Initializing Data Cruncher with developer logs");
System.out.println("Taking data from: " + dataSource); }
int irow = 0;
int icolumn = 0;
int iTable = 0;
// int iListno = 0;
// int iListLevel;
String u = null;
boolean recording = false;
boolean duplicate = false;
Document doc = Jsoup.connect(dataSource).get();
Webtitle = doc.title();
Pattern tb = Pattern.compile("<table");
Matcher tB = tb.matcher(doc.html());
Pattern ttl = Pattern.compile("<title>(//s+)</title>");
Matcher ttl2= ttl.matcher(doc.html());
Pattern tr = Pattern.compile("<tr");
Matcher tR = tr.matcher(doc.html());
Pattern td = Pattern.compile("<td(//s+)</td>");
Matcher tD = td.matcher(doc.html());
Pattern tdc = Pattern.compile("<td class=(//s+)>(//s+)</td>");
Matcher tDC = tdc.matcher(doc.html());
Pattern tb2 = Pattern.compile("</table>");
Matcher tB2 = tb2.matcher(doc.html());
Pattern th = Pattern.compile("<th");
Matcher tH = th.matcher(doc.html());
while (tB.find()) {
iTable++;
while(ttl2.find()) {
tableNames.add(ttl2.group(1));
}
while (tR.find()) {
while (tD.find()||tH.find()) {
u = tD.group(1);
Data_Log v = new Data_Log();
v.setTable(iTable);
v.dataSort(u);
v.setRow(irow);
v.setColumn(icolumn);
container.add(v);
icolumn++;
}
while(tDC.find()) {
u = tDC.group(2);
Data_Log v = new Data_Log();
v.setTable(iTable);
v.dataSort(u);
v.setRow(irow);
v.setColumn(icolumn);
container.add(v);
icolumn++;
}
irow++;
}
if (tB2.find()) {
irow=0;
icolumn=0;
}
}
Expected results:
table# logged + "td"s logged
Actual result:
table# logged "td"s
omitted