1

A quick overview of what I'm doing: I am using an Android WebView to render a page's JavaScript, and then reading the resulting HTML back out of the WebView so that I can parse it.

I am currently having trouble with retrieving the HTML from a website called Sport Chek.

Here is the code for my SportChekSearch class:

/**
 * Runs a product search against Sport Chek and publishes the results.
 *
 * <p>Constructing an instance does two things: it creates an invisible {@code WebView} that
 * loads the search page, and it starts a {@link fetcher} task that downloads the same URL with
 * Jsoup, converts the {@code section.product-grid-wrapper} elements into {@code Item} objects,
 * adds them to {@code ClothingSearch.adapter} and forwards them to {@code SearchQueueHandler}.
 *
 * <p>NOTE(review): the items are built from the document returned by {@code Jsoup.connect},
 * i.e. the markup as delivered by the server; Jsoup does not execute JavaScript. The HTML that
 * the WebView hands to {@link JSHtmlInterface#showHTML(String)} is parsed but never used. The
 * search terms are also carried in the URL fragment (after {@code #}), which HTTP clients do
 * not send to the server. If the product grid is filled in by script, it will be empty here;
 * the rendered HTML from {@code showHTML} would have to be fed into {@link #crunchResults}
 * instead (possibly after a delay so the page script can finish).
 */
public class SportChekSearch extends SearchQuery {

    /** Scheme and host prepended to the relative product links found in the page. */
    private static final String BASE_URL = "https://www.sportchek.ca";

    /** Desktop Chrome user agent sent by both the WebView and the Jsoup request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36";

    /** Product-grid elements selected by the most recent successful fetch; null until then. */
    public Elements finalDoc;

    /** Items produced from the most recent fetch result. */
    private ArrayList<Item> processed;

    /** Handler created on the constructing thread; used to hop off the JavaScript bridge thread. */
    private final Handler uiHandler = new Handler();

    /** Exposed through {@link #getStatus()}; never modified inside this class. */
    public int status = 0;

    /** Context this search was created with. */
    private Context c;

    /**
     * Object exposed to page JavaScript under the name {@code JSBridge}.
     */
    protected class JSHtmlInterface {

        /**
         * Receives the page's HTML from JavaScript and parses it on the handler's thread.
         *
         * <p>NOTE(review): the parsed document is discarded; nothing reads it yet.
         *
         * @param html the serialized HTML of the page
         */
        @android.webkit.JavascriptInterface
        public void showHTML(String html) {
            final String htmlContent = html;

            uiHandler.post(
                    new Runnable() {
                        @Override
                        public void run() {
                            Document doc = Jsoup.parse(htmlContent);
                        }
                    }
            );
        }
    }

    /**
     * Builds the search URL for {@code query}, loads it in an invisible WebView and starts the
     * background fetch. Any exception raised while doing so is printed and otherwise ignored.
     *
     * @param context the context used to create the WebView and the progress dialog
     * @param query   the search terms; spaces are replaced with {@code +}
     */
    public SportChekSearch(Context context, String query) {

        // Assign the field directly; a same-named local here would shadow it and leave it null.
        this.c = context;

        try {
            final WebView browser = new WebView(context);
            browser.setVisibility(View.INVISIBLE);
            browser.setLayerType(View.LAYER_TYPE_NONE, null);

            final WebSettings settings = browser.getSettings();
            settings.setJavaScriptEnabled(true);
            settings.setBlockNetworkImage(true);
            settings.setDomStorageEnabled(true);
            settings.setCacheMode(WebSettings.LOAD_NO_CACHE);
            settings.setLoadsImagesAutomatically(false);
            settings.setGeolocationEnabled(false);
            settings.setSupportZoom(false);
            settings.setUserAgentString(USER_AGENT);

            browser.addJavascriptInterface(new JSHtmlInterface(), "JSBridge");

            browser.setWebViewClient(
                    new WebViewClient() {
                        @Override
                        public void onPageFinished(WebView view, String url) {
                            // Hand the current DOM, serialized as HTML, to JSHtmlInterface.showHTML.
                            browser.loadUrl("javascript:window.JSBridge.showHTML('<html>'+document.getElementsByTagName('html')[0].innerHTML+'</html>');");
                        }
                    }
            );

            // Build the URL once and reuse it: WebView.getUrl() may return null, so it is not a
            // reliable way to read back the address that was just requested.
            final String searchUrl = BASE_URL + "/search.html#q=" + query.replace(" ", "+") + "&lastVisibleProductNumber=3";
            browser.loadUrl(searchUrl);
            new fetcher(context).execute(searchUrl);
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Background task that downloads a page with Jsoup and publishes the items found in it.
     * A non-cancelable progress dialog is shown while it runs.
     * Usage: {@code new fetcher(context).execute(link)}.
     */
    class fetcher extends AsyncTask<String, Void, Elements> {

        Context mContext;
        ProgressDialog pdialog;

        public fetcher(Context context) {
            mContext = context;
        }

        /** Shows the progress dialog before the download starts. */
        @Override
        protected void onPreExecute() {
            super.onPreExecute();
            pdialog = new ProgressDialog(mContext);
            pdialog.setTitle(R.string.finding_results);
            pdialog.setCancelable(false);
            pdialog.show();
        }

        /**
         * Downloads the first link passed in and selects its product-grid sections.
         *
         * @param strings the links to fetch; only the first one is used
         * @return the selected elements, or the previous value of {@code finalDoc} (null if
         *         there is none) when the download fails with an {@code IOException}
         */
        @Override
        protected Elements doInBackground(String... strings) {

            String link = strings[0];

            //For Debug Purposes, Do NOT Remove - **Important**
            System.out.println("Connecting to: " + link);

            try {
                // NOTE(review): `doc` is not declared in this class; presumably a field
                // inherited from SearchQuery. Confirm.
                doc = Jsoup.connect(link)
                        .ignoreContentType(true)
                        .userAgent(USER_AGENT)
                        .timeout(10000)
                        .get();

                finalDoc = doc.select("body section.product-grid-wrapper");

                System.out.println(finalDoc.toString());

            } catch (IOException e) {
                e.printStackTrace();
            }

            return finalDoc;
        }

        /**
         * Converts the fetched elements into items, adds them to the search list, dismisses
         * the progress dialog and forwards the items to {@code SearchQueueHandler}.
         *
         * @param result the elements returned by {@link #doInBackground}; may be null
         */
        @Override
        protected void onPostExecute(Elements result) {

            processed = crunchResults(result);

            //For debug purposes, do NOT remove - **Important**
            System.out.println(processed.size() + " results have been crunched by Sport Chek.");

            //Adds all of the processed results to the list of info in Search activity
            ClothingSearch.adapter.addAll(processed);

            //For debug purposes, do NOt remove - **Important
            System.out.println("Adapter has been notified by Sport Chek.");

            pdialog.dismiss();

            ClothingSearch.adapter.notifyDataSetChanged();
            SearchQueueHandler.makeRequest(mContext, processed, SearchQueueHandler.CLOTHING_SEARCH);
        }
    }

    /**
     * Converts each element of {@code e} into an {@code Item}.
     *
     * <p>An element whose data cannot be converted (for example, a price that is not a number)
     * is reported with a stack trace and skipped; the remaining elements are still processed.
     *
     * <p>NOTE(review): callers in this class pass the {@code section.product-grid-wrapper}
     * elements, so each wrapper section, not each product inside it, becomes one item. Confirm
     * the per-product selector and iterate over that instead.
     *
     * @param e the elements to convert; null is treated as empty
     * @return the converted items, never null
     */
    public ArrayList<Item> crunchResults(Elements e) {

        ArrayList<Item> results = new ArrayList<Item>();

        if (e == null) {
            return results;
        }

        for (Element ele : e) {
            try {
                String link = BASE_URL + ele.select(" a.product-grid__link").attr("href");
                System.out.println(link);

                String title = ele.select(" span.product-title-text").text();

                String pricestring = ele.select(" span.product-price__wrap").text();
                System.out.println(pricestring);

                // NOTE(review): `price` is not declared in this class; presumably a field
                // inherited from SearchQuery. Confirm.
                price = parsePrice(pricestring);

                String store = "Sport Chek";

                Item item = new Item(title, store, price, link);
                results.add(item);

                //Prints the object's toString to console
                //For debug purposes, do NOT remove - **Important
                System.out.println(item.toString());
            } catch (Exception a) {
                a.printStackTrace();
            }
        }

        return results;
    }

    /**
     * Extracts the amount that follows the last {@code $} in {@code priceText}.
     *
     * <p>The {@code $} itself must be excluded, otherwise {@code Double.parseDouble} rejects
     * the text. When there is no {@code $}, {@code lastIndexOf} returns -1 and the whole text
     * is parsed. Thousands separators ({@code ,}) are removed before parsing.
     *
     * @param priceText text such as {@code "$59.99"} or {@code "$79.99 $59.99"}
     * @return the parsed amount
     * @throws NumberFormatException if the text after the last {@code $} is not a number
     */
    private static double parsePrice(String priceText) {
        String amount = priceText.substring(priceText.lastIndexOf('$') + 1);
        return Double.parseDouble(amount.replace(",", "").trim());
    }

    /**
     * @return the current value of {@link #status}
     */
    public int getStatus() {
        return status;
    }

}

The two relevant methods are doInBackground in my AsyncTask and the crunchResults method.

Here is the result I get from using Ctrl+Shift+I on the actual website (Desired Result):

Desired Result

But when I run the above code and print the result, this is what I get for the `<section class="product-grid-wrapper">` element:

<section class="product-grid-wrapper"> 
<ul data-module-type="SearchProductGrid" class="product-grid__list product-grid__list_quickview"> 
<!-- #product-grid__item-template --> 
</ul>
</section>

Can anyone help me figure out why I am not getting my desired result?

All help is appreciated

EDIT: for this specific search that the println data was collected from, the link was https://www.sportchek.ca/search.html#q=men+coat&lastVisibleProductNumber=3

Community
  • 1
  • 1
Yamaha T64312
  • 103
  • 10

1 Answer

0

It looks like what you are actually getting is the actual html sent by the server, and that your 'desired result' is what the DOM looks like after the JavaScript runs.

Your 'actual' is what I see if I use "View Source" in Chrome, while your "desired result" is what I see if I use Chrome's DOM inspector.

On further inspection, I see that you are not actually getting the HTML from the browser, you are (indirectly) using JSoup's Connection object to get the HTML directly. Unfortunately, that's not going to run the Javascript.

Instead, you're going to have to get the HTML from the WebView after the JavaScript runs. For a possible way to do that, see How do I get the web page contents from a WebView?

Then, you give the HTML that you get from that to JSoup with

Jsoup.parse(html);
Community
  • 1
  • 1
GreyBeardedGeek
  • 29,460
  • 2
  • 47
  • 67
  • If you scroll up to the above code, you'll see that all of that is already in there unfortunately – Yamaha T64312 Jan 08 '17 at 18:41
  • hmmm...ok, so maybe you need to introduce a delay to let the javascript finish executing? – GreyBeardedGeek Jan 08 '17 at 18:43
  • I was thinking of using Selenium in conjunction with HtmlUnit Driver but I can't seem to get it working. Would this be a good solution? And if so, do you have any resources that I could use to get started? – Yamaha T64312 Jan 08 '17 at 18:43
  • I was using an AsyncTask. It uses a method called onPostExecute() which I think should take care of the delay thing – Yamaha T64312 Jan 08 '17 at 18:44
  • I think you need to add the delay in the JavaScript added in the onPageFinished method of the WebViewClient - otherwise, you're delaying after you've gotten the HTML - you want to delay before getting the HTML – GreyBeardedGeek Jan 08 '17 at 18:46
  • So, after I put in loadUrl(blah) i would put for example thread.sleep(1000)? – Yamaha T64312 Jan 08 '17 at 18:51
  • no, you need to do it in the JavaScript, so instead of `browser.loadUrl("javascript:window.JSBridge.showHTML('<html>'+document.getElementsByTagName('html')[0].innerHTML+'</html>');");` you'd have to figure out how to introduce a setTimeout() JavaScript call that would do the JSBridge.showHTML call when the timer completes – GreyBeardedGeek Jan 08 '17 at 19:05
  • Let us [continue this discussion in chat](http://chat.stackoverflow.com/rooms/132645/discussion-between-yamaha-t64312-and-greybeardedgeek). – Yamaha T64312 Jan 08 '17 at 19:20