3

I am trying to write a summary of the content of a web page. For that, I need to remove all the irrelevant text and data from the page so that only the main content is left.

I have used boilerpipe, but the text extraction is not good. The results are here, where you can see a lot of irrelevant text.

I also tried JSoup to strip away irrelevant data by removing headers, footers, external links, etc. But again, the results are not up to the mark.

    // Fetch and parse the page. Jsoup.connect() needs an absolute URL that
    // includes the scheme (http or https); a bare host name is rejected.
    Document doc = Jsoup.connect("http://www.anyurl.com").get();

    // Remove elements that, by tag name, are assumed not to carry article text.
    doc.head().remove();
    doc.getElementsByTag("header").remove();
    doc.getElementsByTag("footer").remove();
    doc.getElementsByTag("form").remove();
    doc.getElementsByTag("table").remove();
    doc.getElementsByTag("meta").remove();
    doc.getElementsByTag("img").remove();
    doc.getElementsByTag("a").remove();
    doc.getElementsByTag("br").remove();

    // Remove elements whose class is exactly one of these names.
    doc.getElementsByClass("tags").remove();
    doc.getElementsByClass("copyright").remove();
    doc.getElementsByClass("widget").remove();

    // Remove elements whose class attribute merely contains the given
    // substring. Each attribute selector must be closed with ']'.
    doc.select("div[class*=foot]").remove();
    doc.select("div[class*=tag]").remove();
    doc.select("div[class*=Loading]").remove();
    doc.select("div[class*=Widget]").remove();
    doc.select("div[class*=Head]").remove();
    doc.select("div[class*=menu]").remove();
    doc.select("p[class*=link]").remove();

    // Collect whatever text is left.
    Elements paragraphs = doc.select("p");
    Elements divs = doc.select("div");

    // NOTE: text() includes the text of descendants, so a <p> nested in a
    // <div> (or a <div> nested in another <div>) is emitted more than once.
    formattedOutput = paragraphs.text() + divs.text();

Can anyone suggest how to get this done? Is there any Java library other than boilerpipe that does this for you?

Pritam Banerjee
  • 17,953
  • 10
  • 93
  • 108
  • I looked at the link, and I don't see "a lot of irrelevant text". I don't think you will get anywhere simply by asking how to remove more irrelevant text, because what's relevant or not may be a matter of opinion. Instead, give us a specific idea of what you want done. Or maybe the solution is simply that you need to have a more specific idea. – ajb Oct 21 '16 at 05:21
  • 1
    **1)** Maybe before you continue you should have a look at their [Terms of use](http://www.medicalnewstoday.com/terms) `You may not place, or use any software to place, any complete articles from Medical News Today on your Site, unless you have contacted us first and have received written permission from us to place the entire article online - these full terms of use will apply.` **2)** Maybe using one of their [newsfeeds](http://www.medicalnewstoday.com/newsfeeds-rss) would already provide the information you are looking for. – SubOptimal Oct 21 '16 at 05:48

1 Answer

0

I don't know about Java, but you can use the following PHP class to extract the main content from a web page:

<?php

/**
 * Heuristic extractor for the "main content" of an HTML page.
 *
 * The document is processed in two passes:
 *   1. HeuristicRemove() deletes nodes that never carry content (scripts,
 *      forms, images, comments, ...) and optionally gathers page-wide
 *      statistics: number of links, amount of text outside links.
 *   2. ContainerRemove() deletes container elements (div, table, ul, ...)
 *      whose links-per-word ratio is too high or whose text is too short,
 *      compared with thresholds that are either supplied by the caller or
 *      derived from the statistics of pass 1.
 */
class ContentExtractor {

    /** @var string[] Elements that ContainerRemove() is allowed to delete. */
    public $container_tags = array(
            'div', 'table', 'td', 'th', 'tr', 'tbody', 'thead', 'tfoot', 'col',
            'colgroup', 'ul', 'ol', 'html', 'center', 'span'
        );

    /** @var string[] Node names that HeuristicRemove() deletes unconditionally. */
    public $removed_tags = array(
            'script', 'noscript', 'style', 'form', 'meta', 'input', 'iframe', 'embed', 'hr', 'img',
            '#comment', 'link', 'label'
        );

    /** @var string[] Containers exempt from the minimum length / word count test. */
    public $ignore_len_tags = array(
            'span'
        );

    /** @var float Maximum links-per-word ratio a container may have and still be kept. */
    public $link_text_ratio = 0.04;
    /** @var int|float Minimum text length (in bytes) a container needs to be kept. */
    public $min_text_len = 20;
    /** @var int Minimum number of words a container needs to be kept. */
    public $min_words = 0;

    /** @var int Number of <a> elements seen by the last statistics pass. */
    public $total_links = 0;
    /** @var int Number of words found outside links (set when the ratio is auto-computed). */
    public $total_unlinked_words = 0;
    /** @var string Concatenated text found outside links by the last statistics pass. */
    public $total_unlinked_text = '';
    /** @var int Number of elements that directly contain text outside links. */
    public $text_blocks = 0;

    /** @var DOMDocument|null Document parsed by the last call to extract(). */
    public $tree = null;
    /** @var array Not used by this class; kept so existing callers that touch it keep working. */
    public $unremoved = array();

    /**
     * Normalises text before it is measured: decodes HTML entities and turns
     * the various UTF-8 "space" characters into plain spaces.
     *
     * @param string $text UTF-8 text (DOM text content is always UTF-8).
     * @return string The normalised text with leading/trailing whitespace trimmed.
     */
    public function sanitize_text($text){
        $text = str_ireplace('&nbsp;', ' ', $text);
        // Decode explicitly as UTF-8 so the result does not depend on the PHP
        // version's default charset.
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // Only complete UTF-8 sequences are listed. A lone "\xA0" byte must not
        // be replaced: str_replace() works on bytes, so it would also hit the
        // trailing byte of characters such as "à" ("\xC3\xA0") and corrupt them.
        $utf_spaces = array("\xC2\xA0", "\xE1\x9A\x80", "\xE2\x80\x83",
            "\xE2\x80\x82", "\xE2\x80\x84", "\xE2\x80\xAF");
        $text = str_replace($utf_spaces, ' ', $text);

        return trim($text);
    }

    /**
     * Counts the non-empty pieces obtained by splitting $text on $pattern.
     *
     * @param string $text    Text to split.
     * @param string $pattern PCRE pattern matching the separators.
     * @return int Number of words; 0 if the split fails.
     */
    private function count_words($text, $pattern){
        // PREG_SPLIT_NO_EMPTY drops empty pieces but, unlike array_filter(),
        // keeps the word "0".
        $words = preg_split($pattern, $text, -1, PREG_SPLIT_NO_EMPTY);
        return ($words === false) ? 0 : count($words);
    }

    /**
     * Parses an HTML document, removes everything that does not look like
     * main content and returns the remaining HTML.
     *
     * @param string|false|null $text    The HTML source.
     * @param float|null        $ratio   Maximum links-per-word ratio for a container;
     *                                   null derives it from the page (1.3 times the
     *                                   page-wide ratio). 0 is a valid explicit value.
     * @param int|float|null    $min_len Minimum text length for a container; null uses
     *                                   the average length of the page's text blocks.
     *                                   0 is a valid explicit value.
     * @return string|false The reduced document as HTML, or false when the input
     *                      is empty or cannot be parsed.
     */
    public function extract($text, $ratio = null, $min_len = null){
        // Empty input cannot be parsed; PHP 8 even throws a ValueError for it,
        // which the old "@" suppression did not catch.
        if ($text === null || $text === false || $text === '') return false;

        // Statistics belong to one document only; start from scratch so that
        // a reused instance does not mix in numbers from a previous page.
        $this->total_links = 0;
        $this->total_unlinked_words = 0;
        $this->total_unlinked_text = '';
        $this->text_blocks = 0;

        $this->tree = new DOMDocument();

        // Real-world HTML is rarely valid: collect parser warnings internally
        // instead of emitting them, then restore the caller's setting.
        $previous = libxml_use_internal_errors(true);
        $loaded = $this->tree->loadHTML($text);
        if (!$previous) libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if (!$loaded) return false;

        $root = $this->tree->documentElement;
        if ($root === null) return false;

        $auto_ratio = ($ratio === null);
        $auto_len = ($min_len === null);

        // Pass 1: drop junk nodes; gather statistics only if a threshold needs them.
        $this->HeuristicRemove($root, $auto_ratio || $auto_len);

        if ($auto_ratio) {
            $this->total_unlinked_text = $this->sanitize_text($this->total_unlinked_text);
            $this->total_unlinked_words = $this->count_words(
                $this->total_unlinked_text,
                '/[\s\r\n\t\|?!.,]+/'
            );
            if ($this->total_unlinked_words > 0) {
                // Allow containers to be somewhat more link-heavy than the page average.
                $this->link_text_ratio = $this->total_links / $this->total_unlinked_words;
                $this->link_text_ratio *= 1.3;
            }
        } else {
            $this->link_text_ratio = $ratio;
        }

        if ($auto_len) {
            // A page without any unlinked text has no text blocks: avoid the
            // division by zero and impose no minimum length.
            $this->min_text_len = ($this->text_blocks > 0)
                ? strlen($this->total_unlinked_text) / $this->text_blocks
                : 0;
        } else {
            $this->min_text_len = $min_len;
        }

        // Pass 2: drop containers that fail the thresholds.
        $this->ContainerRemove($root);

        return $this->tree->saveHTML();
    }

    /**
     * Recursively removes descendants whose node name is in $removed_tags and,
     * if requested, accumulates link and text statistics on the instance.
     *
     * @param DOMNode $node     Node to process.
     * @param bool    $do_stats Whether to update total_links, total_unlinked_text
     *                          and text_blocks.
     * @return bool True if $node itself must be removed by the caller.
     */
    public function HeuristicRemove($node, $do_stats = false){
        if (in_array($node->nodeName, $this->removed_tags, true)){
            return true;
        }

        $is_link = ($node->nodeName === 'a');
        if ($do_stats && $is_link) {
            $this->total_links++;
        }

        // Becomes true once this node has been counted as a text block.
        $found_text = false;
        $nodes_to_remove = array();

        if ($node->hasChildNodes()){
            foreach ($node->childNodes as $child){
                if ($this->HeuristicRemove($child, $do_stats)) {
                    // Removing while iterating would disturb the live node list.
                    $nodes_to_remove[] = $child;
                } else if ($do_stats && !$is_link && ($child->nodeName === '#text')) {
                    // nodeValue is this node's own text; wholeText would also
                    // include adjacent text siblings and count them repeatedly.
                    $this->total_unlinked_text .= $child->nodeValue;
                    if (!$found_text){
                        $this->text_blocks++;
                        $found_text = true;
                    }
                }
            }
            foreach ($nodes_to_remove as $child){
                $node->removeChild($child);
            }
        }

        return false;
    }

    /**
     * Recursively measures a subtree and removes child containers that are
     * too link-heavy or too short.
     *
     * @param DOMNode|null $node Node to process.
     * @return array [0] => link count, [1] => word count, [2] => text length,
     *               'delete' => bool, true if the caller should remove $node.
     */
    public function ContainerRemove($node){
        // Same shape as the normal result, so callers can always index it.
        if (is_null($node)) return array(0, 0, 0, 'delete' => false);

        $link_cnt = 0;
        $word_cnt = 0;
        $text_len = 0;
        $delete = false;
        $my_text = '';

        // A container without any words counts as "all links".
        $ratio = 1;

        $nodes_to_remove = array();
        if ($node->hasChildNodes()){
            foreach ($node->childNodes as $child){
                $data = $this->ContainerRemove($child);

                if ($data['delete']) {
                    $nodes_to_remove[] = $child;
                } else {
                    // Text of deleted children does not count towards our length.
                    $text_len += $data[2];
                }

                $link_cnt += $data[0];

                if ($child->nodeName === 'a') {
                    // Words inside a link are not counted as content words.
                    $link_cnt++;
                } else {
                    // nodeValue rather than wholeText: once pass 1 has removed
                    // the nodes between them, text siblings are adjacent and
                    // wholeText would return their combined text for each one.
                    if ($child->nodeName === '#text') $my_text .= $child->nodeValue;
                    $word_cnt += $data[1];
                }
            }

            foreach ($nodes_to_remove as $child){
                $node->removeChild($child);
            }

            $my_text = $this->sanitize_text($my_text);

            $word_cnt += $this->count_words($my_text, '/[\s\r\n\t\|?!.,\[\]]+/');
            $text_len += strlen($my_text);
        }

        if (in_array($node->nodeName, $this->container_tags, true)){
            if ($word_cnt > 0) $ratio = $link_cnt / $word_cnt;

            if ($ratio > $this->link_text_ratio){
                $delete = true;
            }

            if (!in_array($node->nodeName, $this->ignore_len_tags, true)) {
                if (($text_len < $this->min_text_len) || ($word_cnt < $this->min_words)) {
                    $delete = true;
                }
            }
        }

        return array($link_cnt, $word_cnt, $text_len, 'delete' => $delete);
    }

}

/****************************
    Simple usage example
*****************************/

// Download the page; file_get_contents() returns false when the request fails.
$html = file_get_contents('http://en.wikipedia.org/wiki/Shannon_index');
if ($html === false) {
    exit("Could not download the page.\n");
}

// Thresholds are left at null, so they are derived from the page itself.
$extractor = new ContentExtractor();
$content = $extractor->extract($html);
if ($content === false) {
    exit("Could not parse the page.\n");
}
echo $content;

?>
Nishank Mahore
  • 504
  • 4
  • 12