3

I am looking for a way to identify all the words on a page and count how many times each word occurs on that page. I need to do this in plain JavaScript, without jQuery.

UPDATE

This is what I have so far. Although it seems to be working, I am still getting some cases where two or more words have been merged together. Any clues?

// Run myFunc once the page has finished loading.
// Prefer the standard addEventListener: it coexists with any other load
// handlers and cannot be wiped out by a later `window.onload = ...` assignment.
if (window.addEventListener) {
    window.addEventListener("load", myFunc, false);
} else if (window.attachEvent) {
    // Legacy Internet Explorer event model.
    window.attachEvent("onload", myFunc);
} else if (typeof window.onload === "function") {
    // Last resort: chain onto the handler that is already installed,
    // forwarding its original `this` and event argument.
    var curronload = window.onload;
    var newonload = function() {
        curronload.apply(this, arguments);
        myFunc();
    };
    window.onload = newonload;
} else {
    window.onload = myFunc;
}

/**
 * Counts how often each word occurs in the page's rendered text and logs
 * one line per distinct word in the form "<index> <word> <count>".
 *
 * Words are compared case-insensitively and are listed in the order in which
 * they first appear. Takes no arguments and returns nothing; the result is
 * written to the console.
 */
function myFunc() {
    var text = document.body.innerText || "";

    // Split on ANY run of whitespace (spaces, tabs, line breaks, non-breaking
    // spaces). Splitting on a single " " left tab- or nbsp-separated words
    // glued together.
    var tokens = text.toLowerCase().split(/\s+/);

    var foundWords = [];
    var counts = [];
    // word -> position in foundWords/counts. A prototype-less object keeps
    // words such as "constructor" from colliding with inherited properties.
    var indexByWord = Object.create(null);

    tokens.forEach(function(word) {
        // Leading/trailing whitespace produces empty tokens; skip them.
        if (word === "") {
            return;
        }
        var index = indexByWord[word];
        if (index === undefined) {
            indexByWord[word] = foundWords.length;
            foundWords.push(word);
            counts.push(1);
        } else {
            counts[index] += 1;
        }
    });

    // Cycle through all found words and log the index, word & count
    foundWords.forEach(function(word, index) {
        console.log(index + " " + word + " " + counts[index]);
    });
}
user1448020
  • 147
  • 1
  • 10
  • Nothing so far. JavaScript isn't my strongest point; I'm not sure how I would go about doing this in JavaScript in the first place – user1448020 Dec 31 '12 at 19:03
  • 1
    1) select all text nodes 2) split the text into words 3) count the occurrences of each word 4) print the results; which part do you need help with, after reading a decent JavaScript tutorial? – John Dvorak Dec 31 '12 at 19:05
  • How about 3a) Sort the list of words 3b) scan the list to find adjacent items in the list that are the same 3c) store the count somewhere with the word – Lee Meador Dec 31 '12 at 19:19

4 Answers4

9

I must admit that I agree with the (somewhat sarcastic) commenters urging some research into basic JavaScript first. I thought it'd be fun to take a crack at this, though, so here's the first thing I came up with.

It outputs the list and frequency of words to the console.

One would, of course, want to filter the results to make them a bit nicer, but that's another question.

http://jsfiddle.net/E7qSb/

// Running tally: one [word, count] pair per distinct word encountered.
var words = [];

/**
 * Visits `node` and then every descendant of it, parents before children and
 * siblings in document order, calling `func` once per visited node.
 *
 * @param {Node} node - root of the subtree to traverse
 * @param {function(Node)} func - callback invoked with each node
 */
var walkDOM = function (node, func) {
    func(node);

    // Recurse into each child in turn; the next sibling is looked up only
    // after the current child's subtree has been fully walked.
    for (var child = node.firstChild; child; child = child.nextSibling) {
        walkDOM(child, func);
    }
};

// Tally every alphabetic word found in the text nodes under <body>.
// Results accumulate in `words` as [word, count] pairs; matching is
// case-sensitive ("The" and "the" are counted separately).
walkDOM(document.body, function (node) {

    // Only text nodes carry words; skip elements, comments, etc.
    if (node.nodeName !== '#text') {
        return;
    }

    // Split on runs of non-letters. Digits, punctuation and whitespace all
    // act as separators.
    var tokens = node.textContent.split(/[^A-Za-z]+/);

    for (var i = 0, length = tokens.length; i < length; i += 1) {
        var word = tokens[i];

        // A separator at the start or end of the text yields an empty
        // string; it is not a word and must not be counted.
        if (word === '') {
            continue;
        }

        var matched = false;

        for (var j = 0, numberOfWords = words.length; j < numberOfWords; j += 1) {
            if (words[j][0] === word) {
                matched = true;
                words[j][1] += 1;
                // Each word is stored once, so stop at the first hit.
                break;
            }
        }

        if (!matched) {
            words.push([word, 1]);
        }
    }
});

/**
 * Logs each entry of a word tally to the console, one call per entry,
 * passing the word and its count as two separate arguments.
 *
 * @param {Array} words - list of [word, count] pairs
 */
var displayWordList = function (words) {
    var total = words.length;
    var position = 0;

    while (position < total) {
        var entry = words[position];
        console.log(entry[0], entry[1]);
        position += 1;
    }
};

// Print every collected word together with its count to the console.
displayWordList(words);
​

This uses Douglas Crockford's walkDOM example from JavaScript: The Good Parts. But I see from other folks that there's an innerText property of document.body?! That's, umm, easier.

I'm leaving this answer up because the method of keeping word counts might prove useful to the asker.

Nate
  • 4,718
  • 2
  • 25
  • 26
  • how can i find the most frequent word from this list? – milpool Jan 12 '20 at 02:15
  • 1
    Just sort the results by the number of occurrences: `words = words.sort((a, b) => b[1] - a[1]);` (https://jsfiddle.net/nate/un5xLg4d/4/) – Nate Jan 13 '20 at 14:32
2

Use regular expressions like this.

// Grab the page text, preferring textContent and falling back to innerText
// when textContent is empty or unavailable.
var words = document.body.textContent || document.body.innerText;

// Every case-insensitive occurrence of "word", or null when there is none.
var matches = words.match(/word/gmi);

console.log(matches);
jeremy
  • 9,965
  • 4
  • 39
  • 59
  • This works, kind of. I get loads of javascript and some div elements left behind though. To test i have run this on this page: javascript:console.log(document.body.textContent) – user1448020 Dec 31 '12 at 19:18
  • i get a better result using innerText . Does document.body.innerText work across all browsers (including IE) – user1448020 Dec 31 '12 at 19:20
  • @user1448020 Not sure, but you can search for compatibility on Google. – jeremy Dec 31 '12 at 19:20
  • Once i have my plaintext string, how do i extract all of the words from it? im not looking for a particular word or words, im trying to work out the most commonly used words on the page, rather then specific words themselves. Once i have all of the words in an array i should be fine, i can run through the array and count the instances of each word. – user1448020 Dec 31 '12 at 19:26
  • use `split(" ")` and count occurrences of each word – jeremy Dec 31 '12 at 19:27
  • I keep getting the following error: Uncaught TypeError: Cannot read property 'textContent' of null Using the following code: var words = document.body.textContent; words = words.split(" "); alert(words); i have also tried using document.body.innerText and get the same error. The domain the script is hosted on is different to the domain the script is included on, don't know if it is failing because of security restrictions? Although i don't see how this is the case because surely that means things like jQuery would not work when hosted on Google CDN etc... – user1448020 Dec 31 '12 at 19:37
  • Sorry Guys (and Gals)! I'm being stupid, i included the script in the head element, of course by this point the page has not finished loading. i used the code from the following page: http://stackoverflow.com/a/807997 and it seems to be working now – user1448020 Dec 31 '12 at 19:51
1

You can do it like this.

// Count how many times `findWord` occurs in the page's rendered text.
// split() cuts the text at every occurrence, so N occurrences produce
// N + 1 pieces; hence the "- 1".
// NOTE: this is an exact, case-sensitive substring match, so occurrences
// inside longer words (e.g. "Whatever") are counted as well.
var findWord="What";
var totalCount = document.body.innerText.split(findWord).length - 1;
sohail.hussain.dyn
  • 1,411
  • 1
  • 16
  • 26
0

You can polish this solution:

<!DOCTYPE html>
<html>
<body>

<p id="demo">Click the button to display the matches.</p>

<button onclick="myFunction()">Try it</button>

<script>
/**
 * Splits the sample sentence into whitespace-separated words, shows the word
 * list in the #demo element, then appends one "<word> = <count>" line per
 * distinct word (case-sensitive, in order of first appearance).
 */
function myFunction()
{
    var str="The rain in SPAIN stays mainly in the plain rain";
    // match() returns null when there is nothing to match; treat that as "no words".
    var tokens = str.match(/\S+/g) || [];

    // Count exact tokens. Searching the sentence with a RegExp built from
    // each word would also hit substrings ("in" inside "rain") and would
    // misbehave on words containing regex metacharacters.
    var counts = Object.create(null);
    var order = [];
    for (var i = 0; i < tokens.length; i++) {
        var token = tokens[i];
        if (counts[token] === undefined) {
            counts[token] = 0;
            order.push(token);
        }
        counts[token] += 1;
    }

    // Build the markup once and assign it in a single step instead of
    // re-parsing the element's HTML on every loop iteration.
    var html = tokens.join(",");
    for (var k = 0; k < order.length; k++) {
        html += '<br>' + order[k] + ' = ' + counts[order[k]];
    }

    document.getElementById("demo").innerHTML = html;
}
</script>

</body>
</html>
Mehdi Karamosly
  • 5,388
  • 2
  • 32
  • 50