8

I'm trying to build JavaScript code that reads one string (say a sentence of English text), then outputs another string of (comma-separated) words that were "uncommon". Something like:

    var sentence="The dog ran to the other side of the field."; 

    var common_words="the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to, for, of"; 

--Some JavaScript code--

    var uncommon_words="dog, ran, other, side, field"; 

How can I do this?

Peter Mortensen
  • 30,738
  • 21
  • 105
  • 131
Test Tester
  • 1,409
  • 4
  • 14
  • 14

6 Answers6

11

Here you go:

/**
 * Returns the words of `sentence` that do not appear in the `common` list.
 *
 * Words are runs of `\w` characters, so multi-word entries in `common`
 * (e.g. "it is") can never match a single word and are effectively ignored.
 * Matching is case-insensitive.
 *
 * @param {string} sentence - Text to scan.
 * @param {string} common - Comma-separated list of words to exclude.
 * @returns {string[]} Lower-cased uncommon words in order of appearance
 *     (duplicates are kept); empty when the sentence contains no words.
 */
function getUncommon(sentence, common) {
    // match() returns null (not []) when nothing matches.
    var wordArr = sentence.match(/\w+/g) || [],
        // Prototype-less lookup, so words such as "constructor" are not
        // mistaken for list entries through Object.prototype.
        commonObj = Object.create(null),
        commonArr = common.split(','),
        uncommonArr = [],
        word, i;

    for ( i = 0; i < commonArr.length; i++ ) {
        // Lower-case the entries because the sentence words are lower-cased too.
        commonObj[ commonArr[i].trim().toLowerCase() ] = true;
    }

    for ( i = 0; i < wordArr.length; i++ ) {
        word = wordArr[i].toLowerCase();
        if ( !commonObj[word] ) {
            uncommonArr.push(word);
        }
    }

    return uncommonArr;
}

Live demo: http://jsfiddle.net/simevidas/knXkS/

Šime Vidas
  • 182,163
  • 62
  • 281
  • 385
6

The words you want to remove are called stop words, which are:

["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"]

here is the source: http://99webtools.com/list-english-stop-words.php

so your code should be

/**
 * Returns the words of `sentence` that are not English stop words.
 *
 * A word is a run of `\w` characters that may contain inner apostrophes, so
 * contractions such as "don't" stay in one piece and can match the
 * contraction entries of the stop-word list. Matching is case-insensitive.
 *
 * @param {string} sentence - Text to scan.
 * @returns {string[]} Lower-cased non-stop words in order of appearance
 *     (duplicates are kept); empty when the sentence contains no words.
 */
function getNoneStopWords(sentence) {
    var stopWords = getStopWords(),
        // Prototype-less lookup, so words such as "constructor" are not
        // mistaken for stop words through Object.prototype.
        stopLookup = Object.create(null),
        uncommonArr = [],
        wordArr, word, i;

    for (i = 0; i < stopWords.length; i++) {
        stopLookup[stopWords[i]] = true;
    }

    // Treat typographic apostrophes like the ASCII one used in the list.
    // match() returns null (not []) when nothing matches.
    wordArr = sentence.replace(/[\u2018\u2019]/g, "'").match(/\w+(?:'\w+)*/g) || [];

    for (i = 0; i < wordArr.length; i++) {
        word = wordArr[i].toLowerCase();
        if (!stopLookup[word]) {
            uncommonArr.push(word);
        }
    }
    return uncommonArr;
}

/**
 * Returns the list of English stop words (lower-case, including contractions).
 *
 * @returns {string[]} A new array on every call.
 */
function getStopWords() {
    return ["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"];
}
Fareed Alnamrouti
  • 30,771
  • 4
  • 85
  • 76
3

How about this?

// Strips the common words and phrases from `sentence` (whole words only,
// case-insensitive). The phrases "it is" and "we all" come before "it" and
// "we" so the longer alternative is tried first.
// replace() returns a new string and leaves `sentence` untouched, so capture
// the result in order to use it.
sentence.replace(/\b(?:the|it is|we all|an?|by|to|you|[mh]e|she|they|we|how|it|i|are|for|of)\b/ig, '');

This should remove all common words from your sentence. Just split the remaining string the way you want.

Mario
  • 35,726
  • 5
  • 62
  • 78
0

The String#diff function returns a list of differences (uncommon terms). The terms can be provided as an array or a string.

You call it like: sentence.diff(terms). Below is a unit test:

// Sample sentence and the comma-separated terms to exclude from it.
var sentence = 'The dog ran to the other side of the field.';
var terms = 'the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to, for, of';
// NOTE: The "terms" variable could also be an array.

// Compare the comma-joined result with the expected list and log the outcome.
if (sentence.diff(terms).toString() === 'dog,ran,other,side,field') {
  console.log('pass');
} else {
  console.log('fail');
}

Below is the 'String.diff' function definition:

/**
 * Returns the words of this string that do not appear in `terms`.
 *
 * Words are runs of `\w` characters and are returned with their original
 * casing; the comparison itself is case-insensitive and ignores whitespace
 * around each term.
 *
 * @param {string|Array} terms - Terms to exclude, either as an array or as a
 *     comma-separated string. Non-string array entries are compared by their
 *     string form.
 * @returns {string[]} The words not found in `terms`, in order of appearance.
 *     An empty array is returned when `terms` is missing, empty, or neither
 *     a string nor an array.
 */
String.prototype.diff = function(terms){
  if (!terms) {
    return [];
  }

  var list = (typeof terms === 'string') ? terms.split(',') : terms;

  if (!Array.isArray(list)) {
    return [];
  }

  // Prototype-less lookup: one pass over the terms instead of an indexOf()
  // scan per word, and no accidental hits on Object.prototype keys.
  var excluded = Object.create(null);
  list.forEach(function(term){
    excluded[String(term).trim().toLowerCase()] = true;
  });

  // String(this) covers both the primitive and the boxed receiver.
  return String(this).split(/\W+/).filter(function(word){
    // split() leaves empty strings at leading/trailing punctuation.
    return word.length > 0 && !excluded[word.toLowerCase()];
  });
};
Wil Moore III
  • 6,968
  • 3
  • 36
  • 49
0

Build an associative array of the common words first, then tokenize the sentence and output any words not contained in it. E.g.

// Builds a lookup of the common words, then collects every word of the
// sentence that is not in it (case-insensitive; original casing is kept).
// Expects `sentence` (string) and `common_words` (comma-separated string) to
// be defined already; `common_words` is left unmodified.

// Prototype-less lookup, so words such as "constructor" are not mistaken for
// common words through Object.prototype.
var excluded = Object.create(null);
var common_list = common_words.split(",");
// Index loops rather than for...in: for...in also visits enumerable
// properties that other code may have added to Array.prototype.
for (var i = 0; i < common_list.length; i++) {
    excluded[common_list[i].trim().toLowerCase()] = true;
}
var result = [];
// match() returns null (not []) when the sentence contains no words.
var match = sentence.match(/\w+/g) || [];
for (var j = 0; j < match.length; j++) {
    if (!excluded[match[j].toLowerCase()]) {
        result.push(match[j]);
    }
}
var uncommon_words = result.join(", ");
Max
  • 2,917
  • 1
  • 16
  • 16
  • this isn't going to catch 'it is' or 'we all' as you're splitting your sequence on ' '. Also, your excluded array doesn't need the boolean value and has to be converted to an Object as you are giving key-value pairs. Plus, aren't you calling split every single time you loop in both loops? – tomfumb Jul 13 '11 at 23:27
  • it doesn't look like a space is a part of a word. Correct about Object vs. Array. Split is NOT being called in a loop, it defines the loop. – Max Jul 13 '11 at 23:47
  • I was referring to 'common_words="the, it is, we all...' in the original question - although the variable name is 'words' this list includes space-separated phrases. – tomfumb Jul 13 '11 at 23:59
0

Here's a start, I reckon:

// Splits the sentence into words and the common-word list into entries, then
// keeps each sentence word (lower-cased) that equals none of the entries.
// Expects `sentence` and `common_words` (comma-separated string) to be
// defined already.
var sentence_arr = sentence.split(/\W+/);
var common_arr = common_words.trim().split(/\s*,\s*/);

var uncommon_arr = [];
for (var i = 0; i < sentence_arr.length; i++) {
    var word = sentence_arr[i].toLowerCase();
    if (word === '') {
        // split() yields empty strings at leading/trailing punctuation.
        continue;
    }

    // A word is uncommon only if it matches NONE of the common entries, so
    // decide after the inner loop rather than on each comparison.
    var is_common = false;
    for (var j = 0; j < common_arr.length; j++) {
        if (word === common_arr[j].toLowerCase()) {
            is_common = true;
            break;
        }
    }

    if (!is_common) {
        uncommon_arr.push(word);
    }
}

var uncommon_words = uncommon_arr.join(', ');

Completely untested, but the point is that you split both strings and check each word of the sentence against each member of the common-word list. It's rather naive and doesn't scale, but it would be fine for small examples such as this.

Simon Scarfe
  • 9,378
  • 4
  • 28
  • 32