I know there are some SO questions about this topic, but all the solutions presented seem to take a different approach from an example I've seen worked out in JavaScript.
Here's the JavaScript example. It counts the paragraphs, sentences, words and characters in a text string, and it checks for high / low surrogates specifically when counting the characters:
javascript version
// Example invocation. NOTE(review): `text` is not defined in this snippet; it
// is presumably the string to analyse - confirm against the calling code.
count(text);
/**
 * Counts the paragraphs, sentences, words and characters in a text.
 *
 * Zero-width spaces (U+200B) are removed and the text is trimmed before the
 * paragraph, sentence, word and `characters` counts are taken; `all` is
 * computed from the untouched input. Character counts are taken from the
 * length of the array returned by `_decode`.
 *
 * @param {String} original The text to analyse.
 *
 * @return {Object} An object with the numeric properties `paragraphs`,
 *   `sentences`, `words`, `characters` (whitespace excluded) and `all`.
 */
function count(original) {
  // The `g` flag matters: without it only the first run of zero-width spaces
  // is removed, and later ones are counted as words and characters.
  var trimmed = original.replace(/\u200B+/g, '').trim();

  return {
    // Each run of line breaks separates two paragraphs.
    paragraphs: trimmed ? (trimmed.match(/\n+/g) || []).length + 1 : 0,
    // A terminator run only counts when something follows it, hence the "+ 1"
    // for the final sentence.
    sentences: trimmed ? (trimmed.match(/[.?!…\n]+./g) || []).length + 1 : 0,
    // Punctuation is stripped first so that a stand-alone "-" or "!" is not
    // counted as a word.
    words: trimmed ? (trimmed.replace(/['";:,.?¿\-!¡]+/g, '').match(/\S+/g) || []).length : 0,
    characters: trimmed ? _decode(trimmed.replace(/\s/g, '')).length : 0,
    all: _decode(original).length
  };
}
/**
 * Converts a string into an array of Unicode code points.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one supplementary code point. Any other code unit, including an unmatched
 * surrogate, is emitted unchanged, exactly once.
 *
 * @param {String} string The input string.
 *
 * @return {Array} The code points, in order of appearance.
 */
function _decode(string) {
  var output = [];
  var length = string.length;
  var counter = 0;
  var value;
  var extra;

  while (counter < length) {
    value = string.charCodeAt(counter++);

    if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
      // High surrogate with a following code unit: peek at it without
      // consuming it yet.
      extra = string.charCodeAt(counter);

      if ((extra & 0xFC00) === 0xDC00) {
        // Low surrogate: consume it and merge the pair into one code point.
        counter++;
        value = ((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000;
      }
      // Otherwise the high surrogate is unmatched. Only it is appended; the
      // following code unit is left for the next iteration (it may itself
      // start a valid pair), so it is never emitted twice.
    }

    output.push(value);
  }

  return output;
}
Demo below and in jsfiddle
// Demo: run the counter over a two-paragraph sample and render the results.
var text = 'This is a paragraph. This is the 2nd sentence in the 1st paragraph.\nThis is another paragraph.';
var count = doCount(text);
// Assemble the markup first and assign it once; appending to innerHTML inside
// a loop makes the browser re-parse the whole body on every iteration.
var html = '<pre>' + text + '</pre><hr>';
// The loop variable is declared explicitly: an undeclared one becomes an
// implicit global and throws a ReferenceError in strict mode / ES modules.
for (var key in count) {
  html += '<p>' + key + ': ' + count[key] + '</p>';
}
document.body.innerHTML = html;
/* COUNTING LIBRARY */
/**
* Extracted from https://github.com/RadLikeWhoa/Countable/, which in
* turn uses `ucs2decode` function from the punycode.js library.
*/
/**
 * Counts the paragraphs, sentences, words and characters in a text.
 *
 * Zero-width spaces (U+200B) are removed and the text is trimmed before the
 * paragraph, sentence, word and `characters` counts are taken; `all` is
 * computed from the untouched input. Character counts are taken from the
 * length of the array returned by `_decode`.
 *
 * @param {String} original The text to analyse.
 *
 * @return {Object} An object with the numeric properties `paragraphs`,
 *   `sentences`, `words`, `characters` (whitespace excluded) and `all`.
 */
function doCount(original) {
  // The `g` flag matters: without it only the first run of zero-width spaces
  // is removed, and later ones are counted as words and characters.
  var trimmed = original.replace(/\u200B+/g, '').trim();

  return {
    // Each run of line breaks separates two paragraphs.
    paragraphs: trimmed ? (trimmed.match(/\n+/g) || []).length + 1 : 0,
    // A terminator run only counts when something follows it, hence the "+ 1"
    // for the final sentence.
    sentences: trimmed ? (trimmed.match(/[.?!…\n]+./g) || []).length + 1 : 0,
    // Punctuation is stripped first so that a stand-alone "-" or "!" is not
    // counted as a word.
    words: trimmed ? (trimmed.replace(/['";:,.?¿\-!¡]+/g, '').match(/\S+/g) || []).length : 0,
    characters: trimmed ? _decode(trimmed.replace(/\s/g, '')).length : 0,
    all: _decode(original).length
  };
}
/**
 * `ucs2decode` function from the punycode.js library.
 *
 * Creates an array containing the decimal code points of each Unicode
 * character in the string. JavaScript strings expose UTF-16 code units, so
 * a character outside the Basic Multilingual Plane appears as a pair of
 * surrogate halves; this function converts each such pair into a single
 * code point. Any other code unit, including an unmatched surrogate, is
 * emitted unchanged, exactly once.
 *
 * @see <http://goo.gl/8M09r>
 * @see <http://goo.gl/u4UUC>
 *
 * @param {String} string The Unicode input string (UCS-2).
 *
 * @return {Array} The new array of code points.
 */
function _decode(string) {
  var output = [];
  var length = string.length;
  var counter = 0;
  var value;
  var extra;

  while (counter < length) {
    value = string.charCodeAt(counter++);

    if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
      // High surrogate with a following code unit: peek at it without
      // consuming it yet.
      extra = string.charCodeAt(counter);

      if ((extra & 0xFC00) === 0xDC00) {
        // Low surrogate: consume it and merge the pair into one code point.
        counter++;
        value = ((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000;
      }
      // Otherwise the high surrogate is unmatched. Only it is appended; the
      // following code unit is left for the next iteration (it may itself
      // start a valid pair), so it is never emitted twice.
    }

    output.push(value);
  }

  return output;
}
I'm not well-versed in character encoding schemes and things like high / low surrogates. Is this kind of surrogate handling not needed when counting characters in Java?
I'm happy with the results of the JavaScript implementation and would like to do the counting on my Java back-end, but I'm not sure whether the same approach is even needed there, or how it should be done.