I found a very nice code for CKeditor to count and rank most used words in a textarea, this is very useful for SEO-keywords suggestions when writting articles. The problem is that when you write any non english characters like öäåÖÄÅ they are just not passed by some kind of filter, the same occurs with Diacritics like léjonet with é or ñ.
Here is the code and a working Jsfiddle demo
The HTML is:
<!-- Textarea -->
<div class="form-group">
<label class="col-md-2 control-label" for="editor1">HTML </label>
<div class="col-md-10">
<textarea class="form-control" id="editor1" name="editor1"><p>text example with ahöäåra</p></textarea>
</div>
</div>
<!-- KW density result -->
<div class="form-group">
<label class="col-md-2 control-label" for="editor1">Words Repeat</label>
<div class="col-md-10">
<div id="KWdensity" ></div>
</div>
</div>
And the javascript code is:
<script type="text/javascript">
$(document).ready(function () {
//----------------------------------------------------------------------
// Editor init
//----------------------------------------------------------------------
CKEDITOR.replace( 'editor1' );
//----------------------------------------------------------------------
// KW init
//----------------------------------------------------------------------
$(initKW);
//----------------------------------------------------------------------
// Editor key intercept
//----------------------------------------------------------------------
CKEDITOR.instances.editor1.on('contentDom', function() {
CKEDITOR.instances.editor1.document.on('keyup', function(event) { $(initKW); });
});
function removeDiacritics (str) {
var defaultDiacriticsRemovalMap = [
{'base':'A', 'letters':'\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F'}
];
var diacriticsMap = {};
for (var i=0; i < defaultDiacriticsRemovalMap .length; i++){
var letters = defaultDiacriticsRemovalMap [i].letters;
for (var j=0; j < letters.length ; j++){
diacriticsMap[letters[j]] = defaultDiacriticsRemovalMap [i].base;
}}
return str.replace(/[^\u0000-\u007E]/g, function(a){
return diacriticsMap[a] || a;
});}
function KeyDensityShow(srctext, MaxKeyOut, keylenMin) {
var Output;
// Split text on non word characters
var words = srctext.toLowerCase().split(/\W+/)
var positions = new Array()
var word_counts = new Array()
try {
for (var i=0; i<words.length; i++) {
var word = words[i]
if (!word || word.length < keylenMin) {
continue
}
if (!positions.hasOwnProperty(word)) {
positions[word] = word_counts.length;
word_counts.push([word, 1]);
} else {
word_counts[positions[word]][1]++;
}}
// Put most frequent words at the beginning.
word_counts.sort(function (a, b) {return b[1] - a[1]})
// Return the first MaxKeyOut items
return word_counts.slice(0, MaxKeyOut)
}
catch(err) {
return "";
}}
function removeStopWords(input) {
var stopwords = ['test',
];
var filtered = input.split( /\b/ ).filter( function( v ){
return stopwords.indexOf( v ) == -1;
});
stopwords.forEach(function(item) {
var reg = new RegExp('\\W'+item +'\\W','gmi');
input = input.replace(reg, " ");
});
return input.toString();
}
function initKW() {
$('#KWdensity').html('');
var TextGrab = CKEDITOR.instances['editor1'].getData();
TextGrab = $(TextGrab).text(); // html to text
TextGrab = removeDiacritics(TextGrab);
TextGrab = removeStopWords(TextGrab);
TextGrab = TextGrab.replace(/\r?\n|\r/gm," ").trim(); // remove line breaks
TextGrab = TextGrab.replace(/\s\s+/g, " ").trim(); // remove double spaces
TextGrab = TextGrab.replace(/[^a-zA-Z ]+/g, "").trim(); // only letters and and space
if (TextGrab != "")
{
var keyCSV = KeyDensityShow(TextGrab, 11, 3);
var KeysArr = keyCSV.toString().split(',');
var item, items = '';
for (var i = 0; i < KeysArr.length; i++) {
item = '';
item = item + '<b>' + KeysArr[i] + "</b></button> ";
i++;
item = '<button class="btn btn-default btn-xs" type="button"><span class="badge">' + KeysArr[i] + "</span> " + item;
items = items + item;
}
$('#KWdensity').html(items);
}
}
});
</script>
As you can see there is a function to remove diacritics "defaultDiacriticsRemovalMap" it had some values from A to Z i shortened it to see if Ö displays and nothing, and no diacritics neither. I also tried deleting many parts of this code and nothing works.
I am not good with Javascript but I have some knowledge anyway to understand the functions and I cannot really see where is the problem in this code.
Any idea how to delete the function of diacritics removal and to make characters like öäåñ etc work without the script changing it automatically to oaan ?
Here is the Fiddle demo: