0

I found a very nice piece of code for CKEditor that counts and ranks the most-used words in a textarea; this is very useful for SEO keyword suggestions when writing articles. The problem is that when you type any non-English characters such as öäåÖÄÅ, they are dropped by some kind of filter. The same happens with diacritics, as in léjonet (é) or ñ.

Here is the code and a working Jsfiddle demo

The HTML is:

<!-- Textarea: the script below turns #editor1 into a CKEditor instance via CKEDITOR.replace('editor1') -->
<div class="form-group">
<label class="col-md-2 control-label" for="editor1">HTML </label>
<div class="col-md-10">                     
<textarea class="form-control" id="editor1" name="editor1"><p>text example with ahöäåra</p></textarea>
</div>
</div>
<!-- KW density result: initKW() in the script below writes the ranked keyword buttons into #KWdensity -->
<div class="form-group">
<label class="col-md-2 control-label" for="editor1">Words Repeat</label>
<div class="col-md-10">                     
<div id="KWdensity" ></div>
</div>
</div> 

And the javascript code is:

<script type="text/javascript">
$(document).ready(function () {
//----------------------------------------------------------------------
// Editor init
// Replace the element named 'editor1' (the textarea in the HTML) with a
// CKEditor instance.
//----------------------------------------------------------------------
CKEDITOR.replace( 'editor1' );

//----------------------------------------------------------------------
// KW init
// Hand initKW to jQuery's $(callback) form so the keyword list is rendered
// for the initial content. initKW is a function declaration further down
// in this same callback, so it is hoisted and already defined here.
//----------------------------------------------------------------------
$(initKW);

//----------------------------------------------------------------------
// Editor key intercept
// Once the editor fires 'contentDom', listen for 'keyup' on the editor's
// document and recompute the keyword list after every key release.
// NOTE(review): the handler goes through $(initKW) rather than calling
// initKW() directly -- presumably equivalent once the page is ready;
// confirm for the jQuery version in use.
//----------------------------------------------------------------------
CKEDITOR.instances.editor1.on('contentDom', function() {
CKEDITOR.instances.editor1.document.on('keyup', function(event) { $(initKW); });
});    
/**
 * Replace decorated variants of a letter with its plain base letter.
 *
 * Only characters outside U+0000-U+007E are looked at; a character that has
 * no entry in the table is returned unchanged.
 *
 * @param {string} str Text to process.
 * @returns {string} The text with every mapped character replaced by its base.
 */
function removeDiacritics (str) {
    // Each entry lists in `letters` every character that collapses to `base`.
    // Only the entry for 'A' is present in this table.
    const defaultDiacriticsRemovalMap = [
        {'base':'A', 'letters':'\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F'}
    ];

    // Flatten the table into a single "character -> base letter" lookup.
    const diacriticsMap = {};
    defaultDiacriticsRemovalMap.forEach(function (entry) {
        entry.letters.split('').forEach(function (variant) {
            diacriticsMap[variant] = entry.base;
        });
    });

    // Swap every non-ASCII character that has a mapping; keep the rest as is.
    return str.replace(/[^\u0000-\u007E]/g, function (ch) {
        const base = diacriticsMap[ch];
        return base ? base : ch;
    });
}
/**
 * Count how often each word occurs in a text and return the most frequent ones.
 *
 * The text is lower-cased and split on runs of `\W` characters. Words shorter
 * than `keylenMin` are ignored. Words with the same count keep the order in
 * which they first appeared.
 *
 * @param {string} srctext   Plain text to analyse.
 * @param {number} MaxKeyOut Maximum number of entries to return.
 * @param {number} keylenMin Minimum word length (in UTF-16 units) to be counted.
 * @returns {Array<Array>} Up to `MaxKeyOut` `[word, count]` pairs, most frequent first.
 */
function KeyDensityShow(srctext, MaxKeyOut, keylenMin) {
    // Split text on non word characters
    const words = srctext.toLowerCase().split(/\W+/);

    // A Map is used for the word -> index lookup. The previous Array-as-dictionary
    // broke on words that collide with array properties: the word "length" hit
    // the array's own `length`, threw inside the try block and made the whole
    // function return "".
    const indexByWord = new Map();
    const wordCounts = [];

    for (const word of words) {
        if (!word || word.length < keylenMin) {
            continue;
        }
        const index = indexByWord.get(word);
        if (index === undefined) {
            indexByWord.set(word, wordCounts.length);
            wordCounts.push([word, 1]);
        } else {
            wordCounts[index][1] += 1;
        }
    }

    // Put most frequent words at the beginning.
    wordCounts.sort(function (a, b) {
        return b[1] - a[1];
    });

    // Return the first MaxKeyOut items
    return wordCounts.slice(0, MaxKeyOut);
}
/**
 * Remove stop words from a text.
 *
 * A word is a maximal run of Unicode letters, combining marks, numbers or
 * underscores. A word is removed when it equals a stop word, ignoring case.
 * The characters around a removed word are left untouched, so neighbouring
 * words never get glued together; the caller collapses repeated whitespace.
 *
 * Compared with the previous `\Wword\W` pattern this also removes stop words
 * at the very start or end of the text and several stop words in a row, and
 * it no longer cuts a stop word out of the middle of a word that contains
 * non-ASCII letters (e.g. "ötestö").
 *
 * @param {string} input Text to filter.
 * @returns {string} The text without stop words.
 */
function removeStopWords(input) {
    const stopwords = ['test'];
    const stopSet = new Set(stopwords.map(function (word) {
        return word.toLowerCase();
    }));

    return String(input).replace(/[\p{L}\p{M}\p{N}_]+/gu, function (word) {
        return stopSet.has(word.toLowerCase()) ? '' : word;
    });
}
/**
 * Recompute the keyword ranking for the current editor content and render it
 * into #KWdensity as one button per word (count badge followed by the word).
 *
 * Reads the data of the CKEditor instance 'editor1', converts it to plain
 * text, runs it through removeDiacritics and removeStopWords, normalises
 * whitespace and keeps only ASCII letters and spaces before counting.
 */
function initKW() {
    const $output = $('#KWdensity');
    $output.html('');

    let TextGrab = CKEDITOR.instances['editor1'].getData();
    // html to text: parse inside a detached <div> so the editor data is always
    // treated as markup, never as a selector.
    TextGrab = $('<div>').html(TextGrab).text();
    TextGrab = removeDiacritics(TextGrab);
    TextGrab = removeStopWords(TextGrab);
    TextGrab = TextGrab.replace(/\r?\n|\r/gm," ").trim(); // remove line breaks
    TextGrab = TextGrab.replace(/\s\s+/g, " ").trim(); // remove double spaces
    TextGrab = TextGrab.replace(/[^a-zA-Z ]+/g, "").trim(); // only letters and space
    if (TextGrab === "") {
        return;
    }

    // KeyDensityShow returns [word, count] pairs. Iterate them directly: the
    // previous toString()/split(',') round trip rendered a button with the
    // badge "undefined" whenever no word reached the minimum length.
    const ranked = KeyDensityShow(TextGrab, 11, 3);
    const pairs = Array.isArray(ranked) ? ranked : [];
    const items = pairs.map(function (pair) {
        const word = pair[0];
        const count = pair[1];
        return '<button class="btn btn-default btn-xs" type="button"><span class="badge">' + count + '</span>&nbsp;<b>' + word + '</b></button>&nbsp;';
    }).join('');
    $output.html(items);
}
});
</script>

As you can see, there is a function that removes diacritics (`removeDiacritics`, with its `defaultDiacriticsRemovalMap` table). The table originally had entries from A to Z; I shortened it to see whether Ö would then be displayed, but it is not, and neither are other characters with diacritics. I also tried deleting various parts of this code, but nothing works.

I am not good at JavaScript, but I know enough to follow the functions, and I still cannot see where the problem in this code is.

Any idea how to remove the diacritics-removal function and make characters like öäåñ work, without the script automatically changing them to oaan?

Here is the Fiddle demo:

https://jsfiddle.net/rg0myntj/1/

DEMO

  • The problem is in two lines. 1. `TextGrab.replace(/[^a-zA-Z ]+/g, "").trim()` removes everything except the letters a–z/A–Z and spaces. 2. `srctext.toLowerCase().split(/\W+/)` splits on every character that is not a–z, A–Z, 0–9 or underscore (`\W`). – vee Oct 24 '22 at 16:00
  • I tried deleting the trim and the lowercase calls, but öäå still do not get through – newspapper.eu Oct 24 '22 at 16:02
  • From what I tried: I just commented out `TextGrab.replace(/[^a-zA-Z ]+/g, "").trim()` and replaced `srctext.toLowerCase().split(/\W+/)` with `srctext.toLowerCase().split(/[^\p{L}\p{M}\p{N}]+/u)`, and it seems to work fine. However, I could not verify that this covers every character you need. – vee Oct 24 '22 at 16:09
  • Read more about regular expression pattern for unicode characters on [this question](https://stackoverflow.com/q/5555613/128761). – vee Oct 24 '22 at 16:10
  • It works, but is there any way to delete the function removeDiacritics? I tried, but it gives me errors (I am not good at JavaScript). Please post this as an answer so I can mark it as the correct one and help people with the same problem in the future. – newspapper.eu Oct 24 '22 at 16:24

1 Answers1

0

The line `TextGrab.replace(/[^a-zA-Z ]+/g, "").trim()` removes everything except the letters a–z (lower and upper case) and spaces, so every non-ASCII character such as ö, ä, å or é is removed. Delete or comment out this line of code.

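The line `srctext.toLowerCase().split(/\W+/)` splits the text on every character that is not a–z, A–Z, 0–9 or underscore, so the word ahöäåra is split into ah and ra.
Replace it with `srctext.toLowerCase().split(/[^\p{L}\p{M}\p{N}]+/u)`, which splits only on characters that are not Unicode letters (`\p{L}`), combining marks (`\p{M}`) or numbers (`\p{N}`).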

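To get rid of removeDiacritics(), delete the function itself and also the line in initKW() that calls it (`TextGrab = removeDiacritics(TextGrab);`); if only the function is deleted, that call throws a ReferenceError.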

$(document).ready(function() {
    //----------------------------------------------------------------------
    // Editor init
    // Replace the element named 'editor1' (the textarea in the HTML) with a
    // CKEditor instance.
    //----------------------------------------------------------------------
    CKEDITOR.replace('editor1');

    //----------------------------------------------------------------------
    // KW init
    // Hand initKW to jQuery's $(callback) form so the keyword list is
    // rendered for the initial content. initKW is a function declaration
    // further down in this same callback, so it is hoisted and already
    // defined here.
    //----------------------------------------------------------------------
    $(initKW);

    //----------------------------------------------------------------------
    // Editor key intercept
    // Once the editor fires 'contentDom', listen for 'keyup' on the editor's
    // document and recompute the keyword list after every key release.
    // NOTE(review): the handler goes through $(initKW) rather than calling
    // initKW() directly -- presumably equivalent once the page is ready;
    // confirm for the jQuery version in use.
    //----------------------------------------------------------------------
    CKEDITOR.instances.editor1.on('contentDom', function() {
        CKEDITOR.instances.editor1.document.on('keyup', function(event) {
            $(initKW);
        });
    });

    /*function removeDiacritics(str) {

        var defaultDiacriticsRemovalMap = [{
            'base': 'A',
            'letters': '\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F'
        }];
        var diacriticsMap = {};
        for (var i = 0; i < defaultDiacriticsRemovalMap.length; i++) {
            var letters = defaultDiacriticsRemovalMap[i].letters;
            for (var j = 0; j < letters.length; j++) {
                diacriticsMap[letters[j]] = defaultDiacriticsRemovalMap[i].base;
            }
        }
        return str.replace(/[^\u0000-\u007E]/g, function(a) {
            return diacriticsMap[a] || a;
        });
    }*/ // This whole function is commented out; you can delete it from your code.

    /**
     * Count how often each word occurs in a text and return the most frequent ones.
     *
     * The text is lower-cased and split on every run of characters that are
     * not Unicode letters (\p{L}), combining marks (\p{M}) or numbers (\p{N}),
     * so words such as "ahöäåra" stay in one piece. Words shorter than
     * `keylenMin` are ignored. Words with the same count keep the order in
     * which they first appeared.
     *
     * @param {string} srctext   Plain text to analyse.
     * @param {number} MaxKeyOut Maximum number of entries to return.
     * @param {number} keylenMin Minimum word length (in UTF-16 units) to be counted.
     * @returns {Array<Array>} Up to `MaxKeyOut` `[word, count]` pairs, most frequent first.
     */
    function KeyDensityShow(srctext, MaxKeyOut, keylenMin) {
        // Split text on non word characters
        const words = srctext.toLowerCase().split(/[^\p{L}\p{M}\p{N}]+/u);

        // A Map is used for the word -> index lookup. The previous
        // Array-as-dictionary broke on words that collide with array
        // properties: the word "length" hit the array's own `length`, threw
        // inside the try block and made the whole function return "".
        const indexByWord = new Map();
        const wordCounts = [];

        for (const word of words) {
            if (!word || word.length < keylenMin) {
                continue;
            }
            const index = indexByWord.get(word);
            if (index === undefined) {
                indexByWord.set(word, wordCounts.length);
                wordCounts.push([word, 1]);
            } else {
                wordCounts[index][1] += 1;
            }
        }

        // Put most frequent words at the beginning.
        wordCounts.sort(function(a, b) {
            return b[1] - a[1];
        });

        // Return the first MaxKeyOut items
        return wordCounts.slice(0, MaxKeyOut);
    }

    /**
     * Remove stop words from a text.
     *
     * A word is a maximal run of Unicode letters, combining marks, numbers or
     * underscores. A word is removed when it equals a stop word, ignoring
     * case. The characters around a removed word are left untouched, so
     * neighbouring words never get glued together; the caller collapses
     * repeated whitespace.
     *
     * Compared with the previous `\Wword\W` pattern this also removes stop
     * words at the very start or end of the text and several stop words in a
     * row, and it no longer cuts a stop word out of the middle of a word that
     * contains non-ASCII letters (e.g. "ötestö").
     *
     * @param {string} input Text to filter.
     * @returns {string} The text without stop words.
     */
    function removeStopWords(input) {
        const stopwords = ['test'];
        const stopSet = new Set(stopwords.map(function(word) {
            return word.toLowerCase();
        }));

        return String(input).replace(/[\p{L}\p{M}\p{N}_]+/gu, function(word) {
            return stopSet.has(word.toLowerCase()) ? '' : word;
        });
    }

    /**
     * Recompute the keyword ranking for the current editor content and render
     * it into #KWdensity as one button per word (count badge followed by the
     * word).
     *
     * Reads the data of the CKEditor instance 'editor1', converts it to plain
     * text, removes stop words and normalises whitespace before counting.
     * Non-ASCII letters are kept.
     */
    function initKW() {
        const $output = $('#KWdensity');
        $output.html('');

        let TextGrab = CKEDITOR.instances['editor1'].getData();
        // html to text: parse inside a detached <div> so the editor data is
        // always treated as markup, never as a selector.
        TextGrab = $('<div>').html(TextGrab).text();
        //TextGrab = removeDiacritics(TextGrab); // diacritics must be kept; you can delete this line of code.
        TextGrab = removeStopWords(TextGrab);
        TextGrab = TextGrab.replace(/\r?\n|\r/gm, " ").trim(); // remove line breaks
        TextGrab = TextGrab.replace(/\s\s+/g, " ").trim(); // remove double spaces
        //TextGrab = TextGrab.replace(/[^a-zA-Z ]+/g, "").trim(); // only letters and space; it strips every non-ASCII letter, so you can delete this line of code.
        if (TextGrab === "") {
            return;
        }

        // KeyDensityShow returns [word, count] pairs. Iterate them directly:
        // the previous toString()/split(',') round trip rendered a button
        // with the badge "undefined" whenever no word reached the minimum
        // length.
        const ranked = KeyDensityShow(TextGrab, 11, 3);
        const pairs = Array.isArray(ranked) ? ranked : [];
        const items = pairs.map(function(pair) {
            const word = pair[0];
            const count = pair[1];
            return '<button class="btn btn-default btn-xs" type="button"><span class="badge">' + count + '</span>&nbsp;<b>' + word + '</b></button>&nbsp;';
        }).join('');
        $output.html(items);
    }
});
<!-- External dependencies loaded from CDNs: Bootstrap 3.3.7 (CSS and JS), jQuery 2.2.4 and the CKEditor 4.6.1 standard build -->
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" rel="stylesheet"/>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.2.4/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script>
<script src="https://cdn.ckeditor.com/4.6.1/standard/ckeditor.js"></script>

<!-- Textarea: the script turns #editor1 into a CKEditor instance via CKEDITOR.replace('editor1') -->
<div class="form-group">
    <label class="col-md-2 control-label" for="editor1">HTML </label>
    <div class="col-md-10">
        <textarea class="form-control" id="editor1" name="editor1"><p>text example with ahöäåra</p></textarea>
    </div>
</div>
<!-- KW density result: initKW() writes the ranked keyword buttons into #KWdensity -->
<div class="form-group">
    <label class="col-md-2 control-label" for="editor1">Words Repeat</label>
    <div class="col-md-10">
        <div id="KWdensity"></div>
    </div>
</div>

The code above cannot run properly on Stack Overflow due to iframe permission. See it in action on jsFiddle instead.

vee
  • 4,506
  • 5
  • 44
  • 81