-1

I want to insert a space between Chinese characters and numbers (or letters), but only in the text between <tag> and </tag>, not inside the tags' attributes.


ex1:

raw string:

<a href="http://example.com/漢字1234.html">it's a Chinese character漢字1234</a>

expected result:

<a href="http://example.com/漢字1234.html">it's a Chinese character 漢字 1234</a>


ex2:

a more complicated raw string:

<div id="foo">
    <div class="bar1">
        <span>abcd漢字1234</span>
    </div>
    <div class="bar2">
        123漢字abcd
        <p>letters漢字<a href="http://example.com/漢字1234.html">42漢字answer</a></p>
    </div>
</div>

expected result:

<div id="foo">
    <div class="bar1">
        <span>abcd 漢字 1234</span>
    </div>
    <div class="bar2">
        123 漢字 abcd
        <p>letters 漢字<a href="http://example.com/漢字1234.html">42 漢字 answer</a></p>
    </div>
</div>


Here is what I do in JavaScript:

/**
 * Insert one space at every boundary where a Chinese character directly
 * touches an ASCII letter or digit (in either order).
 *
 * Chinese characters are matched with the range [\u4E00-\u9FA5]; letters are
 * matched case-insensitively.
 *
 * @param {string} text - The string to process.
 * @returns {string} The string with the spaces inserted.
 */
function insert_space(text) {
    var chinese_then_latin = /([\u4E00-\u9FA5])([a-z0-9])/ig;
    var latin_then_chinese = /([a-z0-9])([\u4E00-\u9FA5])/ig;

    // First pass handles "漢a" / "漢1", second pass handles "a漢" / "1漢".
    return text
        .replace(chinese_then_latin, '$1 $2')
        .replace(latin_then_chinese, '$1 $2');
}

/**
 * Apply insert_space() to the text inside the target elements of <body>.
 *
 * Only text nodes that are direct children of a target element are edited.
 * Working on text nodes instead of each element's serialized HTML keeps
 * attribute values (for example an href containing Chinese characters and
 * digits) untouched, and avoids re-parsing nested elements once per ancestor.
 *
 * Relies on jQuery ($) and on insert_space() being available in scope.
 */
function replaceEntry() {
    var target_tags = ['div', 'p', 'li', 'td', 'span', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'];
    var body_tag = $('body');

    // A single combined selector visits every target element once;
    // .contents() yields their child nodes, text nodes included.
    $(target_tags.join(','), body_tag).contents().each(function() {
        // nodeType 3 is a text node; elements, comments, etc. are skipped.
        if (this.nodeType !== 3) {
            return;
        }

        var spaced_text = insert_space(this.nodeValue);

        // Write back only when something changed, to avoid needless DOM mutations.
        if (spaced_text !== this.nodeValue) {
            this.nodeValue = spaced_text;
        }
    });
}

replaceEntry();

But this code doesn't produce the expected results for the two examples above.

I need a favor. Thanks so much.


ps.

I put this code in a Google Chrome Extension, like:

// When the browser action is clicked, inject jQuery and then the spacing
// script into the tab that was passed to the listener.
chrome.browserAction.onClicked.addListener(function(tab) {
    var script_files = ['js/libs/jquery-1.7.1.min.js', 'js/auto_spacing.js'];

    script_files.forEach(function(script_file) {
        chrome.tabs.executeScript(tab.id, {file: script_file});
    });
});



UPDATE:

I found a solution: using XML XPath

// Select every text node that is not whitespace-only and whose parent element
// is neither <script> nor <style> (the parent name is lower-cased via translate()).
var xpath_query = '//text()[normalize-space(.)][translate(name(..),"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz")!="script"][translate(name(..),"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz")!="style"]';

var page_document = window.document;

// A snapshot result is indexed with snapshotItem() and sized by snapshotLength.
var text_nodes = page_document.evaluate(xpath_query, page_document, null, XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null);

for (var idx = 0, total = text_nodes.snapshotLength; idx < total; idx += 1) {
    var text_node = text_nodes.snapshotItem(idx);

    // Text node reference: http://www.w3school.com.cn/xmldom/dom_text.asp
    text_node.data = insert_space(text_node.data);
}
Vinta
  • 387
  • 4
  • 10
  • 9
    Every time you regex some html, Alan Turing kills a kitten. – Marc B Feb 23 '12 at 18:51
  • Don't parse html with regex - see this question and answer http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags – asawyer Feb 23 '12 at 18:52
  • HTML is not a regular language, so _regular_ expressions don't work so well. You're better off using a library like jQuery with `.html()` to do this kind of work for you. – robbrit Feb 23 '12 at 18:52
  • @Madmartigan Hm I mean accepted answer. I guess my sort is different. Woops. – asawyer Feb 23 '12 at 18:58
  • 1
    Stupid people cannot use regexes on HTML. Competent ones can. Sometimes it is the right answer. Don’t be lame. – tchrist Feb 23 '12 at 19:02
  • In ex2, using jQuery with `.html()`, how do I avoid replacing the value of "href"? – Vinta Feb 25 '12 at 17:56

1 Answers1

1
/**
 * Insert a space between Chinese characters and adjacent ASCII letters or
 * digits in the text of selected elements of an HTML fragment.
 *
 * The fragment is parsed into a detached <div>; only text nodes that are
 * direct children of the listed tags are edited, so attribute values are
 * never modified. A space is added only where a Chinese character directly
 * touches a letter or digit, so text that is already spaced, or a Chinese run
 * at the start or end of a text node, is left unchanged.
 *
 * Note: str is assigned to innerHTML and therefore parsed as markup.
 *
 * @param {string} str - HTML fragment to process.
 * @returns {string} The fragment re-serialized after the text edits.
 */
function doReplace( str ) {
    var rchineseThenLatin = /([\u4E00-\u9FA5])([a-z0-9])/ig;
    var rlatinThenChinese = /([a-z0-9])([\u4E00-\u9FA5])/ig;
    var tagNames = ['div', 'p', 'li', 'td', 'span', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'];

    var root = document.createElement("div");
    root.innerHTML = str;

    tagNames.forEach( function( tagName ) {
        [].forEach.call( root.getElementsByTagName( tagName ), function( elem ) {
            [].forEach.call( elem.childNodes, function( node ) {
                // nodeType 3 is a text node; child elements are reached through
                // their own tag name in the outer loop.
                if( node.nodeType === 3 ) {
                    node.nodeValue = node.nodeValue
                        .replace( rchineseThenLatin, "$1 $2" )
                        .replace( rlatinThenChinese, "$1 $2" );
                }
            });
        });
    });

    return root.innerHTML;
}

Result:

// Example call: the link text gets spaces around the Chinese run, while the
// href attribute value comes back unchanged (see the output below).
var test = '<a href="http://example.com/漢字1234.html">it\'s a Chinese character漢字1234</a>';
doReplace(test);
//<a href="http://example.com/漢字1234.html">it's a Chinese character 漢字 1234</a>
Esailija
  • 138,174
  • 23
  • 272
  • 326