1

I am attempting to build a parser for BBCode in JavaScript that will allow me to transpile a string with BBCode in it to a string with HTML. I have in my head how it is all supposed to work and I even have two of the parser steps built.

Right now the entire process of the parser can be described as

  1. Get input
  2. Break input into tokens (tokenize)
  3. Add information about the tokens (lex)
  4. Build the AST from the tokens (parse)
  5. Clean up the AST based on grammar rules (clean)
  6. Evaluate AST and transform to HTML (evaluate)
  7. Return the HTML string

I have a general idea of how to do all of this in my head except for step four.

When I reached step four I ran into a problem when building the AST. The problem was how would I go about recursively building this tree. I have in the past recursively built two dimensional arrays but a variable depth tree is way out of my scope of abilities.

In my head I think that the tree should look something like this:

// Hello, [b]World![/b]
{
    "text": "Hello, ",
    "tag": {
        "type": "b",
        "text": "World!"
    }
}

But when trying to generate this I have an issue with recursively building this down.

A more complex example would be as follows:

// [c=red]Hello Tom, [/c][b][c=green]how are you?[/c][/b]
{
    "tag": {
        type: "c",
        "parameters": "red",
        "text": "Hello Tom, "
        "tag": {
            "type": "b",
            "tag": {
                "type": "c",
                "parameters": "green",
                "text": "how are you?"
            }
        }
    }
}

The main issue I run across is keeping my place while building down without accidentally overwriting the entire tree.

Currently the code I am using is:

var bbcode = {};

bbcode._tokens = {
    'TO_DEL': '[',
    'TC_DEL': ']',
    'TE_DEL': '/',
    'EQ_DEL': '='
};

bbcode._tags = ['c', 'b'];

bbcode.parse = function(bbcode) {
    var tokens = this._tokenize(bbcode);
        tokens = this._lex(tokens);

    var ast = this._parse(tokens);

    console.log(JSON.stringify(ast, null, 4));
    //return tokens;
};

bbcode._isToken = function(token) {
    for (var k in this._tokens) {
        if (this._tokens[k] === token) {
            return true;
        }
    }

    return false;
};

bbcode._isTag = function(token) {
    return (this._tags.indexOf(token) > -1) ? true : false;
};

bbcode._getType = function(token) {
    for (var k in this._tokens) {
        if (this._tokens[k] === token) {
            return k;
        }
    }   
};

bbcode._next = function(tokens, curr) {
    return tokens[curr + 1][0];
};

bbcode._previous = function(tokens, curr) {
    return tokens[curr - 1][0];
};

bbcode._tokenize = function(bbcode) {
    var tree = [];
    var temp = '';

    for (var i = 0; i < bbcode.length; i++) {
        if (this._isToken(bbcode[i])) {
            if (temp.length > 0) {
                tree.push(temp);
                temp = '';
            }

            tree.push(bbcode[i]);
        } else {
            temp += bbcode[i];
        }
    }

    return tree;
};

bbcode._lex = function(tokens) {
    var tree = [];

    for (var i = 0; i < tokens.length; i++) {
        if (this._isToken(tokens[i])) {
            tree.push([this._getType(tokens[i]), tokens[i]]);
        } else if (this._isTag(tokens[i])) {
            tree.push(['BB_TAG', tokens[i]]);
        } else {
            tree.push(['BB_STRING', tokens[i]]);
        }
    }

    return tree;
};

/*****************************************************************************/
/* I need help with the block below                                          */
/*****************************************************************************/

bbcode._parse = function(tokens) {
    var tree = {};

    for (var i = 0; i < tokens.length; i++) {
        if (tokens[i][0] === 'BB_STRING') {
            if (tree['text']) {
                tree['text'] += tokens[i][1];
            } else {
                tree['text'] = tokens[i][1];
            }
        } else if (tokens[i][0] === 'TO_DEL') {
            if (this._next(tokens, i) === 'BB_TAG') {
                tree['tag'] = {};
            } else {
                if (tree['text']) {
                    tree['text'] += tokens[i][1];
                } else {
                    tree['text'] = tokens[i][1];
                }
            }
        }
    }

    return tree;
};

/*****************************************************************************/
Bergi
  • 630,263
  • 148
  • 957
  • 1,375
  • First of all, fix your tree representation to use arrays instead of nesting for multiple consecutive elements (be it tags or texts) – Bergi Oct 26 '15 at 01:59
  • What part (which step) of the algorithm is supposed to take care of misnested tags (like `[b] a [i] b [/b] c [/i]`), and how should it do that? – Bergi Oct 26 '15 at 02:01
  • See http://stackoverflow.com/a/2336769/120163 for an approach to build recursive descent parsers that build trees. – Ira Baxter Oct 26 '15 at 06:01
  • I would like to thank all of you. Bergi for the array advice and the thoughts of handling how it is nested and Ira Baxter for providing resources which I could not find before. – Benjamin Williams Oct 26 '15 at 08:42

0 Answers0