How do I re-write/modify my current Python interpreter's grammar structure, not too good with Python?

Question

This is my current Python interpreter, which uses parsing rules to take input and then print out the result of each expression. The interpreter works fine, but I want to change some of my current grammar rules to new grammar rules. So far I have only managed to make some of the grammar changes that I want.

These are the changes I want to make to my current grammar:

# <stmt-list> ::= empty | <stmt> <stmt-list>
to
# <stmt_list> ::= <stmt> | <stmt> <stmt_list>


# <factor> ::= id | intnum | ( <expr> )
to
# <base> ::= (<expr>) | id | number


<stmt> ::= id = <expr> ; | print <expr>;
to
<stmt> ::= id = <expr> ; | iprint <expr> ; | rprint <expr> ;

Also, I'm not sure how to implement the new grammar rules below in my interpreter; I think I might already have some of them?

<prog> ::= <decl_list> <stmt_list>
<decl_list> ::= <decl> | <decl> <decl_list>
<decl> ::= <type> <id_list> ;
<type> ::= int | real
<id_list> ::= id | id {, <id_list>}

This is my current code for my current grammar:

import sys

# Interpreter state shared by the lexer and parser functions below.
# A ``global`` statement at module level has no effect, so plain assignments
# are used, and every name the functions declare ``global`` gets a default.
varTable = {}       # variable name -> (value, type tag) tuple
itProgram = None    # character iterator over the program text
nextChar = ""       # current look-ahead character
nextToken = ""      # token type of the most recently scanned lexeme
nextLex = ""        # text of the most recently scanned lexeme
flagEof = False     # set to True by funcLex() when it produces EMPTY
strStmt = ""        # lexemes of the statement currently being echoed

def main():
    """Read the program file named on the command line and interpret it.

    Resets the shared lexer state, loads the whole file, primes the lexer
    with the first character and the first token, then hands control to
    stmtList().  Prints a message and returns early when the file argument
    is missing or the file cannot be opened.
    """
    global itProgram, nextToken, nextChar, nextLex, flagEof, strStmt
    nextToken = ""
    nextChar = ""
    flagEof = False
    strStmt = ""

    try:
        programPath = sys.argv[1]
    except IndexError:
        print("Missing input file!")
        return

    try:
        fileProgram = open(programPath, "rt")
    except IOError:
        print("Could not open \'" + programPath + "\'!")
        return

    # Close the file as soon as its contents are in memory.
    with fileProgram:
        strProgram = fileProgram.read()

    itProgram = iter(strProgram)
    # An empty file leaves nextChar as "", which funcLex() reports as EMPTY.
    nextChar = next(itProgram, "")

    funcLex()

    stmtList()

def funcLex():
    """Scan the next token from the program text into the shared lexer state.

    Starts at the look-ahead character ``nextChar``, pulls further characters
    from ``itProgram`` and stores the token type in ``nextToken`` and its
    text in ``nextLex``.  Token types produced: LPARA, RPARA, ADD, SUB, MULT,
    DIV, ASSIGN, SEMI, PRINT, ID, INT, FLOAT, EMPTY (end of input; also sets
    ``flagEof``) and UNKNOWN.

    Every lexeme is appended to ``strStmt``; that buffer is printed and
    cleared each time a ';' is scanned.
    """
    global itProgram, nextToken, nextLex, nextChar, flagEof, strStmt

    singleCharTokens = {
        "(": "LPARA",
        ")": "RPARA",
        "+": "ADD",
        "-": "SUB",
        "*": "MULT",
        "/": "DIV",
        "=": "ASSIGN",
        ";": "SEMI",
    }

    # next(itProgram, "") yields "" once the input is exhausted.  Using that
    # sentinel instead of catching StopIteration means a lexeme that runs up
    # to the very end of the input still gets its token type assigned.
    while nextChar.isspace():
        nextChar = next(itProgram, "")

    nextLex = nextChar

    if nextChar == "":
        nextToken = "EMPTY"
        flagEof = True
    elif nextChar in singleCharTokens:
        nextToken = singleCharTokens[nextChar]
        nextChar = next(itProgram, "")
    elif nextChar.isalpha():
        nextChar = next(itProgram, "")
        while nextChar.isalnum():
            nextLex += nextChar
            nextChar = next(itProgram, "")
        nextToken = "PRINT" if nextLex == "print" else "ID"
    elif nextChar.isdigit():
        # digits [ "." digits ] -- letters or a second "." end the number,
        # so int()/float() in the parser never see text such as "12abc".
        nextToken = "INT"
        nextChar = next(itProgram, "")
        while nextChar.isdigit():
            nextLex += nextChar
            nextChar = next(itProgram, "")
        if nextChar == ".":
            nextToken = "FLOAT"
            nextLex += nextChar
            nextChar = next(itProgram, "")
            while nextChar.isdigit():
                nextLex += nextChar
                nextChar = next(itProgram, "")
    else:
        # Keep the offending character as the lexeme (so it shows up in the
        # echoed statement) and consume it so the scanner always advances.
        nextToken = "UNKNOWN"
        nextChar = next(itProgram, "")

    strStmt = strStmt + nextLex + " "
    if nextToken == "SEMI":
        print(strStmt)
        strStmt = ""

# <stmt-list> ::= empty | <stmt> <stmt-list>
def stmtList():
    """Execute statements until the lexer reports EMPTY (end of input).

    When the current token is already EMPTY on entry, a notice is printed
    instead and no statement is run.
    """
    global nextToken

    if nextToken == "EMPTY":
        # Parenthesised single-argument print runs on Python 2 and 3 alike.
        print(">>> Empty .tiny file.")
        return

    while nextToken != "EMPTY":
        stmt()

# <stmt> ::= id = <expr> ; |
#            print <expr> ;
def stmt():
    """Parse and execute one statement, then consume its terminating ';'.

    An assignment stores the evaluated expression through lookupVarTable();
    a print statement writes the value prefixed with ">>> ".  Any syntax
    problem, or an expression whose type tag is "UNKNOWN", is reported
    through printError().
    """
    global nextToken, nextLex

    if nextToken == "ID":
        varName = nextLex
        funcLex()
        if nextToken != "ASSIGN":
            # Without this check "x ;" was silently accepted as a statement.
            printError("<stmt> missing '='")
            return
        funcLex()
        result = expr()
        if result[1] == "UNKNOWN":
            printError("undefined variable.")
        else:
            lookupVarTable(varName, result[0], result[1])
    elif nextToken == "PRINT":
        funcLex()
        result = expr()
        if result[1] == "UNKNOWN":
            printError("undefined variable.")
        elif nextToken == "SEMI":
            print(">>> " + str(result[0]))
    else:
        printError("<stmt> syntax error.")
        return

    if nextToken == "SEMI":
        funcLex()
    else:
        printError("<stmt> missing ';'")

# <expr> ::= <term> { + <term> | - <term> }
def expr():
    """Evaluate a chain of terms joined by '+' or '-'.

    Returns the running (value, type tag) pair.  Operands are combined left
    to right; an "UNKNOWN" type tag on either side or differing type tags
    are reported through printError().
    """
    global nextToken, nextLex

    total = term()

    while nextToken in ("ADD", "SUB"):
        isAddition = nextToken == "ADD"
        funcLex()
        operand = term()

        # Variable is not defined
        if "UNKNOWN" in (total[1], operand[1]):
            printError("Undefined variable!")

        if total[1] != operand[1]:
            # type mismatch: leave the running total untouched
            printError("Type mismatch!")
            continue

        if isAddition:
            total = (total[0] + operand[0], total[1])
        else:
            total = (total[0] - operand[0], total[1])

    return total

# <term> ::= <factor> { * <factor> | / <factor> }
def term():
    """Evaluate a chain of factors joined by '*' or '/'.

    Returns the running (value, type tag) pair.  An "UNKNOWN" type tag,
    differing type tags, or a zero divisor are reported through
    printError().
    """
    global nextToken, nextLex

    lResult = factor()

    while nextToken == "MULT" or nextToken == "DIV":
        operator = nextToken
        funcLex()
        rResult = factor()
        #Variable is not defined
        if lResult[1] == "UNKNOWN" or rResult[1] == "UNKNOWN":
            printError("Undefined variable!")
        if lResult[1] != rResult[1]:    #type mismatch
            printError("Type mismatch!")
        elif operator == "MULT":
            lResult = (lResult[0]*rResult[0], lResult[1])
        elif rResult[0] == 0:
            # Report the error instead of dying with a ZeroDivisionError
            # traceback.
            printError("Division by zero!")
        elif lResult[1] == "INT":
            # Floor division matches Python 2's int "/" and keeps an INT
            # result an int on Python 3 as well.
            lResult = (lResult[0]//rResult[0], lResult[1])
        else:
            lResult = (lResult[0]/rResult[0], lResult[1])

    return lResult

# <factor> ::= id | intnum | ( <expr> )
def factor():
    """Evaluate one operand: a variable, a number or a parenthesised expr.

    Returns a (value, type tag) pair.  A variable that is not in the table
    comes back from lookupVarTable() as (0, "UNKNOWN").  Malformed numbers,
    a missing ')' and unexpected tokens are reported through printError().
    """
    global nextToken, nextLex

    # Bound up front so every path has a value to return; previously an
    # unexpected token reached "return result" with the name unbound.
    result = (0, "UNKNOWN")

    if nextToken == "ID":
        result = lookupVarTable(nextLex, 0, "UNKNOWN")
        funcLex()
    elif nextToken == "INT" or nextToken == "FLOAT":
        convert = int if nextToken == "INT" else float
        try:
            result = (convert(nextLex), nextToken)
        except ValueError:
            printError("<factor> invalid number '" + nextLex + "'")
        funcLex()
    elif nextToken == "LPARA":
        funcLex()
        result = expr()
        if nextToken == "RPARA":
            funcLex()
        else:
            printError("<factor>")
    else:
        printError("<factor> syntax error.")

    return result

def printError(strMessage):
    """Report an error and terminate the interpreter.

    Prints the partially echoed statement held in ``strStmt`` (when it is
    not empty), then ">>> Error: " followed by *strMessage*.

    Raises:
        SystemExit: always, with exit status 1 so the failure is visible to
            the calling shell.
    """
    global strStmt

    if strStmt != "":
        print(strStmt)

    print(">>> Error: " + strMessage)
    # sys.exit() rather than the site-provided exit(), which is meant for
    # the interactive shell and is absent when Python runs with -S.
    sys.exit(1)

def lookupVarTable(varName, varValue, varType):
    """Store or fetch a variable in ``varTable``.

    With any *varType* other than "UNKNOWN" the pair (varValue, varType) is
    saved under *varName* and returned.  With "UNKNOWN" the call is a
    lookup: the stored pair is returned when *varName* exists, otherwise
    the (varValue, varType) arguments are handed back unchanged.
    """
    if varType == "UNKNOWN":
        return varTable.get(varName, (varValue, varType))

    varTable[varName] = (varValue, varType)
    return varTable[varName]

# Run the interpreter only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

The `global` keyword is only useful inside a function body. See also: the [FAQ - What are the rules for local and global variables in Python](https://docs.python.org/3.6/faq/programming.html#what-are-the-rules-for-local-and-global-variables-in-python), and http://stackoverflow.com/a/423596/1513933 — Laurent LAPORTE, Nov 05 '16 at 19:58
These variables should be defined at module level: `itProgram = None`, `nextToken = ""`, `nextChar = ""`, `nextLex = ""`, `flagEof = False`, `strStmt = ""` — Laurent LAPORTE, Nov 05 '16 at 20:00
According to the [PEP8 Style Guide](https://www.python.org/dev/peps/pep-0008/#prescriptive-naming-conventions), you ought to use snake-case for variables and functions. — Laurent LAPORTE, Nov 05 '16 at 20:02
if you want to remove this question from the public eye, you have to delete it. When you post code here is for the rest of the world to see asking for help on it, so you have to post something that you don't mind the rest of the world seeing, and if you care about it, then you need to make a similar code that reproduce your problem and use it to ask for help here — Copperfield, Nov 05 '16 at 21:54
I mean your question and/or whatever you post here in this site, if you want to remove the answer of LaurentLAPORTE you have to wait for him to remove it or flag it so a moderator do it — Copperfield, Nov 05 '16 at 22:17
@PaulGriffiths hmm this is my question, and I want a new answer. So stop editing my question? — Harris Schmidt, Nov 06 '16 at 03:57
@HarrisSchmidt: Stack Overflow is not your personal playground. You asked a question and already received an answer, which will look like nonsense if you vandalize your question and change it beyond recognition. So stop doing that. If you have a different question, ask a different question - don't change this one. — Crowman, Nov 06 '16 at 04:01
Stop editing your question so that it invalidates the existing answer. Ask a new question. I'm locking this to prevent further abuse. — ChrisF, Nov 06 '16 at 12:48

score 1 · Answer 1 · edited Apr 13 '17 at 12:40

You should consider using Antlr, there is a Python port.

In the meantime, here is how you can design your lexer:

def parser_file(file_obj):
    """Yield the contents of *file_obj* one element at a time.

    *file_obj* is iterated line by line, and each line is iterated in turn,
    so a text stream produces its characters in order.
    """
    symbols = (symbol for text_line in file_obj for symbol in text_line)
    for symbol in symbols:
        yield symbol


# Single-character lexemes and the token type each one produces.
# '*' maps to 'MULT' (not 'MUL'): that is the token name the question's
# term() parser tests for.
mapping = {
    '(': 'LPARA',
    ')': 'RPARA',
    '+': 'ADD',
    '-': 'SUB',
    '*': 'MULT',
    '/': 'DIV',
    '=': 'ASSIGN',
    ';': 'SEMI',
}


def lexer(chars):
    """Generate ``(token_type, lexeme)`` pairs from an iterable of characters.

    Whitespace is skipped.  Characters found in ``mapping`` become
    single-character tokens; a letter starts an identifier ("PRINT" for the
    word ``print``, otherwise "ID"); a digit starts a number ("INT", or
    "FLOAT" when it contains a decimal point).

    Raises:
        SyntaxError: when a character cannot start any token.
    """
    it_char = iter(chars)

    # next(it_char, "") returns "" at end of input.  Letting next() raise
    # StopIteration inside a generator becomes a RuntimeError under PEP 479,
    # and it also dropped a token that ran up to the end of the input.
    char = next(it_char, "")

    while True:
        # skip spaces
        while char.isspace():
            char = next(it_char, "")

        # end of input: finish the generator normally
        if not char:
            return

        # find simple tokens
        if char in mapping:
            yield mapping[char], char
            char = next(it_char, "")
            continue

        # find complex tokens
        if char.isalpha():
            lex = char
            char = next(it_char, "")
            while char.isalnum():
                lex += char
                char = next(it_char, "")
            if lex == "print":
                yield "PRINT", lex
            else:
                yield "ID", lex
        elif char.isdigit():
            lex = char
            char = next(it_char, "")
            while char.isdigit():
                lex += char
                char = next(it_char, "")
            if char == ".":
                lex += char
                char = next(it_char, "")
                while char.isdigit():
                    lex += char
                    char = next(it_char, "")
            if "." in lex:
                yield "FLOAT", lex
            else:
                yield "INT", lex
        else:
            raise SyntaxError(char)

To use that, you can proceed as follows:

import io

content = """\
10 + 12.5 / 18
(8 + 3.14)
"""

# Use a text stream: iterating it yields one-character strings, which is
# what lexer() works on.  io.BytesIO(content) raises TypeError on Python 3
# because content is a str, not bytes.  (On Python 2, io.StringIO needs a
# unicode literal, i.e. u"""...""".)
file_obj = io.StringIO(content)

for token in lexer(parser_file(file_obj)):
    print(token)

You get:

('INT', '10')
('ADD', '+')
('FLOAT', '12.5')
('DIV', '/')
('INT', '18')
('LPARA', '(')
('INT', '8')
('ADD', '+')
('FLOAT', '3.14')
('RPARA', ')')

You can use a real file of course.

For your parser: use a stack to build the abstract syntax tree and evaluate it.

I'm sorry, it's too long to explain and it's irrelevant on SO, consider posting on Code Review.

How do I re-write/modify my current Python interpreter's grammar structure, not too good with Python?

1 Answers1