8

I would like to parse a basic indented language using Megaparsec. Originally I was using Parsec which I managed to get working correctly with indentation but now I'm having quite some trouble.

I've been following a tutorial here and here's the code I have to parse a language ignoring indentation.

module Parser where

import           Data.Functor                  ((<$>), (<$))
import           Control.Applicative           (Applicative(..))
import qualified Control.Monad                 as M
import Control.Monad (void)
import           Data.Functor.Identity
import           Data.Text                     (Text)
import qualified Data.Text                     as Text

import Data.Void

import Text.Megaparsec
import Text.Megaparsec.Char
import Text.Megaparsec.Perm
import Text.Megaparsec.Expr
import qualified Text.Megaparsec.Char.Lexer as L

import Text.Pretty.Simple
import Data.Either.Unwrap

--import Lexer
import Syntax

type Parser = Parsec Void String

lineComment :: Parser ()
lineComment = L.skipLineComment "#"

scn :: Parser ()
scn = L.space space1 lineComment empty

sc :: Parser () -- ‘sc’ stands for “space consumer”
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
  where
    f x = x == ' ' || x == '\t'

lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc

symbol :: String -> Parser String
symbol = L.symbol sc

integer :: Parser Integer
integer = lexeme L.decimal


semi :: Parser String
semi = symbol ";"

rword :: String -> Parser ()
rword w = lexeme (string w *> notFollowedBy alphaNumChar)

rws :: [String] -- list of reserved words
rws = ["if","then","else","while","do","skip","true","false","not","and","or"]

identifier :: Parser String
identifier = (lexeme . try) (p >>= check)
  where
    p       = (:) <$> letterChar <*> many alphaNumChar
    check x = if x `elem` rws
                then fail $ "keyword " ++ show x ++ " cannot be an identifier"
                else return x


parens :: Parser a -> Parser a
parens = between (symbol "(") (symbol ")")


whileParser :: Parser Stmt
whileParser = between sc eof stmt

stmt :: Parser Stmt
stmt = f <$> sepBy1 stmt' semi
  where
    -- if there's only one stmt return it without using ‘Seq’
    f l = if length l == 1 then head l else Seq l

stmt' :: Parser Stmt
stmt' = ifStmt
  <|> whileStmt
  <|> skipStmt
  <|> assignStmt
  <|> parens stmt

ifStmt :: Parser Stmt
ifStmt = do
    rword "if"
    cond  <- bExpr
    rword "then"
    stmt1 <- stmt
    rword "else"
    stmt2 <- stmt
    return (If cond stmt1 stmt2)

whileStmt :: Parser Stmt
whileStmt = do
  rword "while"
  cond <- bExpr
  rword "do"
  stmt1 <- stmt
  return (While cond stmt1)

assignStmt :: Parser Stmt
assignStmt = do
  var  <- identifier
  void (symbol ":=")
  expr <- aExpr
  return (Assign var expr)

skipStmt :: Parser Stmt
skipStmt = Skip <$ rword "skip"

aExpr :: Parser AExpr
aExpr = makeExprParser aTerm aOperators

bExpr :: Parser BExpr
bExpr = makeExprParser bTerm bOperators

aOperators :: [[Operator Parser AExpr]]
aOperators =
  [ [Prefix (Neg <$ symbol "-") ]
  , [ InfixL (ABinary Multiply <$ symbol "*")
    , InfixL (ABinary Divide   <$ symbol "/") ]
  , [ InfixL (ABinary Add      <$ symbol "+")
    , InfixL (ABinary Subtract <$ symbol "-") ]
  ]

bOperators :: [[Operator Parser BExpr]]
bOperators =
  [ [Prefix (Not <$ rword "not") ]
  , [InfixL (BBinary And <$ rword "and")
    , InfixL (BBinary Or <$ rword "or") ]
  ]

aTerm :: Parser AExpr
aTerm = parens aExpr
  <|> Var      <$> identifier
  <|> IntConst <$> integer

bTerm :: Parser BExpr
bTerm =  parens bExpr
  <|> (BoolConst True  <$ rword "true")
  <|> (BoolConst False <$ rword "false")
  <|> rExpr

rExpr :: Parser BExpr
rExpr = do
  a1 <- aExpr
  op <- relation
  a2 <- aExpr
  return (RBinary op a1 a2)

relation :: Parser RBinOp
relation = (symbol ">" *> pure Greater)
  <|> (symbol "<" *> pure Less)

parsePrint :: String -> IO()
parsePrint s = do
    parseTest stmt' s

Running this parses correctly.

parsePrint $ unlines
[ "while (true) do if(false) then x := 5 else y := 20"
]

This is the code for parsing indentation from the second tutorial here.

{-# LANGUAGE TupleSections #-}

module Main where

import Control.Applicative (empty)
import Control.Monad (void)
import Data.Void
import Data.Char (isAlphaNum)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L

type Parser = Parsec Void String

lineComment :: Parser ()
lineComment = L.skipLineComment "#"

scn :: Parser ()
scn = L.space space1 lineComment empty

sc :: Parser ()
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
  where
    f x = x == ' ' || x == '\t'

lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc

pItem :: Parser String
pItem = lexeme (takeWhile1P Nothing f) <?> "list item"
  where
    f x = isAlphaNum x || x == '-'

pComplexItem :: Parser (String, [String])
pComplexItem = L.indentBlock scn p
  where
    p = do
      header <- pItem
      return (L.IndentMany Nothing (return . (header, )) pLineFold)

pLineFold :: Parser String
pLineFold = L.lineFold scn $ \sc' ->
  let ps = takeWhile1P Nothing f `sepBy1` try sc'
      f x = isAlphaNum x || x == '-'
  in unwords <$> ps <* sc

pItemList :: Parser (String, [(String, [String])])
pItemList = L.nonIndented scn (L.indentBlock scn p)
  where
    p = do
      header <- pItem
      return (L.IndentSome Nothing (return . (header, )) pComplexItem)

parser :: Parser (String, [(String, [String])])
parser = pItemList <* eof

main :: IO ()
main = return ()

I would like as an example for this to parse correctly.

parsePrint $ unlines
[ "while (true) do" 
, "    if(false) then x := 5 else y := 20"
]

How could I parse indentation correctly? Also are there any other places with tutorials/documentation on using Megaparsec?

Michael
  • 3,411
  • 4
  • 25
  • 56
  • Clearly you must combine the approach presented in the indentation tutorial with your first program. If you're having trouble with that, include your attempts in the question. Also note that the indentation parser from the tutorial would not accept input on a 2nd line which isn't indented (e.g. `"foo\nbar"` doesn't parse, but `"foo\n bar"` does). More importantly, parsing such programs isn't the goal of an indentation sensitive parser (since there is no indentation in your example program). – user2407038 Jan 16 '18 at 00:55
  • Okay I can add my attempts later today. Sorry in my last example I didn't include indentation... I'll add it in. – Michael Jan 16 '18 at 10:31
  • In terms of adding examples of what I've done, I'm struggling getting my head around it to actually have any good viable attempts. So I feel that it wouldn't be of use unfortunately. I could grasp the concepts of indentation using Parsec but I'm struggling with this. Also these two files are the only two I could find demoing how to use indentation in Megaparsec so it's making it more difficult. – Michael Jan 16 '18 at 10:36

1 Answers1

6

After spending a lot of time on this over the last couple of weeks I managed to work it out. It was a matter of moving from using strings to using my own "Expr" data type.
For anybody else who would like to start writing an indented language this code could be a good start!

Parser

{-# LANGUAGE TupleSections #-}

module IndentTest where

import Control.Applicative (empty)
import Control.Monad (void)
import Data.Void
import Data.Char (isAlphaNum)
import Text.Megaparsec
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
import Text.Megaparsec.Expr

import Block

type Parser = Parsec Void String

-- Tokens


lineComment :: Parser ()
lineComment = L.skipLineComment "#"


scn :: Parser ()
scn = L.space space1 lineComment empty


sc :: Parser ()
sc = L.space (void $ takeWhile1P Nothing f) lineComment empty
  where
    f x = x == ' ' || x == '\t'


symbol :: String -> Parser String
symbol = L.symbol sc


rword :: String -> Parser ()
rword w = lexeme (string w *> notFollowedBy alphaNumChar)


rws :: [String] -- list of reserved words
rws = ["module", "println", "import",  "let", "if","then","else","while","do","skip","true","false","not","and","or"]


word :: Parser String
word = (lexeme . try) (p >>= check)
  where
    p       = (:) <$> alphaNumChar <*> many alphaNumChar
    check x = if x `elem` rws
                then fail $ "keyword " ++ show x ++ " cannot be an word"
                else return x

lexeme :: Parser a -> Parser a
lexeme = L.lexeme sc


integer :: Parser Integer
integer = lexeme L.decimal


parens :: Parser a -> Parser a
parens = between (symbol "(") (symbol ")")


aTerm :: Parser AExpr
aTerm = parens aExpr
  <|> Var      <$> identifier
  <|> IntConst <$> integer


aOperators :: [[Operator Parser AExpr]]
aOperators =
  [ [Prefix (Neg <$ symbol "-") ]
  , [ InfixL (ABinary Multiply <$ symbol "*")
    , InfixL (ABinary Divide   <$ symbol "/") ]
  , [ InfixL (ABinary Add      <$ symbol "+")
    , InfixL (ABinary Subtract <$ symbol "-") ]
  ]


aExpr :: Parser AExpr
aExpr = makeExprParser aTerm aOperators


assignArith :: Parser Expr
assignArith = do
  var  <- identifier
  symbol ":"
  vType <- valType
  symbol "="
  e <- aExpr
  return $ AssignArith vType var e


bTerm :: Parser BExpr
bTerm =  parens bExpr
  <|> (BoolConst True  <$ rword "true")
  <|> (BoolConst False <$ rword "false")
  <|> rExpr


bOperators :: [[Operator Parser BExpr]]
bOperators =
  [ [Prefix (Not <$ rword "not") ]
  , [InfixL (BBinary And <$ rword "and")
    , InfixL (BBinary Or <$ rword "or") ]
  ]


bExpr :: Parser BExpr
bExpr = makeExprParser bTerm bOperators


rExpr :: Parser BExpr
rExpr = do
  a1 <- aExpr
  op <- relation
  a2 <- aExpr
  return (RBinary op a1 a2)


relation :: Parser RBinOp
relation = (symbol ">" *> pure Greater)
  <|> (symbol "<" *> pure Less)


identifier :: Parser String
identifier = (lexeme . try) (p >>= check)
  where
    p       = (:) <$> letterChar <*> many alphaNumChar
    check x = if x `elem` rws
                then fail $ "keyword " ++ show x ++ " cannot be an identifier"
                else return x


stringLiteral :: Parser Expr
stringLiteral = do
  value <- char '"' >> manyTill L.charLiteral (char '"')
  symbol ";"
  return $ StringLiteral value


assignString :: Parser Expr
assignString = do
  var  <- identifier
  symbol ":"
  vType <- valType
  symbol "="
  e <- stringLiteral
  return (AssignString vType var e)


arrayDef :: Parser Expr
arrayDef = do
  name <- identifier
  symbol ":"

  symbol "["
  arrType <- word
  symbol "]"

  symbol "="
  return $ ArrayDef arrType name

arrayValues :: Parser Expr
arrayValues = do
  symbol "["
  values <- many identifier
  symbol "]"
  return $ ArrayValues values

arrayAssign :: Parser Expr
arrayAssign = do
  def <- arrayDef
  values <- arrayValues
  return $ ArrayAssignment def values

arrayElementSelect :: Parser Expr
arrayElementSelect = do
  symbol "!!"
  elementNum <- word
  return $ ArrayElementSelect elementNum


moduleParser :: Parser Expr
moduleParser = L.nonIndented scn (L.indentBlock scn p)
  where
    p = do
      rword "module"
      name <- identifier
      return (L.IndentSome Nothing (return . (Module name)) expr')


valType :: Parser Expr
valType = do
    value <- identifier
    return $ Type value


argumentType :: Parser Expr
argumentType = do
    value <- identifier
    return $ ArgumentType value


returnType :: Parser Expr
returnType = do
    value <- identifier
    return $ ReturnType value


argument :: Parser Expr
argument = do
  value <- identifier
  return $ Argument value


-- Function parser
functionParser :: Parser Expr
functionParser = L.indentBlock scn p
  where
    p = do
      name <- identifier
      symbol ":"
      argTypes <- some argumentType
      symbol "->"
      rType <- IndentTest.returnType
      nameDup <- L.lineFold scn $ \sp' ->
        (identifier) `sepBy1` try sp' <* scn
      args <- many argument
      symbol "="
      if(name == "main") then
          return (L.IndentMany Nothing (return . (MainFunction name argTypes args rType)) expr')
      else
          return (L.IndentMany Nothing (return . (Function name argTypes args rType)) expr')



functionCallParser :: Parser Expr
functionCallParser = do
  name <- identifier
  args <- parens $ many argument
  return $ FunctionCall name args


printParser :: Parser Expr
printParser = do
  rword "println"
  bodyArr <- identifier
  symbol ";"
  return $ Print bodyArr


valueToken :: Parser String
valueToken = lexeme (takeWhile1P Nothing f) <?> "list item"
  where
    f x = isAlphaNum x || x == '-'


ifStmt :: Parser Expr
ifStmt = L.indentBlock scn p
   where
     p = do
       rword "if"
       cond  <- bExpr
       return (L.IndentMany Nothing (return . (If cond)) expr')

elseStmt :: Parser Expr
elseStmt = L.indentBlock scn p
   where
     p = do
       rword "else"
       return (L.IndentMany Nothing (return . (Else)) expr')

whereStmt :: Parser Expr
whereStmt = do
  rword "where"
  symbol "{"
  exprs <- many expr
  symbol "}"
  return $ (Where exprs)


expr :: Parser Expr
expr = f <$> sepBy1 expr' (symbol ";")
  where
    -- if there's only one expr return it without using ‘Seq’
    f l = if length l == 1 then head l else Seq l


expr' :: Parser Expr
expr' = try moduleParser
  <|> try functionParser
  <|> try ifStmt
  <|> try elseStmt
  <|> try arrayAssign
  <|> arrayElementSelect
  <|> try assignArith
  <|> try functionCallParser
  <|> try assignString
  <|> try printParser
  <|> try whereStmt
  <|> try stringLiteral


parser :: Parser Expr
parser = expr'


parseFromFile file = runParser expr file <$> readFile file


parseString input =
  case parse expr' "" input of
    Left  e -> show e
    Right x -> show x


parsePrint :: String -> IO()
parsePrint s = parseTest' parser s

Block/Expr - The AST consists of this

module Block where

import Data.List
import Text.Show.Functions
import Data.Char
import Data.Maybe

-- Boolean expressions
data BExpr
  = BoolConst Bool
  | Not BExpr
  | BBinary BBinOp BExpr BExpr
  | RBinary RBinOp AExpr AExpr

instance Show BExpr where
    show (BoolConst b) = lowerString $ show b
    show (Not n) = show n
    show (BBinary bbinop bExpr1 bExpr2) = show bExpr1 ++ " " ++ show bbinop ++ " " ++ show bExpr2
    show (RBinary rbinop aExpr1 aExpr2) = show aExpr1 ++ " " ++ show rbinop ++ " " ++ show aExpr2


-- Boolean ops
data BBinOp
  = And
  | Or

instance Show BBinOp where
    show (And) = "&&"
    show (Or) = "||"

-- R binary ops
data RBinOp
  = Greater
  | Less

instance Show RBinOp where
    show (Greater) = ">"
    show (Less) = "<"

-- Arithmetic expressions
data AExpr
  = Var String
  | IntConst Integer
  | Neg AExpr
  | ABinary ABinOp AExpr AExpr
  | Parenthesis AExpr

instance Show AExpr where
    show (Var v) = v
    show (IntConst i) = show i
    show (Neg aExpr) = "-" ++ show aExpr
    show (ABinary aBinOp aExpr1 aExpr2) = show aExpr1 ++ " " ++ show aBinOp ++ " " ++ show aExpr2
    show (Parenthesis aExpr) = "(" ++ show aExpr ++ ")"

-- Arithmetic ops
data ABinOp
  = OpeningParenthesis
  | ClosingParenthesis
  | Add
  | Subtract
  | Multiply
  | Divide

instance Show ABinOp where
    show (Add) = "+"
    show (Subtract) = "-"
    show (Multiply) = "*"
    show (Divide) = "/"
    show (OpeningParenthesis) = "("
    show (ClosingParenthesis) = ")"

-- Statements
data Expr
  = Seq [Expr]
  | Module String [Expr]
  | Import String String
  | MainFunction {name ::String, argTypes:: [Expr], args::[Expr], returnType::Expr, body::[Expr]}
  | Function String [Expr] [Expr] Expr [Expr]
  | FunctionCall String [Expr]
  | Type String
  | ValueType String
  | Argument String
  | ArgumentType String
  | ReturnType String
  | AssignArith Expr String AExpr
  | AssignString Expr String Expr
  | If BExpr [Expr]
  | Else [Expr]
  | While BExpr [Expr]
  | Print String
  | Return Expr
  | ArrayValues [String]
  | ArrayDef String String
  | ArrayAssignment Expr Expr
  | ArrayElementSelect String
  | Lambda String String
  | Where [Expr]
  | StringLiteral String
  | Skip

instance Show Expr where
    show (Module name bodyArray) =
        -- Get the main function tree

        "public class " ++ name ++ "{\n" ++
            "public static void main(String[] args){\n" ++
                name ++ " " ++ lowerString name ++ "= new " ++ name ++ "();\n" ++
                intercalate "\n" (map (\mStatement -> if(isFunctionCall mStatement) then (lowerString name ++ "." ++ show mStatement) else show mStatement) (body ((filter (isMainFunction) bodyArray)!!0))) ++
            "}\n" ++
            getFunctionString bodyArray ++
        "}\n"

    show (Import directory moduleName) = "import " ++ directory ++ moduleName
    show (Function name argTypes args returnType body) = "public " ++ show returnType ++ " " ++ name ++ "("++ intercalate ", " (zipWith (\x y -> x ++ " " ++ y) (map show argTypes) (map show args)) ++"){\n" ++ intercalate "\n" (map show body) ++ "}"
    show (MainFunction name argTypes args returnType body) =
        intercalate "\n " $ map show body
    show (FunctionCall name exprs) = name ++ "(" ++ (intercalate ", " (map show exprs)) ++ ");"
    show (Type b) = b
    show (Argument b) = b
    show (ArgumentType b) = b
    show (ReturnType b) = b
    show (AssignArith vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
    show (AssignString vType name value) = "" ++ show vType ++ " " ++ name ++ "=" ++ show value ++ ";"
    show (If condition statement) = "if(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
    show (Else statement) = " else {\n" ++ intercalate "\n" (map show statement) ++ "}"
    show (While condition statement) = "while(" ++ show condition ++ "){\n" ++ intercalate "\n" (map show statement) ++ "}"
    show (Skip) = "[skip]"
    show (Seq s) = "[seq]"
    show (Return expr) = "return " ++ show expr ++ ";"
    show (Print exprs) = "System.out.println(" ++ exprs ++ ");" --"System.out.println(" ++ intercalate " " (map show exprs) ++ ");"
    show (ArrayDef arrType name) = arrType ++ "[] " ++ name ++ "="
    show (ArrayValues exprs) = "{" ++ intercalate ", " exprs ++ "};"
    show (ArrayAssignment arr values) = show arr ++ show values
    show (ArrayElementSelect i) = "[" ++ i ++ "];"
    show (Lambda valName collectionName) = ""
    show (Where exprs) = intercalate "\n" (map show exprs)
    show (StringLiteral value) = "\"" ++ value ++ "\""
    show (_) = "<unknown>"

lowerString str = [ toLower loweredString | loweredString <- str]

extractMain :: Expr -> Maybe String
extractMain (MainFunction m _ _ _ _) = Just m
extractMain _ = Nothing

extractFunctionCall :: Expr -> Maybe String
extractFunctionCall (FunctionCall m _) = Just m
extractFunctionCall _ = Nothing

isMainFunction :: Expr -> Bool
isMainFunction e = isJust $ extractMain e

isFunctionCall :: Expr -> Bool
isFunctionCall e = isJust $ extractFunctionCall e

{--
getInnerMainFunctionString :: [Expr] -> String -> String
getInnerMainFunctionString e instanceName  = do
    if(isMainFunction (e!!0)) then
      show (e!!0)
    else
      getInnerMainFunctionString (drop 1 e) instanceName
--}
getFunctionString :: [Expr] -> String
getFunctionString e = do
    if(isMainFunction (e!!0)) then
      ""
    else
      "" ++ show (e!!0) ++ getFunctionString (drop 1 e)

Code Example

module IndentationTest
    testFunction : int -> void
    testFunction x =
        if(x < 50)
            println x;
            nextX :int = x + 1 * 2 - 3 / 2 + 5
            testFunction (nextX)
        else
            last :int = 1000
            println last;

    main : String -> IO
    main args =
        x :int = 3
        y :int = 10
        z :int = 15
        arrTest:[int] = [x y z]
        println arrTest;
        testFunction (x)
        stringTest :String = "Helloworld";

This will successfully parse the example code. Just pass it into the parsePrint function.

Michael
  • 3,411
  • 4
  • 25
  • 56