{- CSci 450: Organization of Programming Languages
   Imperative Core Language, Parser
   Fall 2017 
   H. Conrad Cunningham

1234567890123456789012345678901234567890123456789012345678901234567890

2017-09-14: Based partly on the Expression Language prefix parser
            code, but modified for the Kamin/Ramsey Imperative Core
            language syntax
2017-09-21: Changed import of ValueImpCore to Values

The intention of this module is to encapsulate the implementation of
parsing from other modules. The input is a string consisting of an
ImpCore script written in the concrete syntax. The output is an
abstract syntax tree for the program.

The structure of the abstract syntax is defined in module
AbSynImpCore, but its semantics is known by the current module and
others that manipulate the abstract syntax trees.

Similarly, this module uses lexical analysis module to convert the
input sequence to be a sequence of tokens. The definition of the Token
type is defined in the lexical module but its semantics is shared
between the two modules.

This implementation provides a recursive descent parser for the
following grammar.

Imperative Core Language Grammar:

    <script>    ::= {<def>}
    <def>       ::= <valdef>
--                | <usedef>  -- moved to REPL command
                  | <fundef>
                  | <expr>
    <valdef>    ::= "(" "val" <varname>  <expr> ")"
    <fundef>    ::= "(" "define" <funcname> <formals> <expr> ")"
--  <usedef>    ::= "(" "use" <filename> ")"
    <formals>   ::= "(" {<varname>} ")"
    <expr>      ::= <value>
                  | <varref>
                  | <setexpr>
                  | <ifexpr>
                  | <whexpr>
                  | <begexpr>
                  | <appexpr> 
    <setexpr>   ::= "(" "set" <varname> <expr> ")"
    <ifexpr>    ::= "(" "if" <expr> <expr> <expr> ")"
    <whexpr>    ::= "(" "while" <expr> <expr> ")"
    <begexpr>   ::= "(" "begin" {<expr>} ")"
    <appexpr>   ::= "(" <function> {<expr>} ")"
    <function>  ::= <primitive>
                  | <funcname>
    <value>     ::= ["-" | "+"] <unsigned>
    <varref>    ::= <varname>
    <varname>   ::= <name> for global variables and formal parameters
    <primitive> ::= <name> for the builtin primitive operations
    <funcname>  ::= <name> for user-defined functions

The supported primitive operations are +, -, *, /, ==, <, >, and
print.


-}

module ParseImpCore
    ( ValType, Name, ParErr, Def(..), Expr(..), trimComment, 
      parse, parseMany,
      parseScript, parseDef, parseExpr
    )
where

-- Haskell libraries
import Data.Char ( isSpace, isDigit )

-- Imperative Core modules
import Values ( NumType, ValType, Name )
import LexImpCore   ( Token(..), lexx )
import AbSynImpCore ( Def(..), Expr(..) )

-- Type definitions
type ParErr = String

-- Maximum prefix of the token sequence displayed for errors
errpref = 10


{-  Function "trimComment" trims an end-of-line comment from a line
    of text input. End-of-line comments begin with "beginComment".
-}

beginComment = ';'

trimComment :: String -> String
trimComment = takeWhile (/= beginComment)


{-  Function "getName" takes a string and returns a Just
    wrapping a Name if it is a valid identifier or a Nothing
    if any non-identifier characters occur.
-}

getName :: String -> Maybe Name
getName xs = 
    case getNameFront xs of
        (xs@(_:_),[]) -> Just xs
        otherwise     -> Nothing


{-  Function "getNameFront" extracts a name from the beginning of
    a string and returns the name and the remaining string.
-}

getNameFront :: String -> (Name,String)
getNameFront []  = ([],[])
getNameFront xs = span isNameChar xs
    where
      nonNameChars = ['(',')',';']
      isNameChar c = not (isSpace c || elem c nonNameChars)


{-  Function "getValue"takes a string and returns a Just wrapping
    the value if it is a valid value literal and returns a
    Nothing if the argument is not a valid literal.
-}

getValue :: String -> Maybe ValType
getValue xs =
    case getInt xs of
        (xs@(_:_),[]) -> Just (read xs)
        otherwise     -> Nothing


{-  Function "getInt" takes a string and extracts an integer
    literal from the beginning of a string and returns the integer
    literal and the remaining string.
-}

getInt :: String -> (String,String)
getInt xs@(x:xs')
    | isDigit x = span isDigit xs
    | x == '+'  = span isDigit xs'
    | x == '-'  = let (ys,zs) = span isDigit xs'
                  in  (x:ys,zs)
    | otherwise = ([],xs)


{-  PARSING  -}

{-  Function "parse" takes an input, processes it with the lexical
    analyzer "lexx" and the recursive descent prefix parser "parseDef"
    for one definition, and returns an Either item wrapping the
    Def abstract syntax tree or an error message. There must not be
    extra nonspace charaters at the end of the input.
-}

parse :: String -> Either ParErr Def
parse xs =
    case parseDef (lexx xs) of
        (ex@(Right _), []) -> ex
        (ex@(Left  _), _ ) -> ex
        (ex@(Right _), ys) ->
          Left (extraAtEndDef ys)

extraAtEndDef ts = "Extra tokens at end of definition " ++  show ts


{-  Function "parseMany" takes an input, processes it with the lexical
    analyzer "lexx" and the recursive descent prefix parser
    "parseScript" for zero or more definitions, and returns an
    Either item wrapping the list of Def abstract syntax trees
    or an error message. There must not be extra nonspace charaters
    at the end of the input.
-}

parseMany :: String -> Either ParErr [Def]
parseMany xs =
    case parseScript (lexx xs) of
        (ex@(Right _), []) -> ex
        (ex@(Left  _), _ ) -> ex
        (ex@(Right _), ys) -> 
          Left (extraAtEndDef ys)

extraAtEndScript ts = "Extra tokens at end of script " ++  show ts


{-  Function "parseScript" parses the grammar rule:

        <script> ::= {<def>}
-}

parseScript :: [Token] -> (Either ParErr [Def], [Token])
parseScript [] = (Right [], [])
parseScript xs =
    case parseDef xs of 
        (Right d, ys) -> 
            case parseScript ys of 
               (Right ds, zs) -> (Right (d:ds), zs)
               l@(Left _, _ ) -> l
        (Left err, zs) -> (Left err, zs) -- left/right types differ


{-  Function "parseDef" parses the grammar rule:

         <def> ::= <valdef> | <fundef> | <expr>

    Note: Because "val" and "define" are not syntactically eliminated
    from the names of variables or functions, the attempt to parse
    <expr> must occur after the other alternatives to get the desired
    result.
-}

parseDef :: [Token] -> (Either ParErr Def, [Token])
parseDef xs =
    case parseValDef xs of
        r@(Right _, _) -> r  -- <valdef>
        _ ->
            case parseFunDef xs of
                r@(Right _, _) -> r  -- <fundef>
                _ ->
--                  case parseUseDef xs of   -- Moved to REPL command
--                      r@(Right _, _) -> r  -- <usedef>
--                      _              -> 
                            case parseExpr xs of
                                (Right e, ys) ->
                                    (Right (Top e), ys)  -- <expr> 
                                _ -> (missingDef xs, xs)

missingDef ts = Left ("No definition <def> found beginning at "
                      ++ show (take errpref ts))


{-  Function "parseValDef" parses the following grammar rule:

        <valdef> ::= "(" "val" <varname>  <expr> ")"

-}

parseValDef :: [Token] -> (Either ParErr Def, [Token])
parseValDef xs@(TokLeft:(TokName "val"):(TokName vn):ys) =
    case parseExpr ys of
        (Right e, zs) ->
            case zs of
                (TokRight:zs') -> (Right (Val vn e), zs')
                _ -> (missingRightVal xs, xs)
        _ -> (missingInitVal xs, xs)
parseValDef xs =
    (wrongPrefixVal xs, xs)


wrongPrefixVal ts =
    Left ("Incorrect start to global variable definition <valdef> at "
          ++ show (take errpref ts))
missingInitVal ts =
    Left (
      "Missing initializer on global variable definition <valdef> at "
       ++ show (take errpref ts))
missingRightVal ts =
    Left ("Missing ')' on global variable definition <valdef> at "
    ++ show (take errpref ts))


{-  Function "parseFunDef" parses the following grammar rule:

        <fundef> ::= "(" "define" <funcname> <formals> <expr> ")"
-}

parseFunDef :: [Token] -> (Either ParErr Def, [Token])
parseFunDef xs@(TokLeft:(TokName "define"):(TokName fn):ys) =
    case parseFormals ys of
        r@(Right ps, zs) -> 
            case parseExpr zs of
                r@(Right be, zs') ->
                    case zs' of
                        (TokRight:zs'') ->
                            (Right (Define fn ps be), zs'')
                        _ ->
                            (missingRightFun xs, xs)
                _ -> (missingBodyFun xs, xs)
        _ -> (wrongParmFun xs, xs)
parseFunDef xs =
    (wrongPrefixFun xs, xs)

wrongPrefixFun ts =
    Left ("Incorrect start to function definition <fundef> at "
          ++ show (take errpref ts))
wrongParmFun ts =
    Left ("Incorrect parameter list on function defintion <fundef> at "
          ++ show (take errpref ts))
missingBodyFun ts =
    Left ("Missing body expression on function defintion <fundef> at "
          ++ show (take errpref ts))
missingRightFun ts =
    Left ("Missing ')' on function defintion <fundef> at "
          ++ show (take errpref ts))


{-  Function "parseFormals" parses the grammar rule:

        <formals>  ::= "(" { <varname> } ")"

    It does so by factoring out the repetition, giving the rules:

        <formals>  ::= "(" <namelist> ")"
        <namelist> ::= { <varname> }

    The "parseNameList" function parses the <namelist> rule.

     Note: The result types on these functions differ from the Def and
     Expr used in most other parsing functions in this module.
-}

parseFormals :: [Token] -> (Either ParErr [Name], [Token])
parseFormals xs@(TokLeft:ys) =
    case parseNameList ys of
        (ps, TokRight:zs) -> (Right ps, zs)
        _                 -> (missingRightFormals xs, xs)
parseFormals xs =
    (missingLeftFormals xs, xs)
                  
parseNameList :: [Token] -> ([Name],[Token])
parseNameList ((TokName vn):xs) =
    let (vns,ys) = parseNameList xs
    in  (vn:vns,ys)
parseNameList xs =
    ([],xs)

missingLeftFormals ts  = Left ("Missing '(' on parameter list at "
                              ++ show (take errpref ts))
missingRightFormals ts = Left ("Missing ')' on parameter list at "
                              ++ show (take errpref ts))


{-  Function "parseUseDef" parses the following grammar rule:

        <usedef> ::= "(" "use" <filename> ")"

-}

{-  Moved to REPL command
parseUseDef :: [Token] -> (Either ParErr Def, [Token])
parseUseDef (TokLeft:(TokName "use"):(TokName fn):TokRight:ys) =
    (Right (Use fn), ys)
parseUseDef xs =
    (wrongPrefixUse xs, xs)

wrongPrefixUse ts =
    Left ("Incorrect start to file use definition <usedef> at "
          ++ show ts)
-}

{-  Function "parseExpr" parses <expr> from the set of rules:

        <expr>     ::= <value>  | <varref> | <setexpr>
                     | <ifexpr> | <whexpr>  | <begexpr> | <appexpr>
        <ifexpr>   ::= "(" "if" <expr> <expr> <expr> ")"
        <whexpr>   ::= "(" "while" <expr> <expr> ")"
        <begexpr>  ::= "(" "begin" {<expr>} ")"
        <appexpr>  ::= "(" <function> {<expr>} ")"
        <function> ::= <primitive> | <funcname>
        <varref>   ::= <varname>


    However, we recognize that <expr> alternatives <ifexpr>, <whexpr>,
    <begexpr>, and <appexpr> are similarly structured. So we refactor
    this set of rules as:
    
        <expr>     ::= <value>  | <varref> | <setexpr> | <opexpr> 
        <opexpr>   ::= "(" <op> {<expr>} ")"
        <op>       ::= "if" | "while" | "begin" | <function>
        <function> ::= <primitive> | <funcname>
        <varref>   ::= <varname>

    Note: Because "set" is not syntactically eliminated from being
    the name of a function, the attempt to parse <setexpr> must
    occur before <operexpr>.
-}

parseExpr :: [Token] -> (Either ParErr Expr,[Token])
parseExpr xs =
    case parseValue xs of
        r@(Right _, _) -> r
        _ ->
            case parseVarRef xs of
                r@(Right _, _) -> r
                _ ->
                    case parseSetExpr xs of
                        r@(Right _, _) -> r
                        _ ->
                            case parseOpExpr xs of
                                r@(Right _, _) -> r
                                _ ->
                                    (missingExpr xs, xs)

missingExpr ts =
    Left ("Missing expression <expr> at " ++ show (take errpref ts))

    
{-  Function "parseValue" parses the following grammar rule:

        <value> ::= ["-" | "+"] <unsigned>
-}

parseValue :: [Token] -> (Either ParErr Expr, [Token])
parseValue ((TokName "-"):(TokNum v):xs) =
    (Right (Lit (-v)), xs)
parseValue ((TokName "+"):(TokNum v):xs) =
    (Right (Lit v), xs)
parseValue ((TokNum v):xs) =
    (Right (Lit v), xs)
parseValue xs =
    (missingLit xs, xs)

missingLit ts =
    Left ("Missing literal vallue at " ++ show (take errpref ts))


{-  Function "parseVarRef" parses the following grammar rule:

        <varref> ::= <varname>
-}

parseVarRef :: [Token] -> (Either ParErr Expr, [Token])
parseVarRef ((TokName vn):xs) =
    (Right (Var vn), xs)
parseVarRef xs =
    (missingVarRef xs, xs)

missingVarRef ts =
    Left ("Missing variable reference <varref> at " ++ show ts)


{-  Function "parseSetExpr" parses the following rule:

        <setexpr> ::= "(" "set" <varname> <expr> ")"
-}

parseSetExpr :: [Token] -> (Either ParErr Expr, [Token])
parseSetExpr xs@(TokLeft:(TokName "set"):(TokName vn):ys) =
    case parseExpr ys of
        (Right e, zs) ->
            case zs of
                (TokRight:zs') -> (Right (Set vn e), zs')
                _              -> (missingRightSet xs, xs)
        _ -> (missingInitSet xs, xs)
parseSetExpr xs =
    (wrongPrefixSet xs, xs)

wrongPrefixSet ts =
    Left ("Incorrect start to set exoression <setexpr> at "
          ++ show (take errpref ts))
missingInitSet ts =
    Left ("Missing initial value on set expression <setexpr> at "
          ++ show  (take errpref ts))
missingRightSet ts =
    Left ("Missing ')' on set expression <setexpr> at " ++
           show (take errpref ts))


{-  Function "parseOpExpr" parses the following refactored rules
    (from the "parseExpr" step):

       <opexpr> ::= "(" <op> {<expr>})
       <op>     ::= "if" | "while" | "begin" | <function>

    After parsing, we distinguish between primitive and user-defined
    functions, i.e., parse the rule

        <function> ::= <primitive> | <funcname>

    and ensure that the length of the operand expression list is
    appropriate for the primitives.
-}

parseOpExpr :: [Token] -> (Either ParErr Expr, [Token])
parseOpExpr xs@(TokLeft:(TokName op):ys) =
  case parseOperandList ys of
        (exs, TokRight:zs) -> (makeOpExpr op exs, zs)
        _                  -> (missingRightOpExpr xs, xs)
parseOpExpr xs =
    (missingLeftOpExpr xs, xs)

missingLeftOpExpr ts =
    Left ("Missing '(' on parenthesized operator expression at "
          ++ show (take errpref ts))
missingRightOpExpr ts =
    Left ("Missing ')' on parenthesized operator expression at "
          ++ show (take errpref ts))


{-  Function "parseOperandList" takes a token list and collects a list
    of 0 or more operand expressions. An empty list means that no
    operands were found.
-}

parseOperandList:: [Token] -> ([Expr],[Token])
parseOperandList xs =
    case parseExpr xs of 
        (Left   _, _ ) -> ([],xs)
        (Right ex, ys) ->
            let (exs,zs) = parseOperandList ys
            in  (ex:exs,zs)


{-  Function "makeExpr" takes the operator string and a list of
    operand expressions and constructs an appropirate Expr.
-}

makeOpExpr :: Name -> [Expr] -> Either ParErr Expr

makeOpExpr "if" exs =
    case length exs of
        3 -> Right (If (exs!!0) (exs!!1) (exs!!2))
        l -> Left ("If expression has wrong number of operands: "
                  ++ show l)

makeOpExpr "while" exs =
    case length exs of
        2 -> Right (While (exs!!0) (exs!!1))
        l -> Left ("While expression has wrong number of operands: "
                  ++ show l)

makeOpExpr "begin" exs = Right (Begin exs)

makeOpExpr op exs =  -- primitive and user-defined functions
    case lookup op primitiveArity of
        Nothing -> Right (Apply op exs)  -- user-defined function
        Just 1  -> primitive1 op exs     -- unary primitive
        Just 2  -> primitive2 op exs     -- binary primitive
        Just n  ->
            Left ("Use of unsupported primitive operator '" ++ op
                   ++ "' with " ++ show n ++ "arguments")

primitiveArity = [ ("+", 2), ("-",2), ("*",2), ("/",2),
                   ("==",2), ("<",2), (">",2),("print",1)
                 ]


{-  Function "primitive1" takes a unary operator string and an
    operand list with one element and returns the corresponding Expr
    structure wrapped in a Right. An error is denoted by passing back
    a Left.
-}

assocCons1 = [ ("print",Print) ]

primitive1 :: String -> [Expr] -> Either ParErr Expr
primitive1 op exs =
    case length exs of
        1 -> case lookup op assocCons1 of
                Just c  -> Right (c (exs!!0))
                Nothing -> invalidOp op
        n -> arityErr op n


{-  Function "primitive2" takes a binary operator string and an
    operand list with two elements and returns the corresponding Expr
    structure wrapped in a Right. An error is denoted by passing back
    a Left.
-}

assocCons2 =
  [ ("+",Add), ("-",Sub), ("*",Mul), ("/",Div),
    ("==",Eq),("<",Lt),(">",Gt) -- while?
  ]

primitive2 :: String -> [Expr] -> Either ParErr Expr
primitive2 op exs =
    case length exs of
        2 -> case lookup op assocCons2 of
                Just c  -> Right (c (exs!!0) (exs!!1))
                Nothing -> invalidOp op
        n -> arityErr op n

invalidOp op = Left ("Use of invalid operator '" ++ op ++ "'")
arityErr op n =
    Left ("Use of primitive operator '" ++ op ++
          "' with incorrect number of arguments (" ++ show n ++ ")")
