{-  Exploring Languages with Interpreters and Functional Programming
    ELI Calculator Language; Infix Parser
    Copyright (C) 2017, 2018, H. Conrad Cunningham

1234567890123456789012345678901234567890123456789012345678901234567890

1994-11-14: Developed expression recognizer for Gofer/Hugs Haskell
2006-02-08: Changed test to use IO monad (putStrLn)
2010-09-16: Added main program for testing
2017-04-05: V1--Revised to use Token type, modularized for
            Haskell 2010, based partly on a prefix interpreter
            from 2007-05-10, still just a recognizer
2017-05-23: V2--Significant rewrite following Prefix Parser V5,
            separated lexical analysis and testing modules,
            added support for AST generation,
            used Either for error handling
2017-07-06: V3--Significant rewrite to match grammar changes &
            recursive descent explanation in course notes
2017-09-20: Separated out AbSynExpr module. Modified to be
            similar to ParseImpCore. Renamed to ParseInfixExpr
2017-09-21: Added getName and getValue (from ParsePrefixExpr)
2017-11-05: Improved comments
2018-08-09: Updated for 2018 ELIFP textbook, Expression Language
            renamed ELI Calculator language, AbSynExpr renamed
            AbSynCalc

This module implements a simple recursive descent recognizer for the
following grammar.

       <expression> ::= <term> { <addop> <term> }
       <term>       ::= <factor> { <mulop> <factor> }
       <factor>     ::= <var> | <val>
                      | <leftparen> <expression> <rightparen>
       <val>        ::= [ <minus> ] <unsigned>
       <var>        ::= <id>
       <addop>      ::= <plus>  | <minus>
       <mulop>      ::= <times> | <div>

This parser supports two levels of precedence among operators --
addition and multiplication -- with right binding of operators. To
support other operators, a parser for a grammar with more levels will
be needed. Similarly, changes would be needed to support left-binding
operators.

TODO:

-  Make handling of operators more robust and flexible
   (avoid "error")
- Include better support for extension to operations "min", "max",
  general unary "neg", boolean ops, relational ops, sine, cosine, if
- Reimplement using a parser combinator library

-}

module ParseInfixCalc
    ( ValType, Name, ParErr, Expr(..), parse, parseExpression,
      trimComment, getName, getValue
    )
where

-- Haskell libraries
import Data.Maybe ( fromMaybe )
import Data.Char ( isSpace, isAlpha, isAlphaNum, isDigit )

-- Expression Language modules
import Values    ( NumType, ValType, Name )
import AbSynCalc ( Expr(..) )  -- uses ValType, Name
import LexCalc   ( Token(..), lexx, lexer, showTokens )
                 -- uses NumType, Name

-- Type definitions
type ParErr = String

-- Maximum prefix of the token sequence displayed for errors
errpref = 10

-- Trim a prefix of the Token list
pref xs = take errpref xs


{- PARSING -}

{-  Function "trimComment" trims an end-of-line comment from a line
    of text input. End-of-line comments begin with "beginComment".
-}

beginComment = ';'

trimComment :: String -> String
trimComment = takeWhile (/= beginComment)


{-  Function "getName" takes a string and returns a Just
    wrapping a Name if it is a valid identifier or a Nothing
    if any non-identifier characters occur.
-}

getName :: String -> Maybe Name
getName xs = 
    case getId xs of
        (xs@(_:_),[]) -> Just xs
        otherwise     -> Nothing


{-  Function "getId" extracts an identifer from the beginning
    of a string and returns the identifier and the remaining string.
-}

getId :: String -> (Name,String)
getId []          = ([],[])
getId xs@(x:_)
    | isFirstId x = span isRestId xs
    | otherwise   = ([],xs)
    where
        isFirstId c = isAlpha c    || c == '_'
        isRestId  c = isAlphaNum c || c == '_'


{-  Function "getValue"takes a string and returns a Just wrapping
    the value if it is a valid value literal and returns a
    Nothing if the argument is not a valid literal.
-}

getValue :: String -> Maybe ValType
getValue xs =
    case getInt xs of
        (xs@(_:_),[]) -> Just (read xs)
        otherwise     -> Nothing


{-  Function "getInt" takes a string and extracts an integer
    literal from the beginning of a string and returns the integer
    literal and the remaining string.
-}

getInt :: String -> (String,String)
getInt xs@(x:xs')
    | isDigit x = span isDigit xs
    | x == '+'  = span isDigit xs'
    | x == '-'  = let (ys,zs) = span isDigit xs'
                  in  (x:ys,zs)
    | otherwise = ([],xs)


{- FULL GRAMMAR PARSING  -}

{- Function "parse" takes an input expression, processes it with the
   lexical analyzer "lexer" and the recursive descent infix parser
   "parseExpression", and returns an Either item wrapping the
   Expr abstract syntax tree or an error message.
-}

parse :: String -> Either ParErr Expr
parse xs =
    case lexer xs of
        [] -> incompleteExpr xs
        ts ->
            case parseExpression ts of
               (ex@(Right _), []) -> ex
               (ex@(Left  _), _ ) -> ex
               (ex, ss)           -> extraAtEnd ex ss

incompleteExpr xs =
    Left ("Incomplete expression: " ++ xs)

extraAtEnd ex xs =
   Left ("Nonspace token(s) \"" ++ (showTokens xs) ++ 
         "\" at end of the expression \"" ++ (show ex) ++ "\"")


{- Function "parseExpression" takes a Token list, parses an
   <expression>, and returns a pair consisting of an Either wrapping
   the Expr abstract syntax tree found and the list of Tokens
   remaining after the <expression>.  An error is denoted by returning
   the Left value for the Either.

   We can refactor rule

       <expression> ::= <term> { <addop> <term> }

   to better match the recursive descent prototypes, as follows:

       <expression> ::= <term> <moreterms>
       <moreterms>  ::= { <addterm }
       <addterm>    ::= <addop> <term>

   We include a function for each rule.
-}

-- Sequence: <expression> ::= <term> <moreterms>
parseExpression :: [Token] -> (Either ParErr Expr, [Token])
parseExpression xs =
    case parseTerm xs of 
        (ex@(Right e1), ys) ->  -- <term>
            let (terms, zs) = parseMoreTerms ys  -- <moreterms>
            in  (Right (makeBinOpSeq e1 terms), zs)
        (err@(Left _), _ )  -> (err, xs)


-- Repetition: <moreterms> ::= { <addterm }
parseMoreTerms :: [Token] -> ([(String,Expr)], [Token])
parseMoreTerms xs =
    case parseAddTerm xs of
        (Right (op,ex), ys) ->   -- has <addterm>
            let (terms, zs) = parseMoreTerms ys
            in  (((op,ex):terms), zs)
        (Left _,  _) -> ([], xs) -- no <addterm>


-- Sequence: <addterm> ::= <addop> <term>
parseAddTerm :: [Token] -> (Either ParErr (String,Expr), [Token])
parseAddTerm xs =
    case parseAddOp xs of
        (Right op, ys) ->
            case parseTerm ys of
                (Right ex, zs)  -> (Right (op,ex), zs)
                (Left err, _ ) -> (Left err, xs)
        (Left err, _) -> (Left err, xs)


-- Base case: <addop>
parseAddOp :: [Token] -> (Either ParErr String, [Token])
parseAddOp ((TokOp op):xs)
    | isAddOp op = (Right op, xs)
parseAddOp xs    = (missingAddOp xs, xs)

missingAddOp xs =
    Left ("Missing add-like operator at \"" ++ (showTokens (pref xs))
          ++ "\"")


{- Function "parseTerm" takes a Token list, parses a <term>, and
   returns a pair consisting of an Either wrapping the Expr
   abstract syntax tree found and the list of Tokens remaining
   after the <term>.  An error is denoted by returning the Left
   value for the Either.

   As with <expressioon>, we can refactor the BNF rule

       <term> ::= <factor> { <mulop> <factor> }

   to better match the recursive descent prototypes, as follows:

       <term>        ::= <factor> <morefactors>
       <morefactors> ::= { <mulfactor> }
       <mulfactor>   ::= <mulop> <factor>
-}

-- Sequence: <term> ::= <factor> <morefactors>
parseTerm :: [Token] -> (Either ParErr Expr, [Token])
parseTerm xs =
    case parseFactor xs of
        (ex@(Right e1), ys) ->  -- <factor>
            let (factors, zs) = parseMoreFactors ys  -- <morefactors>
            in  (Right (makeBinOpSeq e1 factors), zs)
        (err@(Left _), _ ) -> (err, xs)


-- Repetition: <morefactors> ::= { <mulfactor> }
parseMoreFactors :: [Token] -> ([(String,Expr)], [Token])
parseMoreFactors xs =
    case parseMulFactor xs of
        (Right (op,ex), ys) ->   -- has <mulfactor>
            let (factors, zs) = parseMoreFactors ys
            in  (((op,ex):factors), zs)
        (Left _,  _) -> ([], xs) -- no <mulfactor>


-- Sequence: <mulfactor> ::= <mulop> <factor>
parseMulFactor :: [Token] -> (Either ParErr (String,Expr), [Token])
parseMulFactor xs =
    case parseMulOp xs of
        (Right op, ys) ->
            case parseFactor ys of
                (Right ex, zs) -> (Right (op,ex), zs)
                (Left err, _ ) -> (Left err, xs)
        (Left err, _) -> (Left err, xs)


-- Base case: <mulop>
parseMulOp :: [Token] -> (Either ParErr String, [Token])
parseMulOp ((TokOp op):xs)
    | isMulOp op = (Right op, xs)
parseMulOp xs    = (missingMulOp xs, xs)

missingMulOp xs =
    Left ("Missing add-like operator at \"" ++ (showTokens (pref xs))
          ++ "\"")


{- Function "parseFactor" takes a Token list, parses a <factor>, and
   returns a pair consisting of an Either wrapping the Expr abstract
   syntax tree found and the list of Tokens remaining after the
   <factor>. An error is denoted by returning the Left value for the
   Either.

   We can factor the BNF rules

       <factor> ::= <var> | <val>
                  | <leftparen> <expression> <rightparen>

   to better match the recursive descent prototypes, as follows:

       <factor>   ::= <var> | <val> | <nestexpr>
       <nextexpr> ::= <leftparen> <expression> <rightparen>

-}

-- Alternative: <factor> ::= <var> | <val> | <nestexpr>
parseFactor :: [Token] -> (Either ParErr Expr, [Token])
parseFactor xs =
    case parseVar xs of
        r@(Right _, _) -> r  -- <var>
        _ ->
          case parseVal xs of
              r@(Right _, _) -> r  -- <val>
              _ ->
                  case parseNestExpr xs of
                      r@(Right _, _) -> r  -- <nestexpr>
                      (Left m, ts)  -> (missingFactor m ts, ts)  

missingFactor m xs =
    Left ("Missing variable, value, or parenthesized expression " ++
      "beginning at \"" ++ (showTokens (pref xs)) ++
      "\" with nested error [" ++ m ++ "]")


{- Function "parseVar" takes a Token list, parses a <var>, and
   returns a pair consisting of an Either wrapping the Expr abstract
   syntax tree found and the list of Tokens remaining after the <var>.
   An error is denoted by returning the Left value for the Either.

   Function "parseVar" directly implements the following BNF rule as a
   base case, where <id> is a lexical Token:

       <var> ::= <id>
-}

parseVar :: [Token] -> (Either ParErr Expr, [Token])
parseVar ((TokId id):xs) = (Right (Var id),xs)
parseVar xs              = (missingVar xs, xs)

missingVar xs =
    Left ("Missing variable at \"" ++ (showTokens (pref xs)) ++ "\"")


{- Function "parseVal" takes a Token list, parses an unsigned <val>,
   and returns a pair consisting of an Either wrapping the Expr
   abstract syntax tree found and the list of Tokens remaining after
   the <val>.  An error is denoted by returning the Left value for the
   Either.

   Function "parseVal" directly implements the following BNF rule as a
   base case, where "-" and <unsigned> are lexical Tokens:

       <val> ::= [ "-" ] <unsigned>
-}
        
parseVal :: [Token] -> (Either ParErr Expr, [Token])
parseVal ((TokNum i):xs)             = (Right (Val i), xs)
parseVal ((TokOp "-"):(TokNum i):xs) = (Right (Val (-i)), xs)
parseVal xs                          = (missingVal xs, xs)

missingVal xs =
    Left ("Missing value at \"" ++ (showTokens (pref xs)) ++ "\"")

    
-- Sequence: <nextexpr> ::= <leftparen> <expression> <rightparen>
--     where <leftparen> and <rightparen> are single Tokens
parseNestExpr:: [Token] -> (Either ParErr Expr, [Token])
parseNestExpr xs@(TokLeft:ys) =  -- <leftparen>
    case parseExpression ys of 
        (ex@(Right _), zs) -> -- <expression>
            case zs of
                (TokRight:as) -> (ex,as) -- <rightparen>
                _             -> (missingRightParen zs, xs)
        (err@(Left _), _)  -> (err,xs)      -- no <expression>
parseNestExpr xs = (missingLeftParen xs, xs) -- no <leftparen>

missingLeftParen xs =
    Left ("Missing '(' at \"" ++ (showTokens (pref xs)) ++ "\"")
missingRightParen xs =
    Left ("Missing ')' at \"" ++ (showTokens (pref xs)) ++ "\"")


{- Functions "isAddOp" and "isMulOp" take a token and return "True" if
   and only if the token is in the addOp or mulOp groups,
   respectively.
-}

isAddOp :: String -> Bool
isAddOp xs = (xs == "+" || xs == "-")

isMulOp :: String -> Bool
isMulOp xs = (xs == "*" || xs == "/")


{- AST CONSTRUCTION -}

{- Function "makeBinOpSeq" takes the left operand Expr and a (possibly
   empty) list of pairs of additional valid binary operators (as
   strings) and their corresponding left operand Expr.

   It associates the operations from the right and returns the
   corresponding Expr.  That is, e1 [("+",e2),("-",e3)] yields
   (e1 + (e2 - e3)).
-}

makeBinOpSeq :: Expr -> [(String,Expr)] -> Expr
makeBinOpSeq e1 []           = e1
makeBinOpSeq e1 ((op,e2):xs) = makeBinOp op e1 (makeBinOpSeq e2 xs)


{- Function "makeBinOp" takes a valid binary operator string and
   its left and right operand expressions and returns the
   corresponding expression. It uses association list "assocOpCons2"
   to associate the valid perator strings with the Expr constructors.
-}

assocOpCons2 = [ ("+",Add), ("-",Sub), ("*",Mul), ("/",Div) ]
               -- add (operator,Constructor) pairs as neeed

makeBinOp :: String -> Expr -> Expr -> Expr
makeBinOp op e1 e2 =
    case lookup op assocOpCons2 of
        Just c  -> c e1 e2
        Nothing -> error ("Invalid operator " ++ op) -- invalid call