{- CSci 450: Organization of Programming Languages
   Imperative Core Language, Lexical Analyzer
   Fall 2017 
   H. Conrad Cunningham

1234567890123456789012345678901234567890123456789012345678901234567890

2017-09-14: Prototype adapted from earlier Expression Language Lexer
            code, but follows Kamin/Ramsey Imperative Core language
            syntax
2017-08-21: Handled number overflow by adding convertNumType
            (from LexExpr). Changed import of ValueImpCore to Values.

Function "lexx" does not distinguish among keywords, primitive
function names, and user-defined variable and function names; all are just name tokens.

Function "lexer" uses "lexx" but then moves keywords and primitive
function names into separate token categories. This does not
allow user-defined variable and function names to have the same names
as primitives or keywords. This differs from the Kamin/Ramsey
language.

The Imperative Core language parser uses "lexx". Function "lexer" is
an artifact of an earlier approach.

TODO:
- Consider removing lexer and removing TokPrim and TokKey from Token

-}

module LexImpCore
    ( NumType, Name, Token(..), lexx, lexer )
where
  
-- Haskell libraries
import Data.Char (isSpace, isDigit)

-- ImpCore modules
import Values (NumType, Name, toNumType)

-- | Lexical tokens of the Imperative Core language.  "lexx" produces
-- only TokLeft, TokRight, TokNum, and TokName; TokPrim and TokKey are
-- introduced afterward by "lexer" (via its name-classification pass).
data Token = TokLeft          -- left parenthesis
           | TokRight         -- right parenthesis
           | TokNum  NumType  -- unsigned integer literal
           | TokName Name     -- names of variables, functions, etc.
           | TokPrim Name     -- primitive functions
           | TokKey  Name     -- keywords
             deriving (Eq, Show)


{-  Function "lexx" takes a string and returns the corresponding list
    of lexical tokens.  It uses a regular grammar to group characters
    into parenthesis characters, unsigned integers, and names. It
    skips whitespace characters and comments (from any ';'to the
    end of the line). It makes <name> and <unsigned> tokens as long
    as possible.

    Note: Negative integers must be handled by the parser.

        <input>    ::=  <token>  |  <token> <input>
        <token>    ::=  '('  |  ')'  |  <unsigned>  |  <name>
        <unsigned> ::=  <digit>  |  <digit> <unsigned> 
        <digit>    ::=  any numeric character
        <name>     ::=  any sequence that is not an <unsigned>
                        and contains no '(', ')', ';', or whitespace

    Note: Comment removal is also done in the REPL for scripts
    read by the REPL, but it seems useful for lexx to also remove
    comments in case it is used in other contexts.
-}

-- | Tokenize a source string.  Whitespace is skipped; a ';' starts a
-- comment that runs to the end of the line; '(' and ')' become their
-- own tokens; maximal digit runs become TokNum; every other maximal
-- run of non-special characters becomes TokName.
lexx :: String -> [Token]
lexx [] = []
lexx input@(c:cs)
    | isSpace c  = lexx cs
    | c == ';'   = lexx (dropWhile (/= '\n') cs)
    | c == '('   = TokLeft  : lexx cs
    | c == ')'   = TokRight : lexx cs
    | isDigit c  = numberToken
    | otherwise  = nameToken
  where
    -- Longest-match unsigned integer starting at the current position
    numberToken =
        let (digits, rest) = span isDigit input
        in  TokNum (convertNumType digits) : lexx rest
    -- Longest-match name: stops at whitespace or any reserved character
    nameToken =
        let endsName ch    = isSpace ch || ch `elem` nonNameChars
            (name, rest)   = break endsName input
        in  TokName name : lexx rest

-- Characters that cannot appear in names
nonNameChars :: [Char]
nonNameChars = ['(',')',';']


{-  Function "convertNumType" converts an unsigned number string
    into a NumType value.  If that cannot be done, it does an
    error call.

-}

-- | Convert an unsigned number string into a NumType value; raise a
-- runtime error (via 'error') if the conversion fails, e.g. overflow.
convertNumType :: String -> NumType
convertNumType num = either lexFail id (toNumType num)
  where
    lexFail err = error ("Lexical error: " ++ err)


{-  Function "lexer" takes a string and returns the corresponding list
    of lexical tokens, separating keywords and primitive
    function names into separate syntactic categories.

    It uses "lexx" to tokenize the input and then "markSpecials" to
    transform the name tokens to encode keywords and primitive
    function names in separate categories.
-}

-- | Tokenize a string, then reclassify keyword and primitive names
-- into their own token categories.
lexer :: String -> [Token]
lexer = markSpecials . lexx

-- | Reclassify every name token that is a keyword or primitive.
markSpecials :: [Token] -> [Token]
markSpecials = map xformTok

-- | Map a TokName that is a keyword to TokKey, or one that is a
-- primitive to TokPrim; leave every other token unchanged.
xformTok :: Token -> Token
xformTok tok =
    case tok of
        TokName n
            | n `elem` keywords   -> TokKey  n
            | n `elem` primitives -> TokPrim n
        _ -> tok

-- Reserved names -- keywords and primitives
-- "use" moved to repl
-- NOTE(review): the comment above says "use" moved to the REPL, yet
-- "use" still appears in the keywords list below -- confirm whether
-- it should be removed here.
keywords, primitives :: [Name]
keywords     = ["val","define","use","set","if","while","begin"]
primitives   = ["+","-","*","/","==","<",">"]

