Since I'm trying to learn Haskell and functional languages are supposed to be really good at handling compiling/translating/transforming tasks, it may be a good idea to try to reimplement things in Haskell. This and reading 'Real world Haskell' should complement nicely.
Here's a first attempt: a simple lexer written in less than 100 lines of Haskell (though it's pretty 'high density', without any comments).
A few tasks that should help me become a better Haskell programmer:
- document things (the literate Haskell mode works great with Emacs, no excuses this time); add examples of usage in the comments !
- learn how to isolate the lexer guts by using the Haskell module system
- use the State monad, because there is state passed around in the lexer (although in a rather uniform manner, which avoids 'ladders to the right of the screen')
- maybe use the Reader monad (there are some configuration items, such as the width of a tab)
- ... so maybe learn how to use monad transformers to 'stack' State and Reader?
Until then, here's my newbie Haskeller code (uncommented :-( ):
> data StreamPosition = StreamPosition { line :: Int, column :: Int }
> deriving Show
> data TokenPosition = TokenPosition { start :: StreamPosition,
> end :: StreamPosition }
> deriving Show
> data TokenContent = TokenContent { position :: TokenPosition,
> content :: String }
> deriving Show
> data Token = TokenNumber TokenContent
> | TokenIdentifier TokenContent
> | TokenChar TokenContent
> deriving Show
> data LexerState = LexerState StreamPosition String [Token]
> lex :: String -> [Token]
> lex input = lexImpl $ LexerState (StreamPosition 1 1) input []
> where lexImpl (LexerState pos [] tokens) = reverse tokens
> lexImpl arg@(LexerState pos input@(c:cs) tokens)
> | c == '\n' = lexImpl $ LexerState (addToLine pos 1) cs tokens
> | c == ' ' = lexImpl $ LexerState (addToColumn pos 1) cs tokens
> | c == '\n' = lexImpl $ LexerState (addToColumn pos 8) cs tokens
> | isDigit c = lexImpl $ lexNumber arg
> | isLetterOrUnderscore c = lexImpl $ lexIdentifier arg
> | otherwise = lexImpl $ LexerState (addToColumn pos 1) cs upTokens
> where upTokens = (charTok : tokens)
> tokenPos = TokenPosition pos pos
> charTok = TokenChar $ TokenContent tokenPos [c]
> isDigit c = c >= '0' && c <= '9'
> isLetterOrUnderscore c = c == '_' || isLower || isUpper
> where isLower = c >= 'a' && c <= 'z'
> isUpper = c >= 'A' && c <= 'Z'
> lexNumber :: LexerState -> LexerState
> lexNumber = lexEntity isDigit TokenNumber
> lexIdentifier = lexEntity isLetterOrUnderscore TokenIdentifier
> lexEntity :: (Char -> Bool) -> (TokenContent -> Token) -> LexerState -> LexerState
> lexEntity spanP tokenCtor (LexerState pos cs tokens) = LexerState upPos upCs upTokens
> where (content, upCs) = span spanP cs
> upPos = addToColumn pos (length content)
> tokenPos = TokenPosition pos (addToColumn upPos (-1))
> numTok = tokenCtor (TokenContent tokenPos content)
> upTokens = (numTok : tokens)
> addToLine (StreamPosition line column) lineInc = StreamPosition (line + lineInc) column
> addToColumn (StreamPosition line column) columnInc =
> StreamPosition line (column + columnInc)
> data TokenTransform = TokenTransform { predicate :: ([Token] -> Int),
> combiner :: ([Token] -> [Token]) }
> applyTokenTransform :: [Token] -> TokenTransform -> [Token]
> applyTokenTransform [] _ = []
> applyTokenTransform tokens transform@(TokenTransform p c) =
> let matchLen = p tokens
> in if matchLen > 0
> then let (theMatch, theRest) = splitAt matchLen tokens
> theRest' = applyTokenTransform theRest transform
> in (c theMatch) ++ theRest'
> else let theHead = head tokens
> theRest = tail tokens
> in (theHead : applyTokenTransform theRest transform)
> applyTokenTransforms :: [Token] -> [TokenTransform] -> [Token]
> applyTokenTransforms tokens (t:ts) = applyTokenTransforms (applyTokenTransform tokens t) ts
> applyTokenTransforms tokens [] = tokens
> matchPair :: Char -> Char -> TokenTransform
> matchPair char1 char2 = TokenTransform pred combiner
> where isChar (TokenChar tc) ch = content tc == [ch]
> isChar _ _ = False
> pred [] = 0
> pred [x] = 0
> pred (el1 : el2 : els) | isChar el1 char1 && isChar el2 char2 = 2
> | otherwise = 0
> combiner (el1 : el2 : els) = (combineTokenChars el1 el2 : els)
> where combineTokenChars (TokenChar tc1) (TokenChar tc2) =
> let startPos = (start . position)
> endPos = (end . position)
> newTokenPosition = TokenPosition (startPos tc1) (endPos tc2)
> in TokenChar $ TokenContent newTokenPosition (content tc1 ++ content tc2)
> matchOps :: [TokenTransform]
> matchOps = [matchPair '=' '=', matchPair '>' '=', matchPair '<' '=']