Lexical Analysis With F# – Part 5

I’ve begun to establish a reasonably sound design pattern for the lexical analyzer. Of course this isn’t intended to be an ideal solution to the general problem of writing a tokenizer for any language; it does not support any kind of shorthand for describing token structure, for example. But it isn’t overly complex, and at this stage it supports some of the common tokens seen in C, C++ and C#.

As you’ll see it’s quite a straightforward pattern, and it’s a rather academic exercise to extend it to the full token set of any of the above languages. A match expression does most of the work, and its layout is easy to grasp and extend.

So here is the code. This version exposes the token stream as a sequence (using the powerful Seq.unfold function) and has one character of lookahead, which allows us to recognize single-character tokens that begin with a character that could also start a two- or three-character token. This means we can recognize +5 as two distinct tokens, yet ++ or += as single tokens: in the first case we ‘restore’ the ‘5’ so it can be processed as a distinct token once ‘+’ has been recognized on its own.
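
For example, given the input +5 ++ the lexer below reports something like:

Plus           +
Integer        5
Increment      ++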


// Experimental Lexical Analyzer written in F#

open System.IO

type States =
     | START
     | IDENT     // Creating an identifier token
     | NUMBR     // Creating a number token
     | STRING    // Creating a string token
     | ONE_EQUAL // A = was read which may be the start of a longer token.
     | ONE_MINUS // A - was read which may be the start of a longer token.
     | ONE_PLUS  // A + was read which may be the start of a longer token.

type Kinds =
     | UNRECOGNIZED
     | IDENTIFIER
     | INTEGER
     | STRING
     | SEMICOLON
     | COMMA
     | DOT
     | LBRACK
     | RBRACK
     | LPAR
     | RPAR
     | LBRACE
     | RBRACE
     | EQUALS           // =
     | EQUALITY         // ==
     | MINUS            // -
     | POINTING         // ->
     | DECREMENT        // --
     | DEQUAL           // -=
     | PLUS             // +
     | INCREMENT        // ++
     | INQUAL           // +=

     override this.ToString() =
      match this with
      | UNRECOGNIZED -> "Unrecognized  "
      | IDENTIFIER   -> "Identifier    "
      | INTEGER      -> "Integer       "
      | STRING       -> "String        "
      | SEMICOLON    -> "Semicolon     "
      | COMMA        -> "Comma         "
      | DOT          -> "Period        "
      | LBRACK       -> "L Bracket     "
      | RBRACK       -> "R Bracket     "
      | LPAR         -> "L Parenthesis "
      | RPAR         -> "R Parenthesis "
      | LBRACE       -> "L Brace       "
      | RBRACE       -> "R Brace       "
      | EQUALS       -> "Equals        "
      | EQUALITY     -> "Equality      "
      | MINUS        -> "Minus         "
      | POINTING     -> "Pointing      "
      | DECREMENT    -> "Decrement     "
      | DEQUAL       -> "Decqual       "
      | PLUS         -> "Plus          "
      | INCREMENT    -> "Increment     "
      | INQUAL       -> "Incqual       "

type State =
     {
     stream : seq<char>;
     state  : States;
     }

type Token =
     {
     lexeme : string;
     kind   : Kinds;
     }

type Context =
     {
     state : State;
     token : Token;
     }

[<EntryPoint>]
let main argv =

    printfn "%A" argv

    let start_state    input = {State.stream = input; State.state = States.START; }
    let empty_context  input = {Context.state = start_state input ; Context.token =
                               {Token.kind = Kinds.UNRECOGNIZED; Token.lexeme = "";};}

    let create_context input state lexeme kind =
        {
         Context.state = {State.stream = input;  State.state = state; };
         Context.token = {Token.kind = kind; Token.lexeme = lexeme;};
        }

    // Simple function that classifies a character as being alphabetic or not etc.

    let Digit = function
        | X when X >= '0' && X <= '9' -> true
        | _ -> false

    let Alpha = function
        | X when X >= 'a' && X <= 'z' -> true
        | X when X >= 'A' && X <= 'Z' -> true
        |'_' -> true
        | _  -> false

    // Simple function that classifies a character as being white space or not.

    let Space = function
        |' '  -> true
        |'\n' -> true
        |'\t' -> true
        |'\r' -> true
        | _   -> false

    // Simple function that classifies a character as being a quote or not

    let Quote = function
        |'"' -> true
        | _  -> false

    // Simple function that converts a Char to a String.

    let ToString C = sprintf "%c" C

    // Simple function that appends a char to a String.

    let Append S C = S + ToString C
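
    // For example: Digit '7' = true, Alpha '_' = true, Space '\t' = true,
    // and Append "count" 'r' evaluates to "countr".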

    // A RECURSIVE FINITE STATE MACHINE
    // This is the core recognizer function. It uses recursion to implement a state machine: in this model
    // a recursive invocation is made passing in a new state - this replicates exactly the behavior of
    // an iterative state machine with a mutable state.
    //
    // The function takes a single 'context' argument which wraps an input stream, the new state, the lexeme
    // string and the token type into a single record.
    //

    let rec tokenize context =
         // Token type UNRECOGNIZED actually means - continue lexing...
        if (context.token.kind <> Kinds.UNRECOGNIZED) || (Seq.isEmpty context.state.stream) then
           // A line could go here to test whether an IDENTIFIER token is in fact a keyword;
           // a sketch of that check follows the listing.
           context
        else
           let Sb = context.state.stream     // Get the source stream BEFORE a char has been read
           let Sa = Seq.skip 1 Sb            // Get the source stream AFTER  a char has been read
           let C = Seq.head Sb               // Get the next char.
           match (C, context.state.state) with

           // For a given C and S there are three items: 1) Next State, 2) Token Type, 3) Action/Operation

           // Actions associated with the start state and single char tokens
           | (',', States.START)               -> create_context Sa States.START (Append context.token.lexeme C) Kinds.COMMA            |> tokenize
           | ('.', States.START)               -> create_context Sa States.START (Append context.token.lexeme C) Kinds.DOT              |> tokenize
           | (';', States.START)               -> create_context Sa States.START (Append context.token.lexeme C) Kinds.SEMICOLON        |> tokenize
           | ('[', States.START)               -> create_context Sa States.START (Append context.token.lexeme C) Kinds.LBRACK           |> tokenize
           | (']', States.START)               -> create_context Sa States.START (Append context.token.lexeme C) Kinds.RBRACK           |> tokenize
           | ('(', States.START)               -> create_context Sa States.START (Append context.token.lexeme C) Kinds.LPAR             |> tokenize
           | (')', States.START)               -> create_context Sa States.START (Append context.token.lexeme C) Kinds.RPAR             |> tokenize
           | ('{', States.START)               -> create_context Sa States.START (Append context.token.lexeme C) Kinds.LBRACE           |> tokenize
           | ('}', States.START)               -> create_context Sa States.START (Append context.token.lexeme C) Kinds.RBRACE           |> tokenize

           // Tokens that start with =
           | ('=', States.START)               -> create_context Sa States.ONE_EQUAL (Append context.token.lexeme C) Kinds.UNRECOGNIZED |> tokenize
           | ('=', States.ONE_EQUAL)           -> create_context Sa States.START (Append context.token.lexeme C) Kinds.EQUALITY         |> tokenize
           | ( _ , States.ONE_EQUAL)           -> create_context Sb States.START context.token.lexeme Kinds.EQUALS                      |> tokenize

           // Tokens that start with -
           | ('-', States.START)               -> create_context Sa States.ONE_MINUS (Append context.token.lexeme C) Kinds.UNRECOGNIZED |> tokenize
           | ('>', States.ONE_MINUS)           -> create_context Sa States.START (Append context.token.lexeme C) Kinds.POINTING         |> tokenize
           | ('-', States.ONE_MINUS)           -> create_context Sa States.START (Append context.token.lexeme C) Kinds.DECREMENT        |> tokenize
           | ('=', States.ONE_MINUS)           -> create_context Sa States.START (Append context.token.lexeme C) Kinds.DEQUAL           |> tokenize
           | ( _ , States.ONE_MINUS)           -> create_context Sb States.START context.token.lexeme Kinds.MINUS                       |> tokenize

           // Tokens that start with +
           | ('+', States.START)               -> create_context Sa States.ONE_PLUS (Append context.token.lexeme C) Kinds.UNRECOGNIZED  |> tokenize
           | ('+', States.ONE_PLUS)            -> create_context Sa States.START (Append context.token.lexeme C) Kinds.INCREMENT        |> tokenize
           | ('=', States.ONE_PLUS)            -> create_context Sa States.START (Append context.token.lexeme C) Kinds.INQUAL           |> tokenize
           | ( _ , States.ONE_PLUS)            -> create_context Sb States.START context.token.lexeme Kinds.PLUS                        |> tokenize

           // Remaining start-state actions: whitespace, identifiers, numbers and strings
           | (C,   States.START)  when Space C -> create_context Sa States.START  context.token.lexeme Kinds.UNRECOGNIZED               |> tokenize
           | (C,   States.START)  when Alpha C -> create_context Sa States.IDENT  (Append context.token.lexeme C) Kinds.UNRECOGNIZED    |> tokenize
           | (C,   States.START)  when Digit C -> create_context Sa States.NUMBR  (Append context.token.lexeme C) Kinds.UNRECOGNIZED    |> tokenize
           | ('"', States.START)               -> create_context Sa States.STRING context.token.lexeme Kinds.UNRECOGNIZED               |> tokenize
           // Actions associated with the identifier state
           | (C,   States.IDENT)  when Alpha C -> create_context Sa States.IDENT  (Append context.token.lexeme C) Kinds.UNRECOGNIZED    |> tokenize
           | (C,   States.IDENT)  when Space C -> create_context Sa States.START  context.token.lexeme Kinds.IDENTIFIER                 |> tokenize
           | (_,   States.IDENT)               -> create_context Sb States.START  context.token.lexeme Kinds.IDENTIFIER                 |> tokenize

           | (C,   States.NUMBR)  when Digit C -> create_context Sa States.NUMBR  (Append context.token.lexeme C) Kinds.UNRECOGNIZED    |> tokenize
           | (C,   States.NUMBR)  when Space C -> create_context Sa States.START  context.token.lexeme Kinds.INTEGER                    |> tokenize
           | (_,   States.NUMBR)               -> create_context Sb States.START  context.token.lexeme Kinds.INTEGER                    |> tokenize

           // Actions associated with the string state
           | (C,   States.STRING) when Quote C -> create_context Sa States.START  context.token.lexeme Kinds.STRING                     |> tokenize
           | (C,   States.STRING)              -> create_context Sa States.STRING (Append context.token.lexeme C) Kinds.UNRECOGNIZED    |> tokenize
           // Any other character is skipped; an empty UNRECOGNIZED token is surfaced.
           | _                                 -> empty_context  Sa

    // Get a char sequence that contains all source file contents.
    // This test file is situated in the same folder as the .fs source
    // files for this project.

    let source_stream path = seq {for c in File.ReadAllText path do yield c}

    // Helper function invoked by the seq unfold operation. This function is passed
    // an argument and expected to compute the 'next' item in the sequence. In this
    // design a returned token is passed back in as the argument and serves as a
    // context or state to be passed to each successive tokenize operation.
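    //
    // As a quick illustration of Seq.unfold itself, independent of this lexer,
    //
    //     Seq.unfold (fun n -> if n > 5 then None else Some(n, n + 1)) 1
    //
    // yields the finite sequence 1, 2, 3, 4, 5; returning None ends the sequence.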

    let unfolder state =
        if Seq.isEmpty state.stream then
           None // Denotes end of sequence, causes foreach etc to end.
        else
           // Outermost invocation begins with a special 'empty' context argument.

           let context = tokenize (empty_context state.stream)

           // In this simple design a token contains the unread input
           // stream and state so serves as both the returned value
           // (of the enumeration) and the state - hence we use the
           // same value for both args in 'Some'

           Some(context.token,context.state)

    // Defines a sequence using the unfolder function and beginning with the
    // initialized start token value.

    let input = source_stream "..\\..\\sample.txt"

    // Define a sequence which returns complete tokens from the char input stream.

    let token_sequence = Seq.unfold unfolder (start_state input)

    // Enumerate the sequence of tokens and print their token type and lexeme.

    for token in token_sequence do
        printfn "%s %s" (token.kind.ToString()) token.lexeme

    0 // return an integer exit code
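
The placeholder comment in tokenize hints at one obvious extension: once an IDENTIFIER lexeme is complete, test it against a set of reserved words before yielding the token. Here is a minimal sketch, assuming a new KEYWORD case were added to Kinds (the keyword list itself is hypothetical):

let keywords = set ["if"; "else"; "while"; "for"; "return"]

let classify (token : Token) =
    if token.kind = Kinds.IDENTIFIER && Set.contains token.lexeme keywords then
       { token with kind = Kinds.KEYWORD } // KEYWORD is an assumed new union case
    else
       token

Applying classify to each token yielded by unfolder would be enough to separate keywords from ordinary identifiers.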

This design does not record the line and column at which a token begins within a source file, but that isn’t a difficult thing to add.
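
One way to add it, sketched below, is to carry a position in the State record and advance it as each character is consumed; a token could then record the position at which its first character was read. The Position type, the pos field and the advance helper are hypothetical additions, not part of the listing above:

type Position =
     {
     line   : int;
     column : int;
     }

// State would gain a field such as 'pos : Position' recording where the
// next unread character sits. Advancing it is a one-line match on the
// character just consumed:

let advance (pos : Position) c =
    match c with
    | '\n' -> { line = pos.line + 1; column = 1 }
    | _    -> { pos with column = pos.column + 1 }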

Here’s some input that I tested with; the tokens get written to the console window:

[Image: sample input]

Here’s the resulting console window when the above code is run with the above text input:

[Image: Lexer Output 1]
