6. Dolphin – Phase 3#

Attention

This is a group assignment. The workload is calibrated for a group of 3.

In case of questions regarding ambiguity of what you should do, ask questions on the forum. If you are in doubt and there is no enough time, use your best judgment and explain your reasoning in your report.

6.1. Assignment overview#

This assignment builds on the previous two assignments: Phase 1 and Phase 2. The main focus of this assignment is building a frontend for the language of Phase 2. We extend the Dolphin implementation with the following:

  1. A lexer that translates the source code into a stream of tokens.

  2. A parser that translates a stream of tokens into an AST.

There are 4 tasks and no questions in this assignment. There are no glory questions in this assignment.

6.1.1. What you need to get started#

  • This assignment is continuation of the previous assignment. To get started you need to edit the code from the previous assignment.

  • You need to understand how OCamllex lexer generator works. See OCamllex documentation.

  • You need to understand how Menhir parser generator works. See Menhir documentation.

6.1.2. What you need to hand in#

Please hand in a .zip file containing the following

  1. A brief report documenting your solution. Acceptable report formats are .pdf, .rtf, and .md. For each task and question, briefly (1 – 4 sentences) describe your implementation or answer. Write concisely.

  2. All the source files needed to reproduce your solution. This also includes the C code provided. Please explain in your report how the solution could be reproduced, e.g., calling make (if you have made a Makefile), the command line to call clang, etc.

  3. All the tests that you create (see Task 4) should be placed into a directory assignment-06-tests as individual .dlp files.

Important

Make sure to understand all the code you hand in, including what is copied from here. (The code for pretty printing (typed) ASTs is an exception here; see the appendix below.)

6.2. The Abstract Syntax Tree (AST) of Dolphin (phase 2)#

The AST that we use in this assignment is conceptually the same as before, except for one techical change – most of the nodes of the AST now carry location information. Location information should be used in error reporting.

The OCaml types for the AST describing programs is given below:

(* -- Use this in your solution without modifications *)
module Loc = Location

type ident = Ident of {name : string; loc : Loc.location}

type typ =
| Int of {loc : Loc.location}
| Bool of {loc : Loc.location}

type binop =
| Plus of {loc : Loc.location}
| Minus of {loc : Loc.location}
| Mul of {loc : Loc.location}
| Div of {loc : Loc.location}
| Rem of {loc : Loc.location}
| Lt of {loc : Loc.location}
| Le of {loc : Loc.location}
| Gt of {loc : Loc.location}
| Ge of {loc : Loc.location}
| Lor of {loc : Loc.location}
| Land of {loc : Loc.location}
| Eq of {loc : Loc.location}
| NEq of {loc : Loc.location}

type unop = 
| Neg of {loc : Loc.location}
| Lnot of {loc : Loc.location}

type expr =
| Integer of {int : int64; loc : Loc.location}
| Boolean of {bool : bool; loc : Loc.location}
| BinOp of {left : expr; op : binop; right : expr; loc : Loc.location}
| UnOp of {op : unop; operand : expr; loc : Loc.location}
| Lval of lval
| Assignment of {lvl : lval; rhs : expr; loc : Loc.location}
| Call of {fname : ident; args : expr list; loc : Loc.location}
and lval =
| Var of ident

type single_declaration = Declaration of {name : ident; tp : typ option; body : expr; loc : Loc.location}

type declaration_block = DeclBlock of {declarations : single_declaration list; loc : Loc.location}

type for_init =
| FIExpr of expr
| FIDecl of declaration_block

type statement =
| VarDeclStm of declaration_block
| ExprStm of {expr : expr option; loc : Loc.location}
| IfThenElseStm of {cond : expr; thbr : statement; elbro : statement option; loc : Loc.location}
| WhileStm of {cond : expr; body : statement; loc : Loc.location}
| ForStm of {init : for_init option; cond : expr option; update : expr option; body : statement; loc : Loc.location}
| BreakStm of {loc : Loc.location}
| ContinueStm of {loc : Loc.location}
| CompoundStm of {stms : statement list; loc : Loc.location}
| ReturnStm of {ret : expr; loc : Loc.location}

type program = statement list

Action item

The AST declarations above should replace the contents of the module called Ast. Do not change the code above.

6.2.1. Location module#

The AST module above uses the module Location below:

(* -- Use this in your solution without modifications *)
module PBox = PrintBox

type location = {start_pos : Lexing.position; end_pos : Lexing.position}

let make_location (startp, endp) = {start_pos = startp; end_pos = endp}

let dummy_loc = {start_pos = Lexing.dummy_pos; end_pos = Lexing.dummy_pos}

let location_style = PBox.Style.fg_color PBox.Style.Cyan

let location_to_tree ?(includefile = true) {start_pos; end_pos} =
  if includefile then
    if start_pos.pos_lnum = end_pos.pos_lnum then
      PBox.sprintf_with_style location_style "\"%s\" @ %d:%d-%d"
        start_pos.pos_fname start_pos.pos_lnum 
        (start_pos.pos_cnum - start_pos.pos_bol)
        (end_pos.pos_cnum - end_pos.pos_bol)
    else
      PBox.sprintf_with_style location_style "\"%s\" @ %d:%d-%d:%d"
        start_pos.pos_fname start_pos.pos_lnum 
        (start_pos.pos_cnum - start_pos.pos_bol)
        end_pos.pos_lnum 
        (end_pos.pos_cnum - end_pos.pos_bol)
  else
    if start_pos.pos_lnum = end_pos.pos_lnum then
      PBox.sprintf_with_style location_style "@ %d:%d-%d"
        start_pos.pos_lnum 
        (start_pos.pos_cnum - start_pos.pos_bol)
        (end_pos.pos_cnum - end_pos.pos_bol)
    else
      PBox.sprintf_with_style location_style "@ %d:%d-%d:%d"
        start_pos.pos_lnum 
        (start_pos.pos_cnum - start_pos.pos_bol)
        end_pos.pos_lnum 
        (end_pos.pos_cnum - end_pos.pos_bol)

Action item

The definition of locations above should be placed in a module called Location. That is, the contents above should be placed in a file called location. Do not change the code above.

Note

The typed AST module remains the same as in the previous assignment.

6.3. The syntax of Dolphin#

You should by now have an intuitive understanding of the syntax and semantics of Dolphin. See Exercises for Week 7 for an overview. Below, we focus on the details of the syntax that are relevant for the implementaiton of the Dolphin frontend.

6.3.1. Whitespace#

Dolphin’s syntax recognizes, and ignores, the following (sequences of) characters as whitespace: \n (linefeed character, i.e., Unix new line indicator), \r\n (carriage return, followed by linefeed, i.e., Windows new line indicator), tab, and space. All whitespace characters are completely ignored in Dolphin. New line indicators also indicate the end of a single-line comment.

Hint

When using ocamllex, the semntic action of recognizing new line indicators should invoke Lexing.new_line function to increment line number in the lexer state to maintain consistent source location.

6.3.2. Comments#

In Dolphin, single-line comments start with //, and extend up to the end of the line. Multi-line (block) comments are delimited by /* and */. Note: Dolphin supports nested multi-line comments. That is, both /* abc */ and /*/* abc */*/ are valid comments.

6.3.3. Keywords#

The keywords of Dolphin are as follows: true, false, nil, var, let, if, else, while, for, break, continue, return, int, byte, bool, string, void, record, new, length_of

Some of these keywords are not yet used in our implementation of Dolphin. However, they should be recognized by the lexer so that identifiers are handled properly; see below.

6.3.4. Identifiers#

An identifier is any string consisting of underscore, _, lowercse or capitl letters of English alphabet, and digits, 0, …, 9, that does not start with a digit, and is not a Dolphin keyword.

6.3.5. Operators#

The operators of the Dolphin language are as follows:

Operator

Description

+, -, *, /, %

Arithmetic operators: addition, subtraction, unary negation, multiplication, division, remainder.

<, <=, > >=

Relative comparison operators; these operators only apply to integers (and strings in phase 5).

==, !=

Equality comparison operator; these operators apply to any type, as long as the two sides are the same type, and that type is not void.

||, &&, !

Logical operators on booleans: or, and, and not.

.

Looking up fields in a record.

,

See comma expressions explained below. (Note that , is also used as punctuation.)

6.3.5.1. Precedences and associativity#

  • Arithmetic operator precedences are as usual: + and - have the same precedence which is lower than that of *, /, and %, which also have the same precedence. All these operators left-associative.

  • The operator && has a higher priority than || and they are both left associative.

  • Unary operations, - and !, have a higher precedence than binary operations.

  • The comparison operators are non-associative.

6.3.6. Punctuation#

Punctuation mark

Description

,

Used for separating function arguments in a call, and separating variable declarations in a variable declaration block.

{ and }

Delimits compound statements. Also, they will later be used to delimit function body.

( and )

Used for grouping, e.g., overriding precedences. Also used for function calls (see below).

6.3.7. Literals#

This phase of Dolphin supports integer and boolean literals.

  • An integer literal is any number within the range \(-2^{63}\) to \(2^{63}-1\). Integer literals out of this range are invalid.

  • Boolean literals: keywords true and false.

6.3.8. Types#

  • Primitive types: int, bool.

  • Void type: void. This type does not yet syntactically appear in our programs because it may only be used the return type of a function in a function declaration, which we do not yet support in Dolphin.

6.3.9. L-values#

An l-value, also written lvalue, is any value in the programming language that roughly-speaking corresponds to a memory location. For the current subset of Dolphin, l-values can only be local variable names.

6.3.10. Expressions#

Dolphin expressions are as follows:

  • Integer literals.

  • Boolean literals true and false.

  • Binary operators applied to two expressions: e1 o e2 where e1 and e2 are two expressions and o is any binary operator; see above for details of precedences.

  • Unary operators applied to expressions: o e where o is a unary operator and e is an expression; see above for details of precedences.

  • L-values are also expressions; this corresponds to reading the location.

  • Assignment: l = e where l is an l-value and e is an expression; this corresponds to writing the location.

  • Function calls are written by writing the name of the function followed by its arguments passed in parentheses, separated by a comma, e.g., func(a, b, 2 + 3). Function arguments are expressions.

  • Parentheses can be used to group expressions: (e) is a valid expression whenever e is.

6.3.11. Statements#

  • Expression statements: an expression statement is an assignment, a function call, or nothing, followed by a semicolon. That is, both func(2, 3); and ; are valid expression statements. However, 2 + 5; is not a valid expression statement.

  • Declaration blocks: A declaration block consists of a var keyword, followed by a sequence of declarations, separated with a comma, terminated by a semicolon. A declaration consists of an identifier and a type, separated by a colon, and an initialization (= followed by an expression, which cannot itself be a comma operator; recall parentheses for grouping). Finally, the type ascription is optional. That is, var x : int = 2;, var y = false, w : int = 5;, var x = 3, z = true; are all valid declaration blocks.

  • Conditional statement of the form if(cond) true_body else false_body where cond is an expression and true_body and false_body are both statements (see compound statements below). The else false_body part of the statement is optional and may be omitted. Note: the parentheses around the condition are part of the syntax of if statements.

  • While loops of the form while(cond) body where cond is an expression and body is a statement (see compound statements below). Note: the parentheses around the condition are part of the syntax of while loops.

  • For loops of the form for(init; cond; update) body. The initialization part, init may be an expression, or a declaration block (without a semicolon, as init and cond are already separated by a ; as part of the syntax of the for loop). Both cond and update are expressions and body is a statement (see compound statements below). Note: the parentheses around init; cond; update are part of the syntax of for loops. Each of the parts init, cond, and update may be omitted. That is, for(;;);, for(; i < 10;);, for(var i : int = 12; i > 10;);, etc. are all valid for loops.

  • Break statement: break; (Note: the ; ist part of the syntax of the statement.)

  • Continue statement: continue; (Note: the ; ist part of the syntax of the statement.)

  • Return statement of the form return res; where res is an expression, or return; (for functions of return type void).

  • Compound statement (block of statements): a compound statement is a sequence of statements, one followed after the other, surrounded in curly braces, e.g., {if (n > 0) {r = 10; return 1; } else continue; {} }. A compound statement may appear at any point where a statement is expected, e.g., as the body of a loop, or conditional, as a statement inside another compound statement, etc.

6.4. Lexer#

6.4.1. Lexemes#

The Dolphin language has the following lexeme kinds. (This is written in the language of Menhir.)

// end of file
%token EOF
// integer literals
%token <int64> INT_LIT
// booleans
%token TRUE FALSE
// length operation; for arrays and strings
%token LENGTHOF
// arithmetic oprations
%token PLUS MINUS MUL DIV REM
// comparison operators
%token LT LE GT GE
// logical operations
%token LOR LAND LNOT
// equality
%token EQ NEQ
// assignment
%token ASSIGN
// punctuation
%token COLON COMMA SEMICOLON
// accessors
%token DOT LBRACKET RBRACKET
// braces
%token LBRACE RBRACE
// parentheses
%token LPAREN RPAREN
// identifiers
%token <string> IDENT
// keywords
%token NIL VAR IF ELSE WHILE FOR BREAK CONTINUE RETURN NEW
// types
%token INT BOOL STRING BYTE VOID RECORD

6.4.2. Lexer#

Task 1: Implement lexical analysis using OCamllex
Implement lexical analysis for the present subset of Dolphin as described above, where the input is read from a file, using `ocamllex``

Hint

Use a dummy parser for token generation.

6.5. Parser#

Task 2: Implement the parser using Menhir parser generator
Implement the parser for the present subset of Dolphin as described above, using Menhir parser generator. To score full points on this task, your parser should not have any shift/reduce or reduce/reduce conflicts.

6.6. Updated semantic analysis#

Task 3: Update semantic analysis
Update your semantic analysis so that error reporting now includes the location information.

6.7. Consolidation#

Task 4: Testing and consolidation

In this task, we put the previous tasks together and test our project in an end-to-end fashion.

  1. Extend your function compile_prog from the previous phase to include the frontend. In other words, ensure the following behavior for compile_prog.

    • Given a path to a file with Dolphin source code, it runs the lexer. If there are no errors in lexing, compile_prog runs the parser. If there are no errors in the parser, compile_prog runs the semantic analysis (from here on, it is as just as in the previous assignment). In case of errors, they should be printed on standard error output, and the program should exit with exit code 1. If there are no errors, compile_prog proceeds to generate the LLVM translation. The result of the translation should be output on standard output, and the program exits with exit code 0.

  2. Port your earlier tests to source code. Add 10 new tests that negatively test your lexer and the parser. All the tests should be included into your submission as individual .dlp files.

6.8. Appendix#

Recall that you will need libraries printbox and printbox-text to use pretty printers below. These can be installed using opam using the following command opam install printbox printbox-text

These pretty printers produce a so-called box which is the terminology that printbox uses to refer to formatted, structured texts. A box ca be printed as follows:

PrintBox_text.output stdout (Pretty.program_to_tree prog)

This will print the AST of the program as a tree.

6.8.1. pretty printer for ASTs (module Pretty)#

module PBox = PrintBox
open Ast

(* producing trees for pretty printing *)
let typ_style = PBox.Style.fg_color PBox.Style.Green
let ident_style = PBox.Style.fg_color PBox.Style.Yellow
let fieldname_style = ident_style
let keyword_style = PBox.Style.fg_color PBox.Style.Blue

let info_node_style = PBox.Style.fg_color PBox.Style.Cyan

let make_typ_line name = PBox.line_with_style typ_style name
let make_fieldname_line name = PBox.line_with_style fieldname_style name
let make_ident_line name = PBox.line_with_style ident_style name
let make_keyword_line name = PBox.line_with_style keyword_style name

let make_info_node_line info = PBox.line_with_style info_node_style info

let ident_to_tree (Ident {name; _}) = make_ident_line name

let typ_to_tree tp =
  match tp with
  | Bool _ -> make_typ_line "Bool"
  | Int _ -> make_typ_line "Int"

let binop_to_tree op =
    match op with
    | Plus _ -> make_keyword_line "PLUS"
    | Minus _ -> make_keyword_line "Minus"
    | Mul _ -> make_keyword_line "Mul"
    | Div _ -> make_keyword_line "Div"
    | Rem _ -> make_keyword_line "Rem"
    | Lt _ -> make_keyword_line "Lt"
    | Le _ -> make_keyword_line "Le"
    | Gt _ -> make_keyword_line "Gt"
    | Ge _ -> make_keyword_line "Ge"
    | Lor _ -> make_keyword_line "Lor"
    | Land _ -> make_keyword_line "Land"
    | Eq _ -> make_keyword_line "Eq"
    | NEq _ -> make_keyword_line "NEq"
  
let unop_to_tree op =
  match op with
  | Neg _ -> make_keyword_line "Neg"
  | Lnot _ -> make_keyword_line "Lnot"
  
  let rec expr_to_tree e =
    match e with
    | Integer {int; _} -> PBox.hlist ~bars:false [make_info_node_line "IntLit("; PBox.line (Int64.to_string int); make_info_node_line ")"]
    | Boolean {bool; _} -> PBox.hlist ~bars:false [make_info_node_line "BooleanLit("; make_keyword_line (if bool then "true" else "false"); make_info_node_line ")"]
    | BinOp {left; op; right; _} -> PBox.tree (make_info_node_line "BinOp") [expr_to_tree left; binop_to_tree op; expr_to_tree right]
    | UnOp {op; operand; _} -> PBox.tree (make_info_node_line "UnOp") [unop_to_tree op; expr_to_tree operand]
    | Lval l -> PBox.tree (make_info_node_line "Lval") [lval_to_tree l]
    | Assignment {lvl; rhs; _} -> PBox.tree (make_info_node_line "Assignment") [lval_to_tree lvl; expr_to_tree rhs]
    | Call {fname; args; _} ->
      PBox.tree (make_info_node_line "Call")
        [PBox.hlist ~bars:false [make_info_node_line "FunName: "; ident_to_tree fname];
         PBox.tree (make_info_node_line "Args") (List.map (fun e -> expr_to_tree e) args)]
  and lval_to_tree l =
    match l with
    | Var ident -> PBox.hlist ~bars:false [make_info_node_line "Var("; ident_to_tree ident; make_info_node_line ")"]

let single_declaration_to_tree (Declaration {name; tp; body; _}) =
  PBox.tree (make_keyword_line "Declaration") 
    [PBox.hlist ~bars:false [make_info_node_line "Ident: "; ident_to_tree name]; 
    PBox.hlist ~bars:false [make_info_node_line "Type: "; Option.fold ~none:PBox.empty ~some:typ_to_tree tp];
    PBox.hlist ~bars:false [make_info_node_line "Body: "; expr_to_tree body]]

let declaration_block_to_tree (DeclBlock {declarations; _}) =
PBox.tree (make_keyword_line "VarDecl")  (List.map single_declaration_to_tree declarations)

let for_init_to_tree = function
| FIDecl db -> PBox.hlist ~bars:false [PBox.line "ForInitDecl: "; declaration_block_to_tree db]
| FIExpr e -> PBox.hlist ~bars:false [PBox.line "ForInitExpr: "; expr_to_tree e]

let rec statement_to_tree c =
  match c with
  | VarDeclStm db -> PBox.hlist ~bars:false [PBox.line "DeclStm: "; declaration_block_to_tree db]
  | ExprStm {expr; _} -> PBox.hlist ~bars:false [make_info_node_line "ExprStm: "; Option.fold ~none:PBox.empty ~some:expr_to_tree expr]
  | IfThenElseStm {cond; thbr; elbro; _} ->
    PBox.tree (make_keyword_line "IfStm")
      ([PBox.hlist ~bars:false [make_info_node_line "Cond: "; expr_to_tree cond]; PBox.hlist ~bars:false [make_info_node_line "Then-Branch: "; statement_to_tree thbr]] @
       match elbro with None -> [] | Some elbr -> [PBox.hlist ~bars:false [make_info_node_line "Else-Branch: "; statement_to_tree elbr]])
  | WhileStm {cond; body; _} ->
    PBox.tree (make_keyword_line "WhileStm") 
      [PBox.hlist ~bars:false [make_info_node_line "Cond: "; expr_to_tree cond];
        PBox.hlist ~bars:false [make_info_node_line "Body: "; statement_to_tree body]]
  | ForStm {init; cond; update; body; _} ->
    PBox.tree (make_keyword_line "ForStm") 
      [PBox.hlist ~bars:false [make_info_node_line "Init: "; Option.fold ~none:PBox.empty ~some:for_init_to_tree init];
        PBox.hlist ~bars:false [make_info_node_line "Cond: "; Option.fold ~none:PBox.empty ~some:expr_to_tree cond];
        PBox.hlist ~bars:false [make_info_node_line "Update: "; Option.fold ~none:PBox.empty ~some:expr_to_tree update];
        PBox.hlist ~bars:false [make_info_node_line "Body: "; statement_to_tree body]]
  | BreakStm _ -> make_keyword_line "BreakStm"
  | ContinueStm _ -> make_keyword_line "ContinueStm"
  | CompoundStm {stms; _} -> PBox.tree (make_info_node_line "CompoundStm") (statement_seq_to_forest stms)
  | ReturnStm {ret; _} -> PBox.hlist ~bars:false [make_keyword_line "ReturnValStm: "; expr_to_tree ret]
and statement_seq_to_forest stms = List.map statement_to_tree stms

let program_to_tree prog = 
  PBox.tree (make_info_node_line "Program") (statement_seq_to_forest prog)

6.8.2. pretty printer for ASTs (module TypedPretty)#

The typed AST has not changed compared to the previous assignment. Hence, there is no need to update the TypedPretty module.

6.8.3. C runtime#

The C runtime we use has not changed compared to the previous assignment. See the corresponding description in the previous assignment