Updated Lexer

* Fixed handling for numbers according to lexer rules * Updated README and tests
e3m3 · Sep 13, 2024 · 0b65a54 · 0b65a54
1 parent 200e10f
commit 0b65a54
Show file tree

Hide file tree

Showing 3 changed files with 110 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -19,12 +19,70 @@ Accepted factors in the grammar have been extended for convenience (see `src/{le
 The output of the compiler is LLVM IR or LLVM bytecode [[6]].
 
 
-##  Grammar
+##  Language
+
+### Lexer
+
+```text
+ident           ::= letter+ (letter | digit)*
+number          ::= digit+ | (`0x` hex_digit+)
+digit           ::= [0-9]
+hex_digit       ::= [a-fA-F0-9]
+letter          ::= letter_lower | letter_upper | `_`
+letter_lower    ::= [a-z]
+letter_upper    ::= [A-Z]
+whitespace      ::= ` ` | `\r` | `\n` | `\t`
+
+any             ::= _
+token           ::= { tokenkind, text }
+tokenkind       ::=
+    | Unknown
+    | Comma
+    | Comment
+    | Colon
+    | Eoi
+    | Eol
+    | Ident
+    | Minus
+    | Number
+    | ParenL
+    | ParenR
+    | Plus
+    | Slash
+    | Star
+    | With
+text            ::=
+    | ``
+    | `,`
+    | `/``/` any*
+    | `:`
+    | ident
+    | `-`
+    | number
+    | `(`
+    | `)`
+    | `+`
+    | `/`
+    | `*`
+    | `with`
+```
+
+### Grammar
 
 ```text
-calc : ("with" ":" ident ("," ident)* ":" )? expr
+calc    ::= ( With Colon Ident (Comma Ident)* Colon )? expr
+expr    ::= term ( ( Plus | Minus ) term )*
+factor  ::= Minus? ( Number | Ident | ParenL expr ParenR )
+term    ::= factor ( ( Slash | Star ) factor )*
 ```
 
+Notes:
+
+*   The grammar rules above use the `tokenkind` as a shorthand for a `token` object as described by the lexer rules.
+
+*   In the AST, a factor with a leading `Minus` token is represented as a subtraction expression whose left operand
+    is a `Number` with the constant value `0` (i.e. `-x` is represented as `0 - x`).
+
 
 ##  Prerequisites
 

diff --git a/src/lex.rs b/src/lex.rs
@@ -180,6 +180,16 @@ impl <'a, T: Read> Lexer<'a, T> {
         pos_end
     }
 
+    fn check_suffix(&self, pos: usize) -> () {
+        if self.has_next_in_line(pos) {
+            let c: char = self.next_char_in_line(pos);
+            if !Self::is_whitespace(c) && !Self::is_other(c) {
+                eprintln!("Found invalid suffix '{}' for number in expression", c);
+                exit(ExitCode::LexerError);
+            }
+        }
+    }
+
     fn next_in_line(&mut self, t: &mut Token) -> () {
         let (mut c, mut pos_start): (char, usize) = ('\0', self.position);
         while self.has_next_in_line(pos_start) {
@@ -194,11 +204,13 @@ impl <'a, T: Read> Lexer<'a, T> {
                 c = self.next_char_in_line(pos_start + 1);
                 if c == 'x' {
                     let pos_end: usize = self.collect_token_sequence(pos_start + 2, Self::is_hex_digit);
+                    self.check_suffix(pos_end);
                     self.form_token(t, pos_start, pos_end, TokenKind::Number);
                     return;
                 }
             }
             let pos_end: usize = self.collect_token_sequence(pos_start + 1, Self::is_digit);
+            self.check_suffix(pos_end);
             self.form_token(t, pos_start, pos_end, TokenKind::Number);
         } else if Self::is_letter(c) {
             let pos_end: usize = self.collect_token_sequence(pos_start + 1, Self::is_ident);
@@ -260,6 +272,20 @@ impl <'a, T: Read> Lexer<'a, T> {
         c == '/'
     }
 
+    fn is_other(c: char) -> bool {
+        match c {
+            ',' => true,
+            ':' => true,
+            '-' => true,
+            '(' => true,
+            ')' => true,
+            '+' => true,
+            '/' => true,
+            '*' => true,
+            _   => false,
+        }
+    }
+
     fn is_whitespace(c: char) -> bool {
         c == ' ' || c == '\t' || c == '\r' || c == '\n'
     }
@@ -273,7 +299,7 @@ impl <'a, T: Read> Lexer<'a, T> {
     }
 
     fn is_hex_digit(c: char) -> bool {
-        Self::is_digit(c) || Self::is_letter_lower(c) || Self::is_letter_upper(c)
+        Self::is_digit(c) || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
     }
 
     fn is_letter_lower(c:char) -> bool {

diff --git a/tests/lit-llvm/lex_number_enhanced.calc b/tests/lit-llvm/lex_number_enhanced.calc
@@ -1,8 +1,10 @@
-// RUN: @calcc --verbose --lex -e -9    2>&1    | @filecheck %s --check-prefix=CHECK_A
-// RUN: @calcc --verbose --lex -e "- 9" 2>&1    | @filecheck %s --check-prefix=CHECK_B
-// RUN: @calcc --verbose --lex -e 0xAF  2>&1    | @filecheck %s --check-prefix=CHECK_C
-// RUN: @calcc --verbose --lex -e 0x0c1 2>&1    | @filecheck %s --check-prefix=CHECK_D
-// RUN: @calcc --verbose --lex -e 012   2>&1    | @filecheck %s --check-prefix=CHECK_E
+// RUN:     @calcc --verbose --lex -e -9    2>&1    | @filecheck %s --check-prefix=CHECK_A
+// RUN:     @calcc --verbose --lex -e "- 9" 2>&1    | @filecheck %s --check-prefix=CHECK_B
+// RUN:     @calcc --verbose --lex -e 0xAF  2>&1    | @filecheck %s --check-prefix=CHECK_C
+// RUN:     @calcc --verbose --lex -e 0x0c1 2>&1    | @filecheck %s --check-prefix=CHECK_D
+// RUN:     @calcc --verbose --lex -e 012   2>&1    | @filecheck %s --check-prefix=CHECK_E
+// RUN: not @calcc --verbose --lex -e 0x0z3 2>&1    | @filecheck %s --check-prefix=CHECK_F
+// RUN: not @calcc --verbose --lex -e 140_3 2>&1    | @filecheck %s --check-prefix=CHECK_G
 
 // CHECK_A: Processing input 'Expression:-9'
 // CHECK_A: Read 2 bytes from buffer at line 0
@@ -46,3 +48,19 @@
 // CHECK_E: Found char '2' in line 0 at pos 2
 // CHECK_E: Lexed token 'Number:012'
 // CHECK_E: Lexed token 'Eoi:'
+
+// CHECK_F: Processing input 'Expression:0x0z3'
+// CHECK_F: Read 5 bytes from buffer at line 0
+// CHECK_F: Found char '0' in line 0 at pos 0
+// CHECK_F: Found char 'x' in line 0 at pos 1
+// CHECK_F: Found char '0' in line 0 at pos 2
+// CHECK_F: Found char 'z' in line 0 at pos 3
+// CHECK_F: Found invalid suffix 'z' for number in expression
+
+// CHECK_G: Processing input 'Expression:140_3'
+// CHECK_G: Read 5 bytes from buffer at line 0
+// CHECK_G: Found char '1' in line 0 at pos 0
+// CHECK_G: Found char '4' in line 0 at pos 1
+// CHECK_G: Found char '0' in line 0 at pos 2
+// CHECK_G: Found char '_' in line 0 at pos 3
+// CHECK_G: Found invalid suffix '_' for number in expression