Skip to content

Commit

Permalink
Updated Lexer
Browse files Browse the repository at this point in the history
*   Fixed handling for numbers according to lexer rules

*   Updated README and tests
  • Loading branch information
e3m3 committed Sep 13, 2024
1 parent 200e10f commit 0b65a54
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 8 deletions.
62 changes: 60 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,70 @@ Accepted factors in the grammar have been extended for convenience (see `src/{le
The output of the compiler is LLVM IR or LLVM bytecode [[6]].


## Grammar
## Language

### Lexer

```text
ident ::= letter+ (letter | digit)*
number ::= digit+ | (`0x` hex_digit+)
digit ::= [0-9]
hex_digit ::= [a-fA-F0-9]
letter ::= letter_lower | letter_upper | `_`
letter_lower ::= [a-z]
letter_upper ::= [A-Z]
whitespace ::= ` ` | `\r` | `\n` | `\t`
any ::= _
token ::= { tokenkind, text }
tokenkind ::=
| Unknown
| Comma
| Comment
| Colon
| Eoi
| Eol
| Ident
| Minus
| Number
| ParenL
| ParenR
| Plus
| Slash
| Star
| With
text ::=
| ``
| `,`
| `/``/` any*
| `:`
| ident
| `-`
| number
| `(`
| `)`
| `+`
| `/`
| `*`
| `with`
```

### Grammar

```text
calc : ("with" ":" ident ("," ident)* ":" )? expr
calc ::= ( With Colon Ident (Comma Ident)* Colon )? expr
expr ::= term ( Plus | Minus ) term
factor ::= Minus? ( Number | Ident | ParenL expr ParenR )
term ::= factor ( Slash | Star ) factor
```

Notes:

* The grammar rules above use the `tokenkind` as a shorthand for a `token` object as described by the lexer rules.

* In the AST, a factor with a leading `Minus` token is represented as a subtraction expression where the left term
is `Number` with the constant value `0`.


## Prerequisites

Expand Down
28 changes: 27 additions & 1 deletion src/lex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,16 @@ impl <'a, T: Read> Lexer<'a, T> {
pos_end
}

/// Rejects a number literal that is directly followed by an invalid character.
///
/// `pos` is the position to inspect; callers pass the end position produced
/// while collecting the digits of a number. When the line has a character at
/// `pos`, that character must be whitespace or one of the punctuation
/// characters accepted by `is_other`. Any other character is reported on
/// stderr and the process exits with `ExitCode::LexerError`.
fn check_suffix(&self, pos: usize) -> () {
    // Nothing follows the number on this line, so there is no suffix to check.
    if !self.has_next_in_line(pos) {
        return;
    }

    let suffix: char = self.next_char_in_line(pos);
    let may_follow_number: bool = Self::is_whitespace(suffix) || Self::is_other(suffix);
    if may_follow_number {
        return;
    }

    eprintln!("Found invalid suffix '{}' for number in expression", suffix);
    exit(ExitCode::LexerError);
}

fn next_in_line(&mut self, t: &mut Token) -> () {
let (mut c, mut pos_start): (char, usize) = ('\0', self.position);
while self.has_next_in_line(pos_start) {
Expand All @@ -194,11 +204,13 @@ impl <'a, T: Read> Lexer<'a, T> {
c = self.next_char_in_line(pos_start + 1);
if c == 'x' {
let pos_end: usize = self.collect_token_sequence(pos_start + 2, Self::is_hex_digit);
self.check_suffix(pos_end);
self.form_token(t, pos_start, pos_end, TokenKind::Number);
return;
}
}
let pos_end: usize = self.collect_token_sequence(pos_start + 1, Self::is_digit);
self.check_suffix(pos_end);
self.form_token(t, pos_start, pos_end, TokenKind::Number);
} else if Self::is_letter(c) {
let pos_end: usize = self.collect_token_sequence(pos_start + 1, Self::is_ident);
Expand Down Expand Up @@ -260,6 +272,20 @@ impl <'a, T: Read> Lexer<'a, T> {
c == '/'
}

/// Returns `true` when `c` is one of the single-character punctuation or
/// operator symbols `,` `:` `-` `(` `)` `+` `/` `*`, and `false` otherwise.
fn is_other(c: char) -> bool {
    matches!(c, ',' | ':' | '-' | '(' | ')' | '+' | '/' | '*')
}

/// Returns `true` when `c` is a space, tab, carriage return, or line feed.
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\r' | '\n')
}
Expand All @@ -273,7 +299,7 @@ impl <'a, T: Read> Lexer<'a, T> {
}

/// Returns `true` when `c` is a hexadecimal digit: `0-9`, `a-f`, or `A-F`.
///
/// Only these characters may appear after the `0x` prefix of a number.
/// Letters outside `a-f` / `A-F` (for example `z`) are rejected so that an
/// input such as `0x0z3` stops the digit sequence at `z`.
fn is_hex_digit(c: char) -> bool {
    c.is_ascii_hexdigit()
}

fn is_letter_lower(c:char) -> bool {
Expand Down
28 changes: 23 additions & 5 deletions tests/lit-llvm/lex_number_enhanced.calc
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
// RUN: @calcc --verbose --lex -e -9 2>&1 | @filecheck %s --check-prefix=CHECK_A
// RUN: @calcc --verbose --lex -e "- 9" 2>&1 | @filecheck %s --check-prefix=CHECK_B
// RUN: @calcc --verbose --lex -e 0xAF 2>&1 | @filecheck %s --check-prefix=CHECK_C
// RUN: @calcc --verbose --lex -e 0x0c1 2>&1 | @filecheck %s --check-prefix=CHECK_D
// RUN: @calcc --verbose --lex -e 012 2>&1 | @filecheck %s --check-prefix=CHECK_E
// RUN: @calcc --verbose --lex -e -9 2>&1 | @filecheck %s --check-prefix=CHECK_A
// RUN: @calcc --verbose --lex -e "- 9" 2>&1 | @filecheck %s --check-prefix=CHECK_B
// RUN: @calcc --verbose --lex -e 0xAF 2>&1 | @filecheck %s --check-prefix=CHECK_C
// RUN: @calcc --verbose --lex -e 0x0c1 2>&1 | @filecheck %s --check-prefix=CHECK_D
// RUN: @calcc --verbose --lex -e 012 2>&1 | @filecheck %s --check-prefix=CHECK_E
// RUN: not @calcc --verbose --lex -e 0x0z3 2>&1 | @filecheck %s --check-prefix=CHECK_F
// RUN: not @calcc --verbose --lex -e 140_3 2>&1 | @filecheck %s --check-prefix=CHECK_G

// CHECK_A: Processing input 'Expression:-9'
// CHECK_A: Read 2 bytes from buffer at line 0
Expand Down Expand Up @@ -46,3 +48,19 @@
// CHECK_E: Found char '2' in line 0 at pos 2
// CHECK_E: Lexed token 'Number:012'
// CHECK_E: Lexed token 'Eoi:'

// CHECK_F: Processing input 'Expression:0x0z3'
// CHECK_F: Read 5 bytes from buffer at line 0
// CHECK_F: Found char '0' in line 0 at pos 0
// CHECK_F: Found char 'x' in line 0 at pos 1
// CHECK_F: Found char '0' in line 0 at pos 2
// CHECK_F: Found char 'z' in line 0 at pos 3
// CHECK_F: Found invalid suffix 'z' for number in expression

// CHECK_G: Processing input 'Expression:140_3'
// CHECK_G: Read 5 bytes from buffer at line 0
// CHECK_G: Found char '1' in line 0 at pos 0
// CHECK_G: Found char '4' in line 0 at pos 1
// CHECK_G: Found char '0' in line 0 at pos 2
// CHECK_G: Found char '_' in line 0 at pos 3
// CHECK_G: Found invalid suffix '_' for number in expression

0 comments on commit 0b65a54

Please sign in to comment.