//* The lexer: turns ocen source text into a stream of tokens.
import std::vector::Vector
import std::span::{Span, Location}
import std::buffer::Buffer
import @errors::Error
import @tokens::{Token, TokenType}
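// Returns true for any hexadecimal digit: 0-9, a-f, or A-F.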
def is_hex_digit(c: char): bool {
if c.is_digit() return true
if 'a' <= c <= 'f' return true
if 'A' <= c <= 'F' return true
return false
}
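// All the state needed to tokenize one file: `i` is the byte offset into
// `source`, `loc` tracks the human-readable position, and `comment` buffers
// doc-comment text (starting at `comment_start`) to attach to the next token.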
struct Lexer {
source: str
source_len: u32
i: u32
loc: Location
seen_newline: bool
tokens: &Vector<&Token>
errors: &Vector<&Error>
in_comment: bool
comment: Buffer
comment_start: Location
}
def Lexer::make(source: str, filename: str): Lexer {
let start_loc = Location(filename, 1, 1, 0)
return Lexer(
source,
source_len: source.len(),
i: 0,
loc: start_loc,
seen_newline: false,
tokens: Vector<&Token>::new(),
errors: Vector<&Error>::new(),
in_comment: false,
comment: Buffer::make(),
comment_start: start_loc,
)
}
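// Pushes a token, attaching the pending newline flag and any buffered
// doc-comment, then resets that per-token state.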
def Lexer::push(&this, token: &Token) {
token.seen_newline = .seen_newline
if .comment.size > 0 {
token.comment = .comment.new_str()
token.comment_loc = .comment_start
}
.comment.clear()
.tokens.push(token)
.seen_newline = false
.in_comment = false
}
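// Pushes a fixed-width token of the given type, consuming `len` characters.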
def Lexer::push_type(&this, type: TokenType, len: u32) {
let start_loc = .loc
for let i = 0; i < len; i += 1 {
.inc()
}
.push(Token::from_type(type, Span(start_loc, .loc)))
}
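// Returns the current character; reads '\0' once the end of the source is
// reached (strings here are null-terminated).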
def Lexer::cur(&this): char => .source[.i]
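// Consumes one character, keeping the line/column/offset bookkeeping in sync
// and noting when a newline is crossed.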
def Lexer::inc(&this) {
match .cur() {
'\n' => {
.loc.line += 1
.loc.col = 1
.seen_newline = true
}
else => .loc.col += 1
}
.i += 1
.loc.index += 1
}
def Lexer::peek(&this, offset: i32): char {
// Nothing to peek at past the end of the source
if .cur() == '\0' {
return .cur()
}
return .source[.i + (offset as u32)]
}
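// Lexes a character literal such as 'a' or '\n'; the stored text excludes
// the surrounding quotes.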
def Lexer::lex_char_literal(&this) {
let start_loc = .loc
let start = .i + 1
.inc()
if .cur() == '\\' {
.inc()
}
.inc()
if .cur() != '\'' {
.errors.push(Error::new(Span(.loc, .loc), "Expected ' after character literal"))
}
let len = .i - start
let text = .source.substring(start, len)
.inc()
.push(Token::new(TokenType::CharLiteral, Span(start_loc, .loc), text))
}
// Format strings can be specified JS-style with backticks, or Python-style with f"..."
// Backticks can be detected directly in here, but for `f"..."` the caller must tell
// us (via `has_seen_f`) whether it saw an `f` right before the string literal.
// For example, "hi" lexes as a StringLiteral, while `hi {x}` and f"hi {x}"
// both lex as FormatStringLiteral tokens.
def Lexer::lex_string_literal(&this, has_seen_f: bool) {
let start_loc = .loc
let end_char = .cur()
let start = .i + 1
.inc()
while .i < .source_len and .cur() != end_char {
if .cur() == '\\' {
.inc()
}
.inc()
}
let len = .i - start
let text = .source.substring(start, len)
.inc()
if .i >= .source_len {
.errors.push(Error::new(Span(.loc, .loc), "Unterminated string literal"))
}
if end_char == '`' or has_seen_f {
.push(Token::new(TokenType::FormatStringLiteral, Span(start_loc, .loc), text))
} else {
.push(Token::new(TokenType::StringLiteral, Span(start_loc, .loc), text))
}
}
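// Lexes the rest of an integer literal in a non-decimal base, e.g. 0xdeadbeef
// or 0b1011; entered with the cursor still on the leading '0'.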
def Lexer::lex_int_literal_different_base(&this): &Token {
let start_loc = .loc
let start = .i
.inc()
match .cur() {
'x' => {
.inc()
while .i < .source_len and is_hex_digit(.cur()) {
.inc()
}
}
'b' => {
.inc()
while .i < .source_len and (.cur() == '0' or .cur() == '1') {
.inc()
}
}
else => assert false, "Invalid base for int literal"
}
let len = .i - start
let text = .source.substring(start, len)
return Token::new(TokenType::IntLiteral, Span(start_loc, .loc), text)
}
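// Lexes the digits of a numeric literal (no type suffix), picking
// FloatLiteral over IntLiteral when a '.' is present.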
def Lexer::lex_numeric_literal_helper(&this): &Token {
let start_loc = .loc
if .cur() == '0' {
match .peek(1) {
'x' | 'b' => {
return .lex_int_literal_different_base()
}
// Do nothing, fall through
else => {}
}
}
let start = .i
let token_type: TokenType
while .cur().is_digit() {
.inc()
}
if .cur() == '.' {
.inc()
while .cur().is_digit() {
.inc()
}
token_type = TokenType::FloatLiteral
} else {
token_type = TokenType::IntLiteral
}
let len = .i - start
let text = .source.substring(start, len)
return Token::new(token_type, Span(start_loc, .loc), text)
}
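// Lexes a full numeric literal, including an optional type suffix that
// starts with 'u', 'i', or 'f' (e.g. 42u8 or 3.14f64).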
def Lexer::lex_numeric_literal(&this) {
let token = .lex_numeric_literal_helper()
if .cur() == 'u' or .cur() == 'i' or .cur() == 'f' {
let start_loc = .loc
let start = .i
.inc()
while .i < .source_len and .cur().is_digit() {
.inc()
}
let len = .i - start
let suffix = .source.substring(start, len)
token.suffix = Token::from_ident(suffix, Span(start_loc, .loc))
}
.push(token)
}
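// Lexes a '//' comment to the end of the line. Only comments marked with
// a leading '*' or '.' (i.e. '//*' or '//.') are saved to be attached to
// the next token; everything else is discarded.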
def Lexer::lex_comment(&this) {
// Skip leading slashes
while .cur() == '/' { .inc() }
// We only save comments that have a leading asterisk or dot
let save_comment = false
if .cur() == '*' or .cur() == '.' {
.inc()
save_comment = true
if .comment.size == 0 {
.comment_start = .loc
}
}
// Skip at most one space or tab after the comment marker
if .cur() == ' ' or .cur() == '\t' { .inc() }
// Read the comment and store it into the buffer
while .i < .source_len and .cur() != '\n' {
if save_comment then .comment.putc(.cur())
.inc()
}
if save_comment then .comment.putc('\n')
}
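// Main entry point: tokenizes the entire source, collecting errors along
// the way, and returns the token list terminated by an EOF token.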
def Lexer::lex(&this): &Vector<&Token> {
while .i < .source_len {
let c = .cur()
match c {
' ' | '\t' | '\v' | '\r' | '\b' | '\n' => {
.inc()
}
';' => .push_type(TokenType::Semicolon, len: 1)
',' => .push_type(TokenType::Comma, len: 1)
'.' => .push_type(TokenType::Dot, len: 1)
'(' => .push_type(TokenType::OpenParen, len: 1)
')' => .push_type(TokenType::CloseParen, len: 1)
'[' => .push_type(TokenType::OpenSquare, len: 1)
']' => .push_type(TokenType::CloseSquare, len: 1)
'{' => .push_type(TokenType::OpenCurly, len: 1)
'}' => .push_type(TokenType::CloseCurly, len: 1)
'@' => .push_type(TokenType::AtSign, len: 1)
'%' => .push_type(TokenType::Percent, len: 1)
'^' => .push_type(TokenType::Caret, len: 1)
'&' => .push_type(TokenType::Ampersand, len: 1)
'|' => .push_type(TokenType::Line, len: 1)
'?' => .push_type(TokenType::Question, len: 1)
'~' => .push_type(TokenType::Tilde, len: 1)
'!' => match .peek(1) {
'=' => .push_type(TokenType::NotEquals, len: 2)
else => .push_type(TokenType::Exclamation, len: 1)
}
':' => match .peek(1) {
':' => .push_type(TokenType::ColonColon, len: 2)
else => .push_type(TokenType::Colon, len: 1)
}
'=' => match .peek(1) {
'=' => .push_type(TokenType::EqualEquals, len: 2)
'>' => .push_type(TokenType::FatArrow, len: 2)
else => .push_type(TokenType::Equals, len: 1)
}
'*' => match .peek(1) {
'=' => .push_type(TokenType::StarEquals, len: 2)
else => .push_type(TokenType::Star, len: 1)
}
'+' => match .peek(1) {
'=' => .push_type(TokenType::PlusEquals, len: 2)
'+' => .push_type(TokenType::PlusPlus, len: 2)
else => .push_type(TokenType::Plus, len: 1)
}
'-' => match .peek(1) {
'=' => .push_type(TokenType::MinusEquals, len: 2)
'-' => .push_type(TokenType::MinusMinus, len: 2)
else => .push_type(TokenType::Minus, len: 1)
}
'<' => match .peek(1) {
'=' => .push_type(TokenType::LessThanEquals, len: 2)
else => .push_type(TokenType::LessThan, len: 1)
}
'>' => match .peek(1) {
'=' => .push_type(TokenType::GreaterThanEquals, len: 2)
else => .push_type(TokenType::GreaterThan, len: 1)
}
'/' => match .peek(1) {
'/' => .lex_comment()
'=' => .push_type(TokenType::SlashEquals, len: 2)
else => .push_type(TokenType::Slash, len: 1)
}
'\'' => .lex_char_literal()
'"' | '`' => .lex_string_literal(has_seen_f: false)
else => {
let start_loc = .loc
if c == 'f' and .peek(1) == '"' {
.inc()
.lex_string_literal(has_seen_f: true)
} else if c.is_digit() {
.lex_numeric_literal()
} else if c.is_alpha() or c == '_' {
let start = .i
while .cur().is_alnum() or .cur() == '_' {
.inc()
}
let len = .i - start
let text = .source.substring(start, len)
.push(Token::from_ident(text, Span(start_loc, .loc)))
} else {
.errors.push(Error::new(Span(.loc, .loc), `Unrecognized char in lexer: '{c}'`))
.inc()
}
}
}
}
// We can assume EOF acts like a newline
.seen_newline = true
.push_type(TokenType::EOF, len: 0)
return .tokens
}
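// Example usage (a sketch, not part of this file; assumes ocen's std
// `println` and that iterating a &Vector<&Token> works as elsewhere in
// the compiler):
//
//     let lexer = Lexer::make("let x = 42", "example.oc")
//     let tokens = lexer.lex()
//     for token in tokens.iter() {
//         println(`{token.type} '{token.text}'`)
//     }
//
// Lexing problems are accumulated in `lexer.errors` rather than aborting.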