lexer.bah

#import "iostream.bah"
#import "string.bah"

//token types
//tokenType is an alias of char; every TOKEN_* constant below is a value that Tok.type can take.
//In this file the lexer emits INT, FLOAT, VAR, ENCL, SEP, STR, KEYWORD, SYNTAX and CAST;
//CHAR, BOOL and FUNC are declared here but not produced by lexer().
//Note: the value 9 is not assigned to any type.
#define tokenType char

const TOKEN_NO_TYPE =      <tokenType>-1
const TOKEN_TYPE_INT =     <tokenType>0
const TOKEN_TYPE_FLOAT =   <tokenType>1
const TOKEN_TYPE_VAR =     <tokenType>2
const TOKEN_TYPE_ENCL =    <tokenType>3
const TOKEN_TYPE_SEP =     <tokenType>4
const TOKEN_TYPE_STR =     <tokenType>5
const TOKEN_TYPE_KEYWORD = <tokenType>6
const TOKEN_TYPE_CHAR =    <tokenType>7
const TOKEN_TYPE_BOOL =    <tokenType>8
const TOKEN_TYPE_SYNTAX =  <tokenType>10
const TOKEN_TYPE_FUNC =    <tokenType>11
const TOKEN_TYPE_CAST =    <tokenType>12

//a token (the atomic unit for describing the input Bah file)
//  cont:        text of the token
//  ogCont:      set to the same text as cont when the token is created by makeToken()
//  type:        one of the TOKEN_* constants (TOKEN_NO_TYPE until set)
//  pos:         index of the token's first character in the source
//  linePos:     position of the token relative to the start of a line, as passed to makeToken()
//  line:        line number passed to makeToken() (lines are counted from 1)
//  begLine:     line on which the token begins; lexer() sets it for quoted literals
//  isValue:     set to true by makeToken() for INT, FLOAT, STR, VAR, BOOL and CHAR tokens
//  highlighted: never written in this file — NOTE(review): presumably used by later stages, confirm
struct Tok {
    cont: str = ""
    ogCont: str = ""
    type: tokenType = TOKEN_NO_TYPE
    pos: uint32 = 0
    linePos: uint32 = 0
    line: uint32 = 1
    begLine: uint32 = 1
    isValue: bool = false
    highlighted: bool = false
}


//Tells whether the character needle is one of the elements of arr.
//Returns false for an empty array.
inArray(needle char, arr []char) bool {
    count = len(arr)
    idx = 0; for idx < count, idx++ {
        candidate = arr[idx]
        if needle == candidate {
            return true
        }
    }
    return false
}

//Tells whether the string needle is equal to one of the elements of arr.
//Returns false for an empty array.
inArrayStr(needle str, arr []str) bool {
    count = len(arr)
    idx = 0; for idx < count, idx++ {
        candidate = arr[idx]
        if needle == candidate {
            return true
        }
    }
    return false
}

//Builds a Tok out of the characters accumulated in cont.
//  pos:     index of the token in the source
//  linePos: position of the token relative to the start of a line
//  lineNb:  line number stored in the token
//  cont:    characters of the token; this array is cleared before returning
//  type:    one of the TOKEN_* constants
//The returned token has isValue set for INT, FLOAT, STR, CHAR, VAR and BOOL types.
makeToken(pos int, linePos int, lineNb int, cont []char, type tokenType) Tok {
    //the text must be extracted before the buffer is emptied
    text = arrToStr(cont)
    clear(cont)

    tok = Tok{}
    tok.type = type
    tok.pos = pos
    tok.linePos = linePos
    tok.line = lineNb
    tok.cont = text
    tok.ogCont = text

    if type == TOKEN_TYPE_INT || type == TOKEN_TYPE_FLOAT {
        tok.isValue = true
    } else if type == TOKEN_TYPE_STR || type == TOKEN_TYPE_CHAR {
        tok.isValue = true
    } else if type == TOKEN_TYPE_VAR || type == TOKEN_TYPE_BOOL {
        tok.isValue = true
    }
    return tok
}

//Tells whether c is a minus sign directly followed by a digit (nc),
//i.e. whether c starts a negative number.
isMinus(c char, nc char) bool {
    if c != '-' {
        return false
    }
    return isNumber(nc)
}

//characters that each form a one-character TOKEN_TYPE_ENCL token
enclavers = []char{'(', ')', '{', '}', '[', ']'}
//characters that can start or continue a TOKEN_TYPE_SYNTAX token
syntaxes = []char{'!', '=', '|', <char>38, '%', '+', '-', '*', '/', ',', '<', '>', ':', <char>59, '^'}
//in the list above, <char>38 is '&' and <char>59 is ';'

//words that lexer() tags as TOKEN_TYPE_KEYWORD instead of TOKEN_TYPE_VAR
keywords = []str{"if", "else", "for", "struct", "const", "return", "extend", "new", "break", "continue", "default", "switch", "case", "while", "typedef", "function", "async", "in", "chan", "map", "buffer", "capture", "then"}

//Splits Bah source text into a flat list of tokens.
//  s:          the source text, read character by character
//  codeLength: number of characters of s to scan
//Returns the tokens in the order they appear in the source.
//Characters that start no token (spaces, line returns, anything unrecognized) are skipped.
lexer(s <any>, codeLength int) []Tok {

    tokens = []Tok

    //characters of the token being built; makeToken() empties it once the token is created
    memory = []char

    lineNb = 1 //current line number (in source file)
    posOffset = 0 //index of the first character of the current line

    i=0; for i < codeLength, i++ {
        c = s[i]
        //next character, or 0 when c is the last one
        nc char = <char>0
        if i+1 < codeLength {
            nc = s[i+1]
        }

        //we hit the new line character
        if c == <char>10 {
            lineNb++
            posOffset = i+1
        }

        //token is a quoted literal: <char>34 is the double quote, <char>39 the single quote
        if c == <char>34 || c == <char>39 {

            pos = i
            //computed before scanning: posOffset moves if the literal spans several lines
            strLinePos = pos - posOffset
            begLine = lineNb
            //only the quote that opened the literal can close it
            openQuote = c
            //true when the previous character was an unescaped backslash (<char>92)
            isEscaped = false
            memory[0] = c
            i++
            for i < codeLength, i++ {
                c = s[i]
                //line returns inside the literal still count as lines
                if c == <char>10 {
                    lineNb++
                    posOffset = i+1
                }
                memory[len(memory)] = c
                if isEscaped == true {
                    //an escaped character never ends the literal, even a quote or a backslash
                    isEscaped = false
                } else if c == <char>92 {
                    isEscaped = true
                } else if c == openQuote {
                    //literal ending
                    break
                }
            }
            tokens[len(tokens)] = makeToken(pos, strLinePos, lineNb, memory, TOKEN_TYPE_STR)
            //the token keeps the line it ends on in 'line' and the line it starts on in 'begLine'
            lt = tokens[len(tokens) - 1]
            lt.begLine = begLine
            tokens[len(tokens) - 1] = lt
        } else if isNumber(c) || isMinus(c, nc) { //token is a number

            memory[0] = c
            pos = i
            i++
            currentType = TOKEN_TYPE_INT
            isHex = false
            for i < codeLength, i++ {
                c = s[i]
                if c == <char>46 {
                    //<char>46 is the dot: the number is a float
                    currentType = TOKEN_TYPE_FLOAT
                } else if isNumber(c) == false {
                    if isHex == false {
                        //outside of hex mode the only accepted non-digit is the 'x' that enters it
                        if c == 'x' {
                            isHex = true
                        } else {
                            break
                        }
                    } else {
                        //hex digits are stored in lower case
                        if isUpper(c) {
                            c += <char>32
                        }
                        if c < 'a' || c > 'f' {
                            break
                        }
                    }
                }
                memory[len(memory)] = c
            }
            //step back so the outer loop re-reads the character that ended the number
            i--
            tokens[len(tokens)] = makeToken(pos, pos - posOffset, lineNb, memory, currentType)
        } else if c == <char>35 { //is a hash keyword

            pos = i
            memory[0] = c
            i++
            for i < codeLength, i++ {
                c = s[i]
                if isAlphaNumeric(c) == false {
                    break
                }
                memory[len(memory)] = c
            }
            //step back so the outer loop re-reads the character that ended the keyword
            i--
            tokens[len(tokens)] = makeToken(pos, pos - posOffset, lineNb, memory, TOKEN_TYPE_KEYWORD)
        } else if inArray(c, enclavers) { //token is enclaver

            memory[0] = c
            tokens[len(tokens)] = makeToken(i, i - posOffset, lineNb, memory, TOKEN_TYPE_ENCL)
        } else if inArray(c, syntaxes) { //token is a syntax element

            //a '<' may open a cast: '<', then only type characters, then '>'
            if c == '<' {
                pos = i
                isCast = false
                memory[0] = c
                i++
                for i < codeLength, i++ {
                    c = s[i]
                    if c == '>' {
                        isCast = true
                        memory[len(memory)] = c
                        break
                    }
                    if isAlphaNumeric(c) == false && c != '*' && c != ':' && c != '_' && c != '[' && c != ']' {
                        break
                    }
                    memory[len(memory)] = c
                }
                if isCast == true {
                    tokens[len(tokens)] = makeToken(pos, pos - posOffset, lineNb, memory, TOKEN_TYPE_CAST)
                    continue
                }
                //not a cast: rewind to the '<' and lex it as a regular syntax element
                i = pos
                c = '<'
                clear(memory)
            }
            
            memory[0] = c
            pos = i
            i++
            //first character of the syntax element, decides what may follow it
            fc = c
            for i < codeLength, i++ {
                c = s[i]
                if c in syntaxes == false {
                    break
                }
                //for <=, >=, ==, !=, +=, -=, &&, ||, <<
                if fc == '<' {
                    if c != '-' && c != '=' && c != '<' {
                        break
                    }
                } else if c == '|' {
                    if fc != c {
                        break
                    }
                } else if c == '&' {
                    if fc != c {
                        break
                    }
                } else if c != '=' {
                    if c != '>' {
                        break
                    }
                }
                memory[len(memory)] = c
            }
            //step back so the outer loop re-reads the character that ended the syntax element
            i--
            tokens[len(tokens)] = makeToken(pos, pos - posOffset, lineNb, memory, TOKEN_TYPE_SYNTAX)
        } else if c == '.' { //token is a separator

            memory[0] = c
            tokens[len(tokens)] = makeToken(i, i - posOffset, lineNb, memory, TOKEN_TYPE_SEP)
        } else if isAlphaNumeric(c) || c == '_' { //token is a var / keyword

            memory[0] = c
            pos = i
            i++
            //get full token: letters, digits and underscores
            for i < codeLength, i++ {
                c = s[i]
                if isAlphaNumeric(c) == false && c != '_' {
                    break
                }
                memory[len(memory)] = c
            }
            //step back so the outer loop re-reads the character that ended the word
            i--
            currentType = TOKEN_TYPE_VAR
            
            memstr = arrToStr(memory)

            if memstr in keywords {
                currentType = TOKEN_TYPE_KEYWORD
            }

            tokens[len(tokens)] = makeToken(pos, pos - posOffset, lineNb, memory, currentType)
        }

    }
    return tokens
}