explorer/syntax/lexer.lpp

/*
Part of the Carbon Language project, under the Apache License v2.0 with LLVM
Exceptions. See /LICENSE for license information.
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/

%{
  #include <cstdlib>

  #include "common/check.h"
  #include "common/error.h"
  #include "explorer/syntax/lex_helper.h"
  #include "explorer/syntax/lex_scan_helper.h"
  #include "explorer/syntax/parse_and_lex_context.h"
  #include "explorer/syntax/parser.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/Support/FormatVariadic.h"
%}

/* Turn off legacy bits we don't need. */
%option noyywrap nounput nodefault

%option reentrant

/* Lexing a token immediately after consuming some whitespace. */
%s AFTER_WHITESPACE
/*
 * Lexing a token immediately after consuming an operand-ending token:
 * a closing bracket, identifier, or literal.
 */
%s AFTER_OPERAND

/*
 * Named patterns for keywords and for operator and punctuation tokens with a
 * fixed spelling. Each name is expanded as {NAME} by a rule in the rules
 * section further down.
 */
/* table-begin */
ABSTRACT             "abstract"
ADDR                 "addr"
ALIAS                "alias"
AMPERSAND            "&"
AND                  "and"
API                  "api"
ARROW                "->"
AS                   "as"
AUTO                 "auto"
AWAIT                "__await"
BASE                 "base"
BOOL                 "bool"
BREAK                "break"
CARET                "^"
CASE                 "case"
CHOICE               "choice"
CLASS                "class"
COLON                ":"
COLON_BANG           ":!"
COMMA                ","
CONSTRAINT           "constraint"
CONTINUATION         "__continuation"
CONTINUATION_TYPE    "__Continuation"
CONTINUE             "continue"
DEFAULT              "default"
DESTRUCTOR           "destructor"
DOUBLE_ARROW         "=>"
ELSE                 "else"
EQUAL                "="
EQUAL_EQUAL          "=="
EXTENDS              "extends"
EXTERNAL             "external"
FALSE                "false"
FN                   "fn"
FN_TYPE              "__Fn"
FOR                  "for"
FORALL               "forall"
GREATER              ">"
GREATER_EQUAL        ">="
GREATER_GREATER      ">>"
IF                   "if"
IMPL                 "impl"
IMPORT               "import"
IN                   "in"
INTERFACE            "interface"
IS                   "is"
LEFT_CURLY_BRACE     "{"
LEFT_PARENTHESIS     "("
LEFT_SQUARE_BRACKET  "["
LESS                 "<"
LESS_EQUAL           "<="
LESS_LESS            "<<"
LET                  "let"
LIBRARY              "library"
MATCH                "match"
MINUS                "-"
MIX                  "__mix"
MIXIN                "__mixin"
NOT                  "not"
NOT_EQUAL            "!="
OR                   "or"
PACKAGE              "package"
PERCENT              "%"
PERIOD               "."
PIPE                 "|"
PLUS                 "+"
RETURN               "return"
RETURNED             "returned"
RIGHT_CURLY_BRACE    "}"
RIGHT_PARENTHESIS    ")"
RIGHT_SQUARE_BRACKET "]"
RUN                  "__run"
SELF                 "Self"
SEMICOLON            ";"
SLASH                "/"
STRING               "String"
THEN                 "then"
TRUE                 "true"
TYPE                 "Type"
UNDERSCORE           "_"
UNIMPL_EXAMPLE       "__unimplemented_example_infix"
VAR                  "var"
WHERE                "where"
WHILE                "while"
/* table-end */

/*
 * Patterns for tokens without a fixed spelling, plus helper character classes:
 *
 * - identifier: a letter or underscore, then any letters, digits, underscores.
 * - intrinsic_identifier: `Print`, or any name starting with `__intrinsic_`.
 * - sized_type_literal: `i`, `u`, or `f` followed by a decimal number that has
 *   no leading zero (for example `i32`).
 * - integer_literal: one or more decimal digits.
 * - horizontal_whitespace: space, tab, or carriage return.
 * - whitespace: horizontal_whitespace or a newline.
 * - one_line_comment: `//` through the newline that ends the line. The newline
 *   is part of the pattern.
 *   NOTE(review): because the newline is required, a `//` comment on a final
 *   line with no terminating newline does not match this pattern -- confirm
 *   whether that is intended.
 * - operand_start: a character treated as the start of an operand; used as
 *   trailing context by the AFTER_OPERAND binary `*` rule.
 */
/* This should be kept table-like, but isn't automatic due to spaces. */
identifier            [A-Za-z_][A-Za-z0-9_]*
/* TODO: Remove Print special casing once we have variadics or overloads. */
intrinsic_identifier  (Print|__intrinsic_[A-Za-z0-9_]*)
sized_type_literal    [iuf][1-9][0-9]*
integer_literal       [0-9]+
horizontal_whitespace [ \t\r]
whitespace            [ \t\r\n]
one_line_comment      \/\/[^\n]*\n
operand_start         [(A-Za-z0-9_\"]

%%

%{
  // Code run each time yylex is called.
  //
  // flex places a block that precedes the first rule at the top of the
  // scanning routine, so this runs once per call and not once per matched
  // rule. Rules that do not return a token (whitespace, comments) therefore
  // call step() themselves.

  // Begin with an empty token span starting where its previous end was.
  context.current_token_position.step();
%}

 /*
  * Keywords and fixed-spelling tokens whose actions only return the token.
  * flex picks the longest match and, when several rules match the same
  * length, the rule listed first. These rules must therefore stay ahead of
  * the {identifier} rule so that a keyword wins over an identifier of the
  * same spelling. The closing brackets are handled separately below because
  * they also switch the start condition.
  */
 /* table-begin */
{ABSTRACT}            { return CARBON_SIMPLE_TOKEN(ABSTRACT);            }
{ADDR}                { return CARBON_SIMPLE_TOKEN(ADDR);                }
{ALIAS}               { return CARBON_SIMPLE_TOKEN(ALIAS);               }
{AMPERSAND}           { return CARBON_SIMPLE_TOKEN(AMPERSAND);           }
{AND}                 { return CARBON_SIMPLE_TOKEN(AND);                 }
{API}                 { return CARBON_SIMPLE_TOKEN(API);                 }
{ARROW}               { return CARBON_SIMPLE_TOKEN(ARROW);               }
{AS}                  { return CARBON_SIMPLE_TOKEN(AS);                  }
{AUTO}                { return CARBON_SIMPLE_TOKEN(AUTO);                }
{AWAIT}               { return CARBON_SIMPLE_TOKEN(AWAIT);               }
{BASE}                { return CARBON_SIMPLE_TOKEN(BASE);                }
{BOOL}                { return CARBON_SIMPLE_TOKEN(BOOL);                }
{BREAK}               { return CARBON_SIMPLE_TOKEN(BREAK);               }
{CARET}               { return CARBON_SIMPLE_TOKEN(CARET);               }
{CASE}                { return CARBON_SIMPLE_TOKEN(CASE);                }
{CHOICE}              { return CARBON_SIMPLE_TOKEN(CHOICE);              }
{CLASS}               { return CARBON_SIMPLE_TOKEN(CLASS);               }
{COLON_BANG}          { return CARBON_SIMPLE_TOKEN(COLON_BANG);          }
{COLON}               { return CARBON_SIMPLE_TOKEN(COLON);               }
{COMMA}               { return CARBON_SIMPLE_TOKEN(COMMA);               }
{CONSTRAINT}          { return CARBON_SIMPLE_TOKEN(CONSTRAINT);          }
{CONTINUATION_TYPE}   { return CARBON_SIMPLE_TOKEN(CONTINUATION_TYPE);   }
{CONTINUATION}        { return CARBON_SIMPLE_TOKEN(CONTINUATION);        }
{CONTINUE}            { return CARBON_SIMPLE_TOKEN(CONTINUE);            }
{DEFAULT}             { return CARBON_SIMPLE_TOKEN(DEFAULT);             }
{DESTRUCTOR}          { return CARBON_SIMPLE_TOKEN(DESTRUCTOR);          }
{DOUBLE_ARROW}        { return CARBON_SIMPLE_TOKEN(DOUBLE_ARROW);        }
{ELSE}                { return CARBON_SIMPLE_TOKEN(ELSE);                }
{EQUAL_EQUAL}         { return CARBON_SIMPLE_TOKEN(EQUAL_EQUAL);         }
{EQUAL}               { return CARBON_SIMPLE_TOKEN(EQUAL);               }
{EXTENDS}             { return CARBON_SIMPLE_TOKEN(EXTENDS);             }
{EXTERNAL}            { return CARBON_SIMPLE_TOKEN(EXTERNAL);            }
{FALSE}               { return CARBON_SIMPLE_TOKEN(FALSE);               }
{FN_TYPE}             { return CARBON_SIMPLE_TOKEN(FN_TYPE);             }
{FN}                  { return CARBON_SIMPLE_TOKEN(FN);                  }
{FORALL}              { return CARBON_SIMPLE_TOKEN(FORALL);              }
{FOR}                 { return CARBON_SIMPLE_TOKEN(FOR);                 }
{GREATER_EQUAL}       { return CARBON_SIMPLE_TOKEN(GREATER_EQUAL);       }
{GREATER_GREATER}     { return CARBON_SIMPLE_TOKEN(GREATER_GREATER);     }
{GREATER}             { return CARBON_SIMPLE_TOKEN(GREATER);             }
{IF}                  { return CARBON_SIMPLE_TOKEN(IF);                  }
{IMPL}                { return CARBON_SIMPLE_TOKEN(IMPL);                }
{IMPORT}              { return CARBON_SIMPLE_TOKEN(IMPORT);              }
{INTERFACE}           { return CARBON_SIMPLE_TOKEN(INTERFACE);           }
{IN}                  { return CARBON_SIMPLE_TOKEN(IN);                  }
{IS}                  { return CARBON_SIMPLE_TOKEN(IS);                  }
{LEFT_CURLY_BRACE}    { return CARBON_SIMPLE_TOKEN(LEFT_CURLY_BRACE);    }
{LEFT_PARENTHESIS}    { return CARBON_SIMPLE_TOKEN(LEFT_PARENTHESIS);    }
{LEFT_SQUARE_BRACKET} { return CARBON_SIMPLE_TOKEN(LEFT_SQUARE_BRACKET); }
{LESS_EQUAL}          { return CARBON_SIMPLE_TOKEN(LESS_EQUAL);          }
{LESS_LESS}           { return CARBON_SIMPLE_TOKEN(LESS_LESS);           }
{LESS}                { return CARBON_SIMPLE_TOKEN(LESS);                }
{LET}                 { return CARBON_SIMPLE_TOKEN(LET);                 }
{LIBRARY}             { return CARBON_SIMPLE_TOKEN(LIBRARY);             }
{MATCH}               { return CARBON_SIMPLE_TOKEN(MATCH);               }
{MINUS}               { return CARBON_SIMPLE_TOKEN(MINUS);               }
{MIXIN}               { return CARBON_SIMPLE_TOKEN(MIXIN);               }
{MIX}                 { return CARBON_SIMPLE_TOKEN(MIX);                 }
{NOT_EQUAL}           { return CARBON_SIMPLE_TOKEN(NOT_EQUAL);           }
{NOT}                 { return CARBON_SIMPLE_TOKEN(NOT);                 }
{OR}                  { return CARBON_SIMPLE_TOKEN(OR);                  }
{PACKAGE}             { return CARBON_SIMPLE_TOKEN(PACKAGE);             }
{PERCENT}             { return CARBON_SIMPLE_TOKEN(PERCENT);             }
{PERIOD}              { return CARBON_SIMPLE_TOKEN(PERIOD);              }
{PIPE}                { return CARBON_SIMPLE_TOKEN(PIPE);                }
{PLUS}                { return CARBON_SIMPLE_TOKEN(PLUS);                }
{RETURNED}            { return CARBON_SIMPLE_TOKEN(RETURNED);            }
{RETURN}              { return CARBON_SIMPLE_TOKEN(RETURN);              }
{RUN}                 { return CARBON_SIMPLE_TOKEN(RUN);                 }
{SELF}                { return CARBON_SIMPLE_TOKEN(SELF);                }
{SEMICOLON}           { return CARBON_SIMPLE_TOKEN(SEMICOLON);           }
{SLASH}               { return CARBON_SIMPLE_TOKEN(SLASH);               }
{STRING}              { return CARBON_SIMPLE_TOKEN(STRING);              }
{THEN}                { return CARBON_SIMPLE_TOKEN(THEN);                }
{TRUE}                { return CARBON_SIMPLE_TOKEN(TRUE);                }
{TYPE}                { return CARBON_SIMPLE_TOKEN(TYPE);                }
{UNDERSCORE}          { return CARBON_SIMPLE_TOKEN(UNDERSCORE);          }
{UNIMPL_EXAMPLE}      { return CARBON_SIMPLE_TOKEN(UNIMPL_EXAMPLE);      }
{VAR}                 { return CARBON_SIMPLE_TOKEN(VAR);                 }
{WHERE}               { return CARBON_SIMPLE_TOKEN(WHERE);               }
{WHILE}               { return CARBON_SIMPLE_TOKEN(WHILE);               }
 /* table-end */

 /*
  * End of input is reported to the parser as an explicit END_OF_FILE token.
  * More modern Bisons provide make_EOF.
  */
<<EOF>>               { return CARBON_SIMPLE_TOKEN(END_OF_FILE); }

 /*
  * Closing brackets end an operand, so they switch to AFTER_OPERAND before
  * returning. The `*` rules below use that state to tell a binary `*` from a
  * unary one.
  */
{RIGHT_PARENTHESIS} {
  BEGIN(AFTER_OPERAND);
  return CARBON_SIMPLE_TOKEN(RIGHT_PARENTHESIS);
}
{RIGHT_CURLY_BRACE} {
  BEGIN(AFTER_OPERAND);
  return CARBON_SIMPLE_TOKEN(RIGHT_CURLY_BRACE);
}
{RIGHT_SQUARE_BRACKET} {
  BEGIN(AFTER_OPERAND);
  return CARBON_SIMPLE_TOKEN(RIGHT_SQUARE_BRACKET);
}

 /*
  * For a `*` operator, we look at whitespace and local context to determine the
  * arity and fixity. There are two ways to write a binary operator:
  *
  * 1) Whitespace on both sides.
  * 2) Whitespace on neither side, and the previous token is considered to be
  *    the end of an operand, and the next token is considered to be the start
  *    of an operand.
  *
  * Otherwise, the operator is unary, but we also check for whitespace to help
  * the parser enforce the rule that whitespace is not permitted between the
  * operator and its operand, leading to three more cases:
  *
  * 3) Whitespace before (but implicitly not after, because that would give a
  *    longer match and hit case 1): this can only be a prefix operator.
  * 4) Whitespace after and not before: this can only be a postfix operator.
  * 5) No whitespace on either side (otherwise the longest match would take us
  *    to case 4): this is a unary operator and could be either prefix or
  *    postfix.
  *
  * "Whitespace before" and "previous token ends an operand" are tracked with
  * the AFTER_WHITESPACE and AFTER_OPERAND start conditions, which other rules
  * in this file enter with BEGIN.
  *
  * NOTE(review): no rule in this file switches back to INITIAL; presumably
  * that is done by code outside this file (for example a YY_USER_ACTION
  * definition in one of the included helpers) -- confirm.
  */

 /* `*` operator case 1: */
<AFTER_WHITESPACE>"*"{whitespace}+ {
  // The trailing whitespace is consumed as part of this match, so record it
  // for whatever token comes next.
  BEGIN(AFTER_WHITESPACE);
  return CARBON_SIMPLE_TOKEN(BINARY_STAR);
}
 /*
  * `*` operator case 2. The operand start is trailing context: it must be
  * present for the rule to match but is not consumed.
  */
<AFTER_OPERAND>"*"/{operand_start} { return CARBON_SIMPLE_TOKEN(BINARY_STAR); }
 /* `*` operator case 3: */
<AFTER_WHITESPACE>"*" { return CARBON_SIMPLE_TOKEN(PREFIX_STAR); }
 /* `*` operator case 4: */
<INITIAL,AFTER_OPERAND>"*"{whitespace}+ {
  // As in case 1, the whitespace after the `*` has already been consumed.
  BEGIN(AFTER_WHITESPACE);
  return CARBON_SIMPLE_TOKEN(POSTFIX_STAR);
}
 /* `*` operator case 5: */
<INITIAL,AFTER_OPERAND>"*" { return CARBON_SIMPLE_TOKEN(UNARY_STAR); }

 /*
  * Sized type literals such as `i32`. The text also matches {identifier}; this
  * rule wins the tie because it is listed first. The token carries the
  * matched source text.
  */
{sized_type_literal} {
  // A type literal can be the left operand of a `*`.
  BEGIN(AFTER_OPERAND);
  return CARBON_ARG_TOKEN(sized_type_literal, yytext);
}

{intrinsic_identifier} {
  BEGIN(AFTER_OPERAND);
  Carbon::ErrorOr<Carbon::IntrinsicExpression::Intrinsic> intrinsic =
      Carbon::IntrinsicExpression::FindIntrinsic(yytext, context.source_loc());
  if (intrinsic.ok()) {
    return CARBON_ARG_TOKEN(intrinsic_identifier, *intrinsic);
  } else {
    return context.RecordSyntaxError(std::move(intrinsic).error());
  }
}

 /*
  * Ordinary identifiers. Keywords, sized type literals, and intrinsic names
  * also match this pattern; their rules appear earlier in the file and so win
  * when the match lengths are equal. The token carries the matched text.
  */
{identifier} {
  // An identifier can be the left operand of a `*`.
  BEGIN(AFTER_OPERAND);
  return CARBON_ARG_TOKEN(identifier, yytext);
}

{integer_literal} {
  BEGIN(AFTER_OPERAND);
  int val = 0;
  if (!llvm::to_integer(yytext, val)) {
    return context.RecordSyntaxError(
        llvm::formatv("Invalid integer literal: {0}", yytext));
  }
  return CARBON_ARG_TOKEN(integer_literal, val);
}

 /*
  * String literals. The pattern matches only the opening delimiter: zero or
  * more `#` followed by either three quotes (block string) or one quote. The
  * rest of the literal is pulled from the input one character at a time
  * through StringLexHelper until the matching closing delimiter is found.
  *
  * NOTE(review): the comments below assume that StringLexHelper::str() starts
  * out as yytext and grows by one character per successful Advance(), that
  * Advance() returns false and sets is_eof() at end of input, and that
  * ReadHashTags() reads up to hashtag_num characters and returns true only if
  * all of them are `#`. Those helpers are defined outside this file -- confirm.
  */
#*(\"\"\"|\") {
  // Raw string literal.
  // yytext (the token that matches the above regex) and chars scanned by
  // str_lex_helper hold the source text, not the string the source represents.
  Carbon::StringLexHelper str_lex_helper(yytext, yyscanner, context);
  const std::string& s = str_lex_helper.str();
  // The delimiter is all `#`s followed by all quotes, so the index of the
  // first quote is the number of `#`s and the remainder is the quote count.
  const int hashtag_num = s.find_first_of('"');
  const int leading_quotes = s.size() - hashtag_num;
  if (leading_quotes == 3 && hashtag_num > 0) {
    // Check if it's a single-line string, like #"""#.
    // TODO: Extend with other single-line string cases, like #""""#, based on
    // the definition of block string in the design doc.
    if (Carbon::ReadHashTags(str_lex_helper, hashtag_num)) {
      return Carbon::ProcessSingleLineString(str_lex_helper.str(), context,
                                             hashtag_num);
    } else if (str_lex_helper.is_eof()) {
      return CARBON_SIMPLE_TOKEN(END_OF_FILE);
    }
    // Otherwise fall through to the scanning loop, which dispatches on the
    // character that ReadHashTags stopped at.
  } else if (!str_lex_helper.Advance()) {
    // Input ended right after the opening delimiter.
    return CARBON_SIMPLE_TOKEN(END_OF_FILE);
  }
  // 3 quotes indicates multi-line, otherwise it'll be one.
  const bool multi_line = leading_quotes == 3;

  // Each iteration dispatches on the most recently read character. Branches
  // that `continue` have already read a new character and want it dispatched
  // on next; the `continue` also covers the case where that read hit end of
  // input, since the loop condition is rechecked.
  while (!str_lex_helper.is_eof()) {
    switch (str_lex_helper.last_char()) {
      case '\n':  // Fall through.
      case '\v':  // Fall through.
      case '\f':  // Fall through.
      case '\r':
        // A vertical-whitespace character is only legal inside a block string.
        if (!multi_line) {
          return context.RecordSyntaxError(
              llvm::formatv("missing closing quote in single-line string: {0}",
                            str_lex_helper.str()));
        }
        str_lex_helper.Advance();
        break;
      case '"':
        if (multi_line) {
          // Check for 2 more '"'s on block string.
          if (!(str_lex_helper.Advance() &&
                str_lex_helper.last_char() == '"')) {
            continue;
          }
          if (!(str_lex_helper.Advance() &&
                str_lex_helper.last_char() == '"')) {
            continue;
          }
          // Now we are at the last " of """.
        }

        // The quotes only close the literal when followed by the same number
        // of `#`s that opened it.
        if (Carbon::ReadHashTags(str_lex_helper, hashtag_num)) {
          // Reached the closing delimiter: hand the accumulated source text to
          // the matching processor and return its token.
          if (leading_quotes == 3) {
            return Carbon::ProcessMultiLineString(str_lex_helper.str(), context,
                                                  hashtag_num);
          } else {
            return Carbon::ProcessSingleLineString(str_lex_helper.str(),
                                                   context, hashtag_num);
          }
        }
        // Not a closing delimiter; keep scanning from the last character read.
        break;
      case '\\':
        // A backslash only starts an escape when followed by hashtag_num `#`s.
        if (Carbon::ReadHashTags(str_lex_helper, hashtag_num)) {
          // Read the escaped char.
          if (!str_lex_helper.Advance()) {
            continue;
          }
          // Read the next char. The escaped character itself is thereby never
          // dispatched on, so an escaped quote cannot close the literal.
          str_lex_helper.Advance();
        }
        break;
      default:
        str_lex_helper.Advance();
    }
  }
  // The loop only exits without returning when input ended inside the literal.
  return CARBON_SIMPLE_TOKEN(END_OF_FILE);
}

{one_line_comment} {
  // Advance end by 1 line, resetting the column to zero.
  context.current_token_position.lines(1);
  // Make the span empty by setting start to end.
  context.current_token_position.step();
}

{horizontal_whitespace}+ {
  // Make the span empty by setting start to end.
  context.current_token_position.step();
  BEGIN(AFTER_WHITESPACE);
}

\n+ {
  // Advance end by yyleng lines, resetting the column to zero.
  context.current_token_position.lines(yyleng);
  // Make the span empty by setting start to end.
  context.current_token_position.step();
  BEGIN(AFTER_WHITESPACE);
}

. {
  return context.RecordSyntaxError(
      llvm::formatv("invalid character '\\x{0}' in source file.",
                    llvm::toHex(llvm::StringRef(yytext, 1))));
}

%%

// Reads the next input character from the given scanner by forwarding to the
// generated scanner's yyinput and returning its result unchanged.
// NOTE(review): presumably exists so helpers outside this file (such as the
// string-literal scanner) can pull characters from the input -- confirm.
auto YyinputWrapper(yyscan_t yyscanner) -> int {
  const int next_char = yyinput(yyscanner);
  return next_char;
}