// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
#define CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_

#include <compare>
#include <cstdint>
#include <iterator>
#include <utility>

#include "common/ostream.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/raw_ostream.h"
#include "toolchain/base/index_base.h"
#include "toolchain/base/mem_usage.h"
#include "toolchain/base/value_store.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lex/token_index.h"
#include "toolchain/lex/token_kind.h"
#include "toolchain/source/source_buffer.h"

namespace Carbon::Lex {

class TokenizedBuffer;

// A lightweight handle to a lexed line in a `TokenizedBuffer`.
//
// `LineIndex` objects are designed to be passed by value, not reference or
// pointer. They are also designed to be small and efficient to store in data
// structures.
//
// Each `LineIndex` object refers to a specific line in the source code that
// was lexed. `LineIndex` objects can be compared directly to establish whether
// they refer to the same line or the relative position of different lines
// within the source.
//
// All other APIs to query a `LineIndex` are on the `TokenizedBuffer`.
struct LineIndex : public IndexBase {
  static const LineIndex Invalid;

  using IndexBase::IndexBase;
};

constexpr LineIndex LineIndex::Invalid(LineIndex::InvalidIndex);
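
// Illustrative sketch (editorial addition, assuming at least two lines were
// lexed): `LineIndex` values compare by position, so ordering checks need no
// buffer access.
//
//   LineIndex first(0);
//   LineIndex second(1);
//   CARBON_CHECK(first < second);
//   CARBON_CHECK(first != LineIndex::Invalid);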

// Random-access iterator over tokens within the buffer.
class TokenIterator
    : public llvm::iterator_facade_base<TokenIterator,
                                        std::random_access_iterator_tag,
                                        const TokenIndex, int>,
      public Printable<TokenIterator> {
 public:
  TokenIterator() = delete;

  explicit TokenIterator(TokenIndex token) : token_(token) {}

  auto operator==(const TokenIterator& rhs) const -> bool {
    return token_ == rhs.token_;
  }
  auto operator<=>(const TokenIterator& rhs) const -> std::strong_ordering {
    return token_ <=> rhs.token_;
  }

  auto operator*() const -> const TokenIndex& { return token_; }

  using iterator_facade_base::operator-;
  auto operator-(const TokenIterator& rhs) const -> int {
    return token_.index - rhs.token_.index;
  }

  auto operator+=(int n) -> TokenIterator& {
    token_.index += n;
    return *this;
  }
  auto operator-=(int n) -> TokenIterator& {
    token_.index -= n;
    return *this;
  }

  // Prints the raw token index.
  auto Print(llvm::raw_ostream& output) const -> void;

 private:
  friend class TokenizedBuffer;

  TokenIndex token_;
};
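
// Illustrative sketch (editorial addition): `TokenIterator` supports standard
// random-access iterator arithmetic over `TokenIndex` values. Assumes `buffer`
// is a lexed `TokenizedBuffer` with at least three tokens.
//
//   auto range = buffer.tokens();
//   TokenIterator it = range.begin();
//   it += 2;                            // Jump forward two tokens.
//   int distance = it - range.begin();  // distance == 2.
//   TokenIndex third = *it;             // Usable with buffer.GetKind(third).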

// A diagnostic location converter that maps token locations into source
// buffer locations.
class TokenDiagnosticConverter : public DiagnosticConverter<TokenIndex> {
 public:
  explicit TokenDiagnosticConverter(const TokenizedBuffer* buffer)
      : buffer_(buffer) {}

  // Map the given token into a diagnostic location.
  auto ConvertLoc(TokenIndex token, ContextFnT context_fn) const
      -> DiagnosticLoc override;

 private:
  const TokenizedBuffer* buffer_;
};

// A buffer of tokenized Carbon source code.
//
// This is constructed by lexing the source code text into a series of tokens.
// The buffer provides lightweight handles to tokens and other lexed entities,
// as well as iterators to walk the sequence of tokens found in the buffer.
//
// Lexing errors result in a potentially incomplete sequence of tokens and
// `has_errors` returning true.
class TokenizedBuffer : public Printable<TokenizedBuffer> {
 public:
  auto GetKind(TokenIndex token) const -> TokenKind;
  auto GetLine(TokenIndex token) const -> LineIndex;

  // Returns the 1-based line number.
  auto GetLineNumber(TokenIndex token) const -> int;
  // Returns the 1-based column number.
  auto GetColumnNumber(TokenIndex token) const -> int;

  // Returns the line and 1-based column number of the first character after
  // this token.
  auto GetEndLoc(TokenIndex token) const -> std::pair<LineIndex, int>;

  // Returns the source text lexed into this token.
  auto GetTokenText(TokenIndex token) const -> llvm::StringRef;

  // Returns the identifier associated with this token. The token kind must
  // be an `Identifier`.
  auto GetIdentifier(TokenIndex token) const -> IdentifierId;

  // Returns the value of an `IntLiteral()` token.
  auto GetIntLiteral(TokenIndex token) const -> IntId;

  // Returns the value of a `RealLiteral()` token.
  auto GetRealLiteral(TokenIndex token) const -> RealId;

  // Returns the value of a `StringLiteral()` token.
  auto GetStringLiteralValue(TokenIndex token) const -> StringLiteralValueId;

  // Returns the size specified in a `*TypeLiteral()` token.
  auto GetTypeLiteralSize(TokenIndex token) const -> IntId;

  // Returns the closing token matched with the given opening token.
  //
  // The given token must be an opening token kind.
  auto GetMatchedClosingToken(TokenIndex opening_token) const -> TokenIndex;

  // Returns the opening token matched with the given closing token.
  //
  // The given token must be a closing token kind.
  auto GetMatchedOpeningToken(TokenIndex closing_token) const -> TokenIndex;

  // Returns whether the given token has leading whitespace.
  auto HasLeadingWhitespace(TokenIndex token) const -> bool;
  // Returns whether the given token has trailing whitespace.
  auto HasTrailingWhitespace(TokenIndex token) const -> bool;

  // Returns whether the token was created as part of an error recovery
  // effort.
  //
  // For example, a closing paren inserted to match an unmatched paren.
  auto IsRecoveryToken(TokenIndex token) const -> bool;

  // Returns the 1-based line number.
  auto GetLineNumber(LineIndex line) const -> int;
  // Returns the 1-based indentation column number.
  auto GetIndentColumnNumber(LineIndex line) const -> int;
  // Returns the next line handle.
  auto GetNextLine(LineIndex line) const -> LineIndex;
  // Returns the previous line handle.
  auto GetPrevLine(LineIndex line) const -> LineIndex;

  // Prints a description of the tokenized stream to the provided
  // `raw_ostream`.
  //
  // It prints one line of information for each token in the buffer, including
  // the kind of token, where it occurs within the source file, indentation
  // for the associated line, the spelling of the token in source, and any
  // additional information tracked such as which unique identifier it is or
  // any matched grouping token.
  //
  // Each line is formatted as a YAML record:
  //
  // clang-format off
  // ```
  // token: { index: 0, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  // ```
  // clang-format on
  //
  // This can be parsed as YAML using tools like `python-yq` combined with
  // `jq` on the command line. The format is also reasonably amenable to other
  // line-oriented shell tools from `grep` to `awk`.
  auto Print(llvm::raw_ostream& output_stream) const -> void;

  // Prints a description of a single token. See `Print` for details on the
  // format.
  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token) const
      -> void;

  // Collects memory usage of members.
  auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
      -> void;

  // Returns true if the buffer has errors that were detected at lexing time.
  auto has_errors() const -> bool { return has_errors_; }

  auto tokens() const -> llvm::iterator_range<TokenIterator> {
    return llvm::make_range(TokenIterator(TokenIndex(0)),
                            TokenIterator(TokenIndex(token_infos_.size())));
  }
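
  // Illustrative sketch (editorial addition): the range above works directly
  // with range-based `for`. Assumes `buffer` is a lexed `TokenizedBuffer`.
  //
  //   for (TokenIndex token : buffer.tokens()) {
  //     if (buffer.GetKind(token) == TokenKind::Identifier) {
  //       buffer.PrintToken(llvm::errs(), token);
  //     }
  //   }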

  auto size() const -> int { return token_infos_.size(); }

  // This is an upper bound on the number of output parse nodes in the absence
  // of errors.
  auto expected_max_parse_tree_size() const -> int {
    return expected_max_parse_tree_size_;
  }

  auto source() const -> const SourceBuffer& { return *source_; }

 private:
  friend class Lexer;
  friend class TokenDiagnosticConverter;

  // A diagnostic location converter that maps positions within the source
  // buffer into diagnostic locations.
  class SourceBufferDiagnosticConverter
      : public DiagnosticConverter<const char*> {
   public:
    explicit SourceBufferDiagnosticConverter(const TokenizedBuffer* buffer)
        : buffer_(buffer) {}

    // Map the given position within the source buffer into a diagnostic
    // location.
    auto ConvertLoc(const char* loc, ContextFnT context_fn) const
        -> DiagnosticLoc override;

   private:
    const TokenizedBuffer* buffer_;
  };

  // Specifies minimum widths to use when printing a token's fields via
  // `PrintToken`.
  struct PrintWidths {
    // Widens `this` to the maximum of `this` and `widths` for each
    // dimension.
    auto Widen(const PrintWidths& widths) -> void;

    int index;
    int kind;
    int line;
    int column;
    int indent;
  };

  // Storage for the information about a specific token in the buffer.
  //
  // This provides a friendly accessor API to the carefully space-optimized
  // storage model of the information we associate with each token.
  //
  // There are four pieces of information stored here:
  // - The kind of the token.
  // - Whether that token has leading whitespace before it.
  // - A kind-specific payload that can be compressed into a small integer.
  //   - This class provides dedicated accessors for each different form of
  //     payload that check the kind and payload correspond correctly.
  // - A 32-bit byte offset of the token within the source text.
  //
  // These are compressed and stored in 8 bytes for each token.
  //
  // Note that while the class provides some limited setters for payloads and
  // mutating methods, setters on this type may be unexpectedly expensive due
  // to the bit-packed representation and should be avoided. As such, only the
  // minimal necessary setters are provided.
  //
  // TODO: It might be worth considering a struct-of-arrays data layout in
  // order to move the byte offset to a separate array from the rest, as it is
  // only hot during lexing and then cold during parsing and semantic
  // analysis. However, a trivial approach to that adds more overhead than it
  // saves due to tracking two separate vectors and their growth. Making this
  // profitable would likely at least require a highly specialized single
  // vector that manages the growth once and then provides separate storage
  // areas for the two arrays.
  class TokenInfo {
   public:
    // The kind for this token.
    auto kind() const -> TokenKind { return TokenKind::Make(kind_); }

    // Whether this token is preceded by whitespace. We only store the
    // preceding state, and look at the next token to check for trailing
    // whitespace.
    auto has_leading_space() const -> bool { return has_leading_space_; }

    // A collection of methods to access the specific payload included with
    // particular kinds of tokens. Only the specific payload accessor below
    // may be used for an info entry of a token with a particular kind, and
    // these check that the kind is valid. Some tokens do not include a
    // payload at all and none of these methods may be called.
    auto ident_id() const -> IdentifierId {
      CARBON_DCHECK(kind() == TokenKind::Identifier);
      return IdentifierId(token_payload_);
    }
    auto set_ident_id(IdentifierId ident_id) -> void {
      CARBON_DCHECK(kind() == TokenKind::Identifier);
      CARBON_DCHECK(ident_id.index < (1 << PayloadBits));
      token_payload_ = ident_id.index;
    }

    auto string_literal_id() const -> StringLiteralValueId {
      CARBON_DCHECK(kind() == TokenKind::StringLiteral);
      return StringLiteralValueId(token_payload_);
    }

    auto int_id() const -> IntId {
      CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
                    kind() == TokenKind::IntTypeLiteral ||
                    kind() == TokenKind::UnsignedIntTypeLiteral ||
                    kind() == TokenKind::FloatTypeLiteral);
      return IntId(token_payload_);
    }

    auto real_id() const -> RealId {
      CARBON_DCHECK(kind() == TokenKind::RealLiteral);
      return RealId(token_payload_);
    }

    auto closing_token_index() const -> TokenIndex {
      CARBON_DCHECK(kind().is_opening_symbol());
      return TokenIndex(token_payload_);
    }
    auto set_closing_token_index(TokenIndex closing_index) -> void {
      CARBON_DCHECK(kind().is_opening_symbol());
      CARBON_DCHECK(closing_index.index < (1 << PayloadBits));
      token_payload_ = closing_index.index;
    }

    auto opening_token_index() const -> TokenIndex {
      CARBON_DCHECK(kind().is_closing_symbol());
      return TokenIndex(token_payload_);
    }
    auto set_opening_token_index(TokenIndex opening_index) -> void {
      CARBON_DCHECK(kind().is_closing_symbol());
      CARBON_DCHECK(opening_index.index < (1 << PayloadBits));
      token_payload_ = opening_index.index;
    }

    auto error_length() const -> int {
      CARBON_DCHECK(kind() == TokenKind::Error);
      return token_payload_;
    }

    // Zero-based byte offset of the token within the file. This can be
    // combined with the buffer's line information to locate the line and
    // column of the token as well.
    auto byte_offset() const -> int32_t { return byte_offset_; }

    // Transforms the token into an error token of the given length but at its
    // original position and with the same whitespace adjacency.
    auto ResetAsError(int error_length) -> void {
      // Construct a fresh token to establish any needed invariants and
      // replace this token with it.
      TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
                      byte_offset());
      *this = error;
    }
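
    // Illustrative sketch (editorial addition): each payload accessor is only
    // valid for its matching kind, which the `CARBON_DCHECK`s above enforce
    // in debug builds. Assumes `info` is the `TokenInfo` for some token.
    //
    //   if (info.kind() == TokenKind::Identifier) {
    //     IdentifierId id = info.ident_id();  // OK: kind matches accessor.
    //     // Calling info.int_id() here would trip the kind check instead.
    //   }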

   private:
    friend class Lexer;

    static constexpr int PayloadBits = 23;

    // Constructor for a TokenKind that carries no payload, or where the
    // payload will be set later.
    //
    // Only used by the lexer, which enforces that only the correct kinds are
    // used.
    //
    // When the payload is not being set, we leave it uninitialized. At least
    // in some cases, this will allow MSan to correctly detect erroneous
    // attempts to access the payload, as it works to track uninitialized
    // memory bit-for-bit specifically to handle complex cases like bitfields.
    TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
        : kind_(kind),
          has_leading_space_(has_leading_space),
          byte_offset_(byte_offset) {}

    // Constructor for a TokenKind that carries a payload.
    //
    // Only used by the lexer, which enforces the correct kind and payload
    // types.
    TokenInfo(TokenKind kind, bool has_leading_space, int payload,
              int32_t byte_offset)
        : kind_(kind),
          has_leading_space_(has_leading_space),
          token_payload_(payload),
          byte_offset_(byte_offset) {
      CARBON_DCHECK(payload >= 0 && payload < (1 << PayloadBits),
                    "Payload won't fit into unsigned bit pack: {0}", payload);
    }

    // A bitfield that encodes the token's kind, the leading space flag, and
    // the remaining bits in a payload. These are encoded together as a
    // bitfield for density, and because these are the hottest fields of
    // tokens for consumers after lexing.
    TokenKind::RawEnumType kind_ : sizeof(TokenKind) * 8;
    bool has_leading_space_ : 1;
    unsigned token_payload_ : PayloadBits;

    // Separate storage for the byte offset; this is hot while lexing but
    // generally cold afterwards.
    int32_t byte_offset_;
  };

  static_assert(sizeof(TokenInfo) == 8,
                "Expected `TokenInfo` to pack to an 8-byte structure.");

  struct LineInfo {
    explicit LineInfo(int32_t start) : start(start), indent(0) {}

    // Zero-based byte offset of the start of the line within the source
    // buffer provided.
    int32_t start;

    // The byte offset from the start of the line of the first non-whitespace
    // character.
    int32_t indent;
  };
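
  // Illustrative sketch (editorial addition): for a line `  var x;` that
  // begins at byte 100 of the source buffer, `start` would be 100 and
  // `indent` would be 2, the offset of the first non-whitespace character
  // (`v`).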

  // The constructor is merely responsible for trivial initialization of
  // members. A working object of this type is built with `Lex::Lex` so that
  // its return can indicate whether an error was encountered while lexing.
  explicit TokenizedBuffer(SharedValueStores& value_stores,
                           SourceBuffer& source)
      : value_stores_(&value_stores), source_(&source) {}

  auto FindLineIndex(int32_t byte_offset) const -> LineIndex;

  auto GetLineInfo(LineIndex line) -> LineInfo&;
  auto GetLineInfo(LineIndex line) const -> const LineInfo&;
  auto AddLine(LineInfo info) -> LineIndex;

  auto GetTokenInfo(TokenIndex token) -> TokenInfo&;
  auto GetTokenInfo(TokenIndex token) const -> const TokenInfo&;
  auto AddToken(TokenInfo info) -> TokenIndex;

  auto GetTokenPrintWidths(TokenIndex token) const -> PrintWidths;
  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
                  PrintWidths widths) const -> void;

  // Used to allocate computed string literals.
  llvm::BumpPtrAllocator allocator_;

  SharedValueStores* value_stores_;
  SourceBuffer* source_;

  llvm::SmallVector<TokenInfo> token_infos_;
  llvm::SmallVector<LineInfo> line_infos_;

  // An upper bound on the number of parse tree nodes that we expect to be
  // created for the tokens in this buffer.
  int expected_max_parse_tree_size_ = 0;

  bool has_errors_ = false;

  // A vector of flags for recovery tokens. If empty, there are none. When
  // doing token recovery, this will be extended to be indexable by token
  // indices and contain true for the tokens that were synthesized for
  // recovery.
  llvm::BitVector recovery_tokens_;
};

// A diagnostic emitter that uses positions within a source buffer's text as
// its source of location information.
using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;

// A diagnostic emitter that uses tokens as its source of location information.
using TokenDiagnosticEmitter = DiagnosticEmitter<TokenIndex>;

inline auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
  return GetTokenInfo(token).kind();
}

inline auto TokenizedBuffer::HasLeadingWhitespace(TokenIndex token) const
    -> bool {
  return GetTokenInfo(token).has_leading_space();
}

inline auto TokenizedBuffer::HasTrailingWhitespace(TokenIndex token) const
    -> bool {
  TokenIterator it(token);
  ++it;
  return it != tokens().end() && GetTokenInfo(*it).has_leading_space();
}
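
// Editorial note: trailing whitespace is derived rather than stored. For
// source text `a b`, the token for `b` records leading whitespace, so
// `HasTrailingWhitespace` on the token for `a` returns true by peeking at its
// successor, as above.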

inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) -> TokenInfo& {
  return token_infos_[token.index];
}

inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) const
    -> const TokenInfo& {
  return token_infos_[token.index];
}

inline auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
  TokenIndex index(token_infos_.size());
  token_infos_.push_back(info);
  expected_max_parse_tree_size_ += info.kind().expected_max_parse_tree_size();
  return index;
}

}  // namespace Carbon::Lex

#endif  // CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_