Commit 6683cf3

Switch token_infos_ to a ValueStore (#5633)
Split out `TokenInfo` to be able to easily write `using ValueType = TokenInfo;` on `TokenIndex`. Also fixes a small type issue on `ValueStore` that affected `mapped_iterator` behavior when writing `old_tokens_it->first < next_offset`.
1 parent 78d4cce commit 6683cf3
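To make the motivation concrete, here is a minimal sketch of the pattern this commit enables, under the assumption that `ValueStore<IdT>` derives its element type from a `ValueType` member on the ID type. The name `ValueStoreSketch` and the `std::vector` backing are illustrative, not Carbon's actual implementation.

#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical sketch of a store keyed by an ID type that carries its
// element type as `IdT::ValueType`. With `TokenIndex::ValueType` set to
// `TokenInfo`, a declaration like `ValueStore<TokenIndex> token_infos_;`
// can resolve its stored type from the index type alone. Assumes `IdT` is
// constructible from an index and exposes an `index` member.
template <typename IdT>
class ValueStoreSketch {
 public:
  using ValueType = typename IdT::ValueType;

  // Appends a value and returns the ID that now refers to it.
  auto Add(ValueType value) -> IdT {
    values_.push_back(std::move(value));
    return IdT(static_cast<int32_t>(values_.size() - 1));
  }

  // Looks up a value by its ID.
  auto Get(IdT id) const -> const ValueType& { return values_[id.index]; }

 private:
  std::vector<ValueType> values_;
};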

7 files changed: +252 -220 lines changed

toolchain/base/value_store.h
Lines changed: 4 additions & 1 deletion

@@ -156,7 +156,10 @@ class ValueStore
   //   for (auto [id, value] : store.enumerate()) { ... }
   //   ```
   auto enumerate() const [[clang::lifetimebound]] -> auto {
-    auto index_to_id = [&](int32_t i) -> std::pair<IdT, ConstRefType> {
+    // For `it->val`, writing `const std::pair` is required; otherwise
+    // `mapped_iterator` incorrectly infers the pointer type for `PointerProxy`.
+    // NOLINTNEXTLINE(readability-const-return-type)
+    auto index_to_id = [&](int32_t i) -> const std::pair<IdT, ConstRefType> {
       return std::pair<IdT, ConstRefType>(IdT(i), Get(IdT(i)));
     };
     // Because indices into `ValueStore` are all sequential values from 0, we
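For context on the `PointerProxy` comment in that hunk, below is a minimal sketch of the proxy-pointer pattern that iterators like `llvm::mapped_iterator` rely on; the classes and names are illustrative, not LLVM's actual implementation. When `operator*` materializes a temporary (such as the `std::pair` built by `index_to_id`), `operator->` cannot hand out a raw pointer to that temporary, so a proxy keeps a copy alive for the duration of the `->` expression. The commit's fix makes the lambda's declared return type agree with the pointer type the proxy machinery deduces.

#include <cstdint>
#include <utility>

// Sketch (assumed, not LLVM's code) of a proxy that owns a copy of a
// materialized value so `it->first` has an object to point into.
template <typename T>
class PointerProxySketch {
 public:
  explicit PointerProxySketch(T value) : value_(std::move(value)) {}

  // `->` chains through this overload until a raw pointer is produced.
  auto operator->() const -> const T* { return &value_; }

 private:
  T value_;
};

// Illustrative iterator-like type that materializes pairs on demand, as
// `enumerate()` does. Usage: `PairSourceSketch it{3}; auto x = it->first;`
struct PairSourceSketch {
  int32_t i = 0;
  auto operator*() const -> const std::pair<int32_t, int32_t> {
    return {i, i * 2};
  }
  auto operator->() const -> PointerProxySketch<std::pair<int32_t, int32_t>> {
    return PointerProxySketch<std::pair<int32_t, int32_t>>(**this);
  }
};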

toolchain/lex/BUILD
Lines changed: 13 additions & 0 deletions

@@ -222,6 +222,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "token_info",
+    hdrs = ["token_info.h"],
+    deps = [
+        ":token_index",
+        ":token_kind",
+        "//common:check",
+        "//toolchain/base:int",
+        "//toolchain/base:value_ids",
+    ],
+)
+
 cc_library(
     name = "tokenized_buffer",
     srcs = ["tokenized_buffer.cpp"],
@@ -232,6 +244,7 @@ cc_library(
         ":numeric_literal",
         ":string_literal",
         ":token_index",
+        ":token_info",
         ":token_kind",
         "//common:check",
         "//common:ostream",

toolchain/lex/lex.cpp
Lines changed: 27 additions & 26 deletions

@@ -53,8 +53,6 @@ namespace Carbon::Lex {
 // `TokenizedBuffer` or undermining the performance constraints of the lexer.
 class [[clang::internal_linkage]] Lexer {
  public:
-  using TokenInfo = TokenizedBuffer::TokenInfo;
-
   // Symbolic result of a lexing action. This indicates whether we successfully
   // lexed a token, or whether other lexing actions should be attempted.
   //
@@ -1225,10 +1223,10 @@ auto Lexer::LexClosingSymbolToken(llvm::StringRef source_text, TokenKind kind,
   TokenIndex token =
       LexTokenWithPayload(kind, opening_token.index, byte_offset);
 
-  auto& opening_token_info = buffer_.GetTokenInfo(opening_token);
+  auto& opening_token_info = buffer_.token_infos_.Get(opening_token);
   if (LLVM_UNLIKELY(opening_token_info.kind() != kind.opening_symbol())) {
     has_mismatched_brackets_ = true;
-    buffer_.GetTokenInfo(token).set_opening_token_index(TokenIndex::None);
+    buffer_.token_infos_.Get(token).set_opening_token_index(TokenIndex::None);
     return token;
   }
 
@@ -1376,7 +1374,8 @@ auto Lexer::LexHash(llvm::StringRef source_text, ssize_t& position)
 
   // Look for the `r` token. Note that this is always in bounds because we
   // create a start of file token.
-  auto& prev_token_info = buffer_.token_infos_.back();
+  auto& prev_token_info =
+      buffer_.token_infos_.Get(TokenIndex(buffer_.token_infos_.size() - 1));
 
   // If the previous token isn't the identifier `r`, or the character after `#`
   // isn't the start of an identifier, this is not a raw identifier.
@@ -1534,7 +1533,7 @@ class Lexer::ErrorRecoveryBuffer {
     // Find the end of the token before the target token, and add the new token
     // there.
     TokenIndex insert_after(insert_before.index - 1);
-    const auto& prev_info = buffer_->GetTokenInfo(insert_after);
+    const auto& prev_info = buffer_->token_infos_.Get(insert_after);
     int32_t byte_offset =
         prev_info.byte_offset() + buffer_->GetTokenText(insert_after).size();
     new_tokens_.push_back(
@@ -1544,30 +1543,32 @@ class Lexer::ErrorRecoveryBuffer {
   // Replace the given token with an error token. We do this immediately,
   // because we don't benefit from buffering it.
   auto ReplaceWithError(TokenIndex token) -> void {
-    auto& token_info = buffer_->GetTokenInfo(token);
+    auto& token_info = buffer_->token_infos_.Get(token);
     int error_length = buffer_->GetTokenText(token).size();
     token_info.ResetAsError(error_length);
     any_error_tokens_ = true;
   }
 
   // Merge the recovery tokens into the token list of the tokenized buffer.
   auto Apply() -> void {
-    auto old_tokens = std::move(buffer_->token_infos_);
-    buffer_->token_infos_.clear();
+    ValueStore<TokenIndex> old_tokens =
+        std::exchange(buffer_->token_infos_, {});
     int new_size = old_tokens.size() + new_tokens_.size();
-    buffer_->token_infos_.reserve(new_size);
+    buffer_->token_infos_.Reserve(new_size);
     buffer_->recovery_tokens_.resize(new_size);
 
-    int old_tokens_offset = 0;
+    auto old_tokens_range = old_tokens.enumerate();
+    auto old_tokens_it = old_tokens_range.begin();
     for (auto [next_offset, info] : new_tokens_) {
-      buffer_->token_infos_.append(old_tokens.begin() + old_tokens_offset,
-                                   old_tokens.begin() + next_offset.index);
+      for (; old_tokens_it->first < next_offset; ++old_tokens_it) {
+        buffer_->token_infos_.Add(old_tokens_it->second);
+      }
       buffer_->AddToken(info);
       buffer_->recovery_tokens_.set(next_offset.index);
-      old_tokens_offset = next_offset.index;
     }
-    buffer_->token_infos_.append(old_tokens.begin() + old_tokens_offset,
-                                 old_tokens.end());
+    for (; old_tokens_it != old_tokens_range.end(); ++old_tokens_it) {
+      buffer_->token_infos_.Add(old_tokens_it->second);
+    }
   }
 
   // Perform bracket matching to fix cross-references between tokens. This must
@@ -1583,12 +1584,12 @@ class Lexer::ErrorRecoveryBuffer {
     CARBON_CHECK(!open_groups.empty(), "Failed to balance brackets");
     auto opening_token = open_groups.pop_back_val();
 
-    CARBON_CHECK(
-        kind ==
-            buffer_->GetTokenInfo(opening_token).kind().closing_symbol(),
-        "Failed to balance brackets");
-    auto& opening_token_info = buffer_->GetTokenInfo(opening_token);
-    auto& closing_token_info = buffer_->GetTokenInfo(token);
+    CARBON_CHECK(kind == buffer_->token_infos_.Get(opening_token)
+                             .kind()
+                             .closing_symbol(),
+                 "Failed to balance brackets");
+    auto& opening_token_info = buffer_->token_infos_.Get(opening_token);
+    auto& closing_token_info = buffer_->token_infos_.Get(token);
     opening_token_info.set_closing_token_index(token);
     closing_token_info.set_opening_token_index(opening_token);
   }
@@ -1601,8 +1602,7 @@ class Lexer::ErrorRecoveryBuffer {
   // A list of tokens to insert into the token stream to fix mismatched
   // brackets. The first element in each pair is the original token index to
   // insert the new token before.
-  llvm::SmallVector<std::pair<TokenIndex, TokenizedBuffer::TokenInfo>>
-      new_tokens_;
+  llvm::SmallVector<std::pair<TokenIndex, TokenInfo>> new_tokens_;
 
   // Whether we have changed any tokens into error tokens.
   bool any_error_tokens_ = false;
@@ -1652,8 +1652,9 @@ auto Lexer::DiagnoseAndFixMismatchedBrackets() -> void {
   // Find the innermost matching opening symbol.
   auto opening_it = llvm::find_if(
       llvm::reverse(open_groups_), [&](TokenIndex opening_token) {
-        return buffer_.GetTokenInfo(opening_token).kind().closing_symbol() ==
-               kind;
+        return buffer_.token_infos_.Get(opening_token)
+                   .kind()
+                   .closing_symbol() == kind;
       });
   if (opening_it == open_groups_.rend()) {
     CARBON_DIAGNOSTIC(
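The new `Apply()` body replaces the old two-range `append` with a single streaming merge over `enumerate()`. Below is a standalone model of that merge logic, with `int` standing in for `TokenInfo` and plain indices for `TokenIndex`; the function name and simplified types are illustrative only.

#include <cstdint>
#include <utility>
#include <vector>

// Standalone model of the streaming merge: `old_tokens` are the original
// token infos (ints as stand-ins), and `new_tokens` holds pairs of
// (original index to insert before, recovery token info), in index order.
auto MergeRecoveryTokens(const std::vector<int>& old_tokens,
                         const std::vector<std::pair<int32_t, int>>& new_tokens)
    -> std::vector<int> {
  std::vector<int> merged;
  merged.reserve(old_tokens.size() + new_tokens.size());
  int32_t old_index = 0;
  for (const auto& [insert_before, info] : new_tokens) {
    // Stream old tokens up to the insertion point, mirroring the
    // `old_tokens_it->first < next_offset` loop in `Apply()`.
    for (; old_index < insert_before; ++old_index) {
      merged.push_back(old_tokens[old_index]);
    }
    merged.push_back(info);
  }
  // Stream any old tokens that remain after the last insertion point.
  for (; old_index < static_cast<int32_t>(old_tokens.size()); ++old_index) {
    merged.push_back(old_tokens[old_index]);
  }
  return merged;
}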

toolchain/lex/token_index.h
Lines changed: 4 additions & 0 deletions

@@ -10,6 +10,8 @@
 
 namespace Carbon::Lex {
 
+class TokenInfo;
+
 // A lightweight handle to a lexed token in a `TokenizedBuffer`.
 //
 // `TokenIndex` objects are designed to be passed by value, not reference or
@@ -24,6 +26,8 @@ namespace Carbon::Lex {
 //
 // All other APIs to query a `TokenIndex` are on the `TokenizedBuffer`.
 struct TokenIndex : public IndexBase<TokenIndex> {
+  using ValueType = TokenInfo;
+
   // The number of bits which must be allotted for `TokenIndex`.
   static constexpr int Bits = 23;
   // The maximum number of tokens that can be stored, including the FileStart
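Note that the forward declaration of `TokenInfo` is sufficient here: a type alias does not require a complete type, so `token_index.h` can name `TokenInfo` without including `token_info.h` (which includes `token_index.h` back). A minimal illustration with hypothetical names:

#include <cstdint>

class Widget;  // Forward declaration only; no definition needed yet.

struct WidgetId {
  // OK: aliasing an incomplete type is allowed. The complete type is only
  // required where `ValueType` is actually used to declare storage, e.g.
  // inside a `ValueStore<WidgetId>` instantiation.
  using ValueType = Widget;

  int32_t index;
};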

toolchain/lex/token_info.h
Lines changed: 181 additions & 0 deletions (new file)

// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
#define CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_

#include "common/check.h"
#include "toolchain/base/int.h"
#include "toolchain/base/value_ids.h"
#include "toolchain/lex/token_index.h"
#include "toolchain/lex/token_kind.h"

namespace Carbon::Lex {

// Storage for the information about a specific token, as an implementation
// detail of `TokenizedBuffer`.
//
// This provides a friendly accessor API to the carefully space-optimized
// storage model of the information we associated with each token.
//
// There are four pieces of information stored here:
// - The kind of the token.
// - Whether that token has leading whitespace before it.
// - A kind-specific payload that can be compressed into a small integer.
//   - This class provides dedicated accessors for each different form of
//     payload that check the kind and payload correspond correctly.
// - A 32-bit byte offset of the token within the source text.
//
// These are compressed and stored in 8-bytes for each token.
//
// Note that while the class provides some limited setters for payloads and
// mutating methods, setters on this type may be unexpectedly expensive due to
// the bit-packed representation and should be avoided. As such, only the
// minimal necessary setters are provided.
//
// TODO: It might be worth considering a struct-of-arrays data layout in order
// to move the byte offset to a separate array from the rest as it is only hot
// during lexing, and then cold during parsing and semantic analysis. However,
// a trivial approach to that adds more overhead than it saves due to tracking
// two separate vectors and their growth. Making this profitable would likely
// at least require a highly specialized single vector that manages the growth
// once and then provides separate storage areas for the two arrays.
class TokenInfo {
 public:
  // The kind for this token.
  auto kind() const -> TokenKind { return kind_; }

  // Whether this token is preceded by whitespace. We only store the preceding
  // state, and look at the next token to check for trailing whitespace.
  auto has_leading_space() const -> bool { return has_leading_space_; }

  // A collection of methods to access the specific payload included with
  // particular kinds of tokens. Only the specific payload accessor below may
  // be used for an info entry of a token with a particular kind, and these
  // check that the kind is valid. Some tokens do not include a payload at all
  // and none of these methods may be called.
  auto ident_id() const -> IdentifierId {
    CARBON_DCHECK(kind() == TokenKind::Identifier);
    return IdentifierId(token_payload_);
  }
  auto set_ident_id(IdentifierId ident_id) -> void {
    CARBON_DCHECK(kind() == TokenKind::Identifier);
    token_payload_ = ident_id.index;
  }

  auto string_literal_id() const -> StringLiteralValueId {
    CARBON_DCHECK(kind() == TokenKind::StringLiteral);
    return StringLiteralValueId(token_payload_);
  }

  auto int_id() const -> IntId {
    CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
                  kind() == TokenKind::IntTypeLiteral ||
                  kind() == TokenKind::UnsignedIntTypeLiteral ||
                  kind() == TokenKind::FloatTypeLiteral);
    return IntId::MakeFromTokenPayload(token_payload_);
  }

  auto real_id() const -> RealId {
    CARBON_DCHECK(kind() == TokenKind::RealLiteral);
    return RealId(token_payload_);
  }

  auto closing_token_index() const -> TokenIndex {
    CARBON_DCHECK(kind().is_opening_symbol());
    return TokenIndex(token_payload_);
  }
  auto set_closing_token_index(TokenIndex closing_index) -> void {
    CARBON_DCHECK(kind().is_opening_symbol());
    token_payload_ = closing_index.index;
  }

  auto opening_token_index() const -> TokenIndex {
    CARBON_DCHECK(kind().is_closing_symbol());
    return TokenIndex(token_payload_);
  }
  auto set_opening_token_index(TokenIndex opening_index) -> void {
    CARBON_DCHECK(kind().is_closing_symbol());
    token_payload_ = opening_index.index;
  }

  auto error_length() const -> int {
    CARBON_DCHECK(kind() == TokenKind::Error);
    return token_payload_;
  }

  // Zero-based byte offset of the token within the file. This can be combined
  // with the buffer's line information to locate the line and column of the
  // token as well.
  auto byte_offset() const -> int32_t { return byte_offset_; }

  // Transforms the token into an error token of the given length but at its
  // original position and with the same whitespace adjacency.
  auto ResetAsError(int error_length) -> void {
    // Construct a fresh token to establish any needed invariants and replace
    // this token with it.
    TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
                    byte_offset());
    *this = error;
  }

 private:
  friend class Lexer;

  static constexpr int PayloadBits = 23;

  // Make sure we have enough payload bits to represent token-associated IDs.
  static_assert(PayloadBits >= IntId::TokenIdBits);
  static_assert(PayloadBits >= TokenIndex::Bits);

  // Constructor for a TokenKind that carries no payload, or where the payload
  // will be set later.
  //
  // Only used by the lexer which enforces only the correct kinds are used.
  //
  // When the payload is not being set, we leave it uninitialized. At least in
  // some cases, this will allow MSan to correctly detect erroneous attempts
  // to access the payload, as it works to track uninitialized memory
  // bit-for-bit specifically to handle complex cases like bitfields.
  TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
      : kind_(kind),
        has_leading_space_(has_leading_space),
        byte_offset_(byte_offset) {}

  // Constructor for a TokenKind that carries a payload.
  //
  // Only used by the lexer which enforces the correct kind and payload types.
  TokenInfo(TokenKind kind, bool has_leading_space, int payload,
            int32_t byte_offset)
      : kind_(kind),
        has_leading_space_(has_leading_space),
        token_payload_(payload),
        byte_offset_(byte_offset) {}

  // A bitfield that encodes the token's kind, the leading space flag, and the
  // remaining bits in a payload. These are encoded together as a bitfield for
  // density and because these are the hottest fields of tokens for consumers
  // after lexing.
  //
  // Payload values are typically ID types for which we create at most one per
  // token, so we ensure that `token_payload_` is large enough to fit any
  // token index. Stores to this field may overflow, but we produce an error
  // in `Lexer::Finalize` if the file has more than `TokenIndex::Max` tokens,
  // so this value never overflows if lexing succeeds.
  TokenKind kind_;
  static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits");
  bool has_leading_space_ : 1;
  unsigned token_payload_ : PayloadBits;

  // Separate storage for the byte offset, this is hot while lexing but then
  // generally cold.
  int32_t byte_offset_;
};

static_assert(sizeof(TokenInfo) == 8,
              "Expected `TokenInfo` to pack to an 8-byte structure.");

}  // namespace Carbon::Lex

#endif  // CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
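As a worked check of the packing described in the comments above: 8 bits of kind, 1 bit of leading-space flag, and 23 bits of payload share one 32-bit word, and the 32-bit byte offset fills the second word, for 8 bytes per token. A standalone mirror of that layout, with a stand-in for `TokenKind` and assuming Itanium-style bitfield packing as used by Clang and GCC (MSVC packs bitfields of differing types into separate units, so the assertion may fail there):

#include <cstdint>

// Stand-in for `TokenKind`'s 8-bit representation.
enum class KindSketch : uint8_t {};

struct TokenInfoLayoutSketch {
  KindSketch kind;                  // 8 bits
  bool has_leading_space : 1;       // 1 bit, packed after the kind byte
  unsigned token_payload : 23;      // 23 bits, completing the first word
  int32_t byte_offset;              // second 32-bit word
};

static_assert(sizeof(TokenInfoLayoutSketch) == 8,
              "8 + 1 + 23 bits pack into 4 bytes, plus a 4-byte offset");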
