Commit 6683cf3

Switch token_infos_ to a ValueStore (#5633)
Split out `TokenInfo` to be able to easily write `using ValueType = TokenInfo;` on `TokenIndex`. Also fixes a small type issue on `ValueStore` that affected `mapped_iterator` behavior when writing `old_tokens_it->first < next_offset`.
1 parent 78d4cce commit 6683cf3
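To make the motivation concrete, here is a minimal sketch of the pattern this commit enables, under the assumption that `ValueStore<IdT>` derives its element type from a `ValueType` member on the ID type. The name `ValueStoreSketch` and the `std::vector` backing are illustrative, not Carbon's actual implementation.

#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical sketch of a store keyed by an ID type that carries its
// element type as `IdT::ValueType`. With `TokenIndex::ValueType` set to
// `TokenInfo`, a declaration like `ValueStore<TokenIndex> token_infos_;`
// can resolve its stored type from the index type alone. Assumes `IdT` is
// constructible from an index and exposes an `index` member.
template <typename IdT>
class ValueStoreSketch {
 public:
  using ValueType = typename IdT::ValueType;

  // Appends a value and returns the ID that now refers to it.
  auto Add(ValueType value) -> IdT {
    values_.push_back(std::move(value));
    return IdT(static_cast<int32_t>(values_.size() - 1));
  }

  // Looks up a value by its ID.
  auto Get(IdT id) const -> const ValueType& { return values_[id.index]; }

 private:
  std::vector<ValueType> values_;
};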

7 files changed: +252 -220 lines changed

toolchain/base/value_store.h
Lines changed: 4 additions & 1 deletion

@@ -156,7 +156,10 @@ class ValueStore
   //   for (auto [id, value] : store.enumerate()) { ... }
   //   ```
   auto enumerate() const [[clang::lifetimebound]] -> auto {
-    auto index_to_id = [&](int32_t i) -> std::pair<IdT, ConstRefType> {
+    // For `it->val`, writing `const std::pair` is required; otherwise
+    // `mapped_iterator` incorrectly infers the pointer type for `PointerProxy`.
+    // NOLINTNEXTLINE(readability-const-return-type)
+    auto index_to_id = [&](int32_t i) -> const std::pair<IdT, ConstRefType> {
       return std::pair<IdT, ConstRefType>(IdT(i), Get(IdT(i)));
     };
     // Because indices into `ValueStore` are all sequential values from 0, we
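For context on the `PointerProxy` comment in that hunk, below is a minimal sketch of the proxy-pointer pattern that iterators like `llvm::mapped_iterator` rely on; the classes and names are illustrative, not LLVM's actual implementation. When `operator*` materializes a temporary (such as the `std::pair` built by `index_to_id`), `operator->` cannot hand out a raw pointer to that temporary, so a proxy keeps a copy alive for the duration of the `->` expression. The commit's fix makes the lambda's declared return type agree with the pointer type the proxy machinery deduces.

#include <cstdint>
#include <utility>

// Sketch (assumed, not LLVM's code) of a proxy that owns a copy of a
// materialized value so `it->first` has an object to point into.
template <typename T>
class PointerProxySketch {
 public:
  explicit PointerProxySketch(T value) : value_(std::move(value)) {}

  // `->` chains through this overload until a raw pointer is produced.
  auto operator->() const -> const T* { return &value_; }

 private:
  T value_;
};

// Illustrative iterator-like type that materializes pairs on demand, as
// `enumerate()` does. Usage: `PairSourceSketch it{3}; auto x = it->first;`
struct PairSourceSketch {
  int32_t i = 0;
  auto operator*() const -> const std::pair<int32_t, int32_t> {
    return {i, i * 2};
  }
  auto operator->() const -> PointerProxySketch<std::pair<int32_t, int32_t>> {
    return PointerProxySketch<std::pair<int32_t, int32_t>>(**this);
  }
};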

toolchain/lex/BUILD
Lines changed: 13 additions & 0 deletions

@@ -222,6 +222,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "token_info",
+    hdrs = ["token_info.h"],
+    deps = [
+        ":token_index",
+        ":token_kind",
+        "//common:check",
+        "//toolchain/base:int",
+        "//toolchain/base:value_ids",
+    ],
+)
+
 cc_library(
     name = "tokenized_buffer",
     srcs = ["tokenized_buffer.cpp"],
@@ -232,6 +244,7 @@ cc_library(
         ":numeric_literal",
         ":string_literal",
         ":token_index",
+        ":token_info",
         ":token_kind",
         "//common:check",
         "//common:ostream",

toolchain/lex/lex.cpp
Lines changed: 27 additions & 26 deletions

@@ -53,8 +53,6 @@ namespace Carbon::Lex {
 // `TokenizedBuffer` or undermining the performance constraints of the lexer.
 class [[clang::internal_linkage]] Lexer {
  public:
-  using TokenInfo = TokenizedBuffer::TokenInfo;
-
   // Symbolic result of a lexing action. This indicates whether we successfully
   // lexed a token, or whether other lexing actions should be attempted.
   //
@@ -1225,10 +1223,10 @@ auto Lexer::LexClosingSymbolToken(llvm::StringRef source_text, TokenKind kind,
   TokenIndex token =
       LexTokenWithPayload(kind, opening_token.index, byte_offset);
 
-  auto& opening_token_info = buffer_.GetTokenInfo(opening_token);
+  auto& opening_token_info = buffer_.token_infos_.Get(opening_token);
   if (LLVM_UNLIKELY(opening_token_info.kind() != kind.opening_symbol())) {
     has_mismatched_brackets_ = true;
-    buffer_.GetTokenInfo(token).set_opening_token_index(TokenIndex::None);
+    buffer_.token_infos_.Get(token).set_opening_token_index(TokenIndex::None);
     return token;
   }
 
@@ -1376,7 +1374,8 @@ auto Lexer::LexHash(llvm::StringRef source_text, ssize_t& position)
 
   // Look for the `r` token. Note that this is always in bounds because we
   // create a start of file token.
-  auto& prev_token_info = buffer_.token_infos_.back();
+  auto& prev_token_info =
+      buffer_.token_infos_.Get(TokenIndex(buffer_.token_infos_.size() - 1));
 
   // If the previous token isn't the identifier `r`, or the character after `#`
   // isn't the start of an identifier, this is not a raw identifier.
@@ -1534,7 +1533,7 @@ class Lexer::ErrorRecoveryBuffer {
     // Find the end of the token before the target token, and add the new token
     // there.
     TokenIndex insert_after(insert_before.index - 1);
-    const auto& prev_info = buffer_->GetTokenInfo(insert_after);
+    const auto& prev_info = buffer_->token_infos_.Get(insert_after);
     int32_t byte_offset =
         prev_info.byte_offset() + buffer_->GetTokenText(insert_after).size();
     new_tokens_.push_back(
@@ -1544,30 +1543,32 @@ class Lexer::ErrorRecoveryBuffer {
   // Replace the given token with an error token. We do this immediately,
   // because we don't benefit from buffering it.
   auto ReplaceWithError(TokenIndex token) -> void {
-    auto& token_info = buffer_->GetTokenInfo(token);
+    auto& token_info = buffer_->token_infos_.Get(token);
     int error_length = buffer_->GetTokenText(token).size();
     token_info.ResetAsError(error_length);
     any_error_tokens_ = true;
   }
 
   // Merge the recovery tokens into the token list of the tokenized buffer.
   auto Apply() -> void {
-    auto old_tokens = std::move(buffer_->token_infos_);
-    buffer_->token_infos_.clear();
+    ValueStore<TokenIndex> old_tokens =
+        std::exchange(buffer_->token_infos_, {});
     int new_size = old_tokens.size() + new_tokens_.size();
-    buffer_->token_infos_.reserve(new_size);
+    buffer_->token_infos_.Reserve(new_size);
     buffer_->recovery_tokens_.resize(new_size);
 
-    int old_tokens_offset = 0;
+    auto old_tokens_range = old_tokens.enumerate();
+    auto old_tokens_it = old_tokens_range.begin();
     for (auto [next_offset, info] : new_tokens_) {
-      buffer_->token_infos_.append(old_tokens.begin() + old_tokens_offset,
-                                   old_tokens.begin() + next_offset.index);
+      for (; old_tokens_it->first < next_offset; ++old_tokens_it) {
+        buffer_->token_infos_.Add(old_tokens_it->second);
+      }
       buffer_->AddToken(info);
       buffer_->recovery_tokens_.set(next_offset.index);
-      old_tokens_offset = next_offset.index;
     }
-    buffer_->token_infos_.append(old_tokens.begin() + old_tokens_offset,
-                                 old_tokens.end());
+    for (; old_tokens_it != old_tokens_range.end(); ++old_tokens_it) {
+      buffer_->token_infos_.Add(old_tokens_it->second);
+    }
   }
 
   // Perform bracket matching to fix cross-references between tokens. This must
@@ -1583,12 +1584,12 @@ class Lexer::ErrorRecoveryBuffer {
     CARBON_CHECK(!open_groups.empty(), "Failed to balance brackets");
     auto opening_token = open_groups.pop_back_val();
 
-    CARBON_CHECK(
-        kind ==
-            buffer_->GetTokenInfo(opening_token).kind().closing_symbol(),
-        "Failed to balance brackets");
-    auto& opening_token_info = buffer_->GetTokenInfo(opening_token);
-    auto& closing_token_info = buffer_->GetTokenInfo(token);
+    CARBON_CHECK(kind == buffer_->token_infos_.Get(opening_token)
+                             .kind()
+                             .closing_symbol(),
+                 "Failed to balance brackets");
+    auto& opening_token_info = buffer_->token_infos_.Get(opening_token);
+    auto& closing_token_info = buffer_->token_infos_.Get(token);
     opening_token_info.set_closing_token_index(token);
     closing_token_info.set_opening_token_index(opening_token);
   }
@@ -1601,8 +1602,7 @@ class Lexer::ErrorRecoveryBuffer {
   // A list of tokens to insert into the token stream to fix mismatched
   // brackets. The first element in each pair is the original token index to
   // insert the new token before.
-  llvm::SmallVector<std::pair<TokenIndex, TokenizedBuffer::TokenInfo>>
-      new_tokens_;
+  llvm::SmallVector<std::pair<TokenIndex, TokenInfo>> new_tokens_;
 
   // Whether we have changed any tokens into error tokens.
   bool any_error_tokens_ = false;
@@ -1652,8 +1652,9 @@ auto Lexer::DiagnoseAndFixMismatchedBrackets() -> void {
   // Find the innermost matching opening symbol.
   auto opening_it = llvm::find_if(
       llvm::reverse(open_groups_), [&](TokenIndex opening_token) {
-        return buffer_.GetTokenInfo(opening_token).kind().closing_symbol() ==
-               kind;
+        return buffer_.token_infos_.Get(opening_token)
+                   .kind()
+                   .closing_symbol() == kind;
       });
   if (opening_it == open_groups_.rend()) {
     CARBON_DIAGNOSTIC(
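The new `Apply()` body replaces the old two-range `append` with a single streaming merge over `enumerate()`. Below is a standalone model of that merge logic, with `int` standing in for `TokenInfo` and plain indices for `TokenIndex`; the function name and simplified types are illustrative only.

#include <cstdint>
#include <utility>
#include <vector>

// Standalone model of the streaming merge: `old_tokens` are the original
// token infos (ints as stand-ins), and `new_tokens` holds pairs of
// (original index to insert before, recovery token info), in index order.
auto MergeRecoveryTokens(const std::vector<int>& old_tokens,
                         const std::vector<std::pair<int32_t, int>>& new_tokens)
    -> std::vector<int> {
  std::vector<int> merged;
  merged.reserve(old_tokens.size() + new_tokens.size());
  int32_t old_index = 0;
  for (const auto& [insert_before, info] : new_tokens) {
    // Stream old tokens up to the insertion point, mirroring the
    // `old_tokens_it->first < next_offset` loop in `Apply()`.
    for (; old_index < insert_before; ++old_index) {
      merged.push_back(old_tokens[old_index]);
    }
    merged.push_back(info);
  }
  // Stream any old tokens that remain after the last insertion point.
  for (; old_index < static_cast<int32_t>(old_tokens.size()); ++old_index) {
    merged.push_back(old_tokens[old_index]);
  }
  return merged;
}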

toolchain/lex/token_index.h
Lines changed: 4 additions & 0 deletions

@@ -10,6 +10,8 @@
 
 namespace Carbon::Lex {
 
+class TokenInfo;
+
 // A lightweight handle to a lexed token in a `TokenizedBuffer`.
 //
 // `TokenIndex` objects are designed to be passed by value, not reference or
@@ -24,6 +26,8 @@ namespace Carbon::Lex {
 //
 // All other APIs to query a `TokenIndex` are on the `TokenizedBuffer`.
 struct TokenIndex : public IndexBase<TokenIndex> {
+  using ValueType = TokenInfo;
+
   // The number of bits which must be allotted for `TokenIndex`.
   static constexpr int Bits = 23;
   // The maximum number of tokens that can be stored, including the FileStart
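Note that the forward declaration of `TokenInfo` is sufficient here: a type alias does not require a complete type, so `token_index.h` can name `TokenInfo` without including `token_info.h` (which includes `token_index.h` back). A minimal illustration with hypothetical names:

#include <cstdint>

class Widget;  // Forward declaration only; no definition needed yet.

struct WidgetId {
  // OK: aliasing an incomplete type is allowed. The complete type is only
  // required where `ValueType` is actually used to declare storage, e.g.
  // inside a `ValueStore<WidgetId>` instantiation.
  using ValueType = Widget;

  int32_t index;
};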

toolchain/lex/token_info.h
Lines changed: 181 additions & 0 deletions (new file)

// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
#define CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_

#include "common/check.h"
#include "toolchain/base/int.h"
#include "toolchain/base/value_ids.h"
#include "toolchain/lex/token_index.h"
#include "toolchain/lex/token_kind.h"

namespace Carbon::Lex {

// Storage for the information about a specific token, as an implementation
// detail of `TokenizedBuffer`.
//
// This provides a friendly accessor API to the carefully space-optimized
// storage model of the information we associated with each token.
//
// There are four pieces of information stored here:
// - The kind of the token.
// - Whether that token has leading whitespace before it.
// - A kind-specific payload that can be compressed into a small integer.
//   - This class provides dedicated accessors for each different form of
//     payload that check the kind and payload correspond correctly.
// - A 32-bit byte offset of the token within the source text.
//
// These are compressed and stored in 8-bytes for each token.
//
// Note that while the class provides some limited setters for payloads and
// mutating methods, setters on this type may be unexpectedly expensive due to
// the bit-packed representation and should be avoided. As such, only the
// minimal necessary setters are provided.
//
// TODO: It might be worth considering a struct-of-arrays data layout in order
// to move the byte offset to a separate array from the rest as it is only hot
// during lexing, and then cold during parsing and semantic analysis. However,
// a trivial approach to that adds more overhead than it saves due to tracking
// two separate vectors and their growth. Making this profitable would likely
// at least require a highly specialized single vector that manages the growth
// once and then provides separate storage areas for the two arrays.
class TokenInfo {
 public:
  // The kind for this token.
  auto kind() const -> TokenKind { return kind_; }

  // Whether this token is preceded by whitespace. We only store the preceding
  // state, and look at the next token to check for trailing whitespace.
  auto has_leading_space() const -> bool { return has_leading_space_; }

  // A collection of methods to access the specific payload included with
  // particular kinds of tokens. Only the specific payload accessor below may
  // be used for an info entry of a token with a particular kind, and these
  // check that the kind is valid. Some tokens do not include a payload at all
  // and none of these methods may be called.
  auto ident_id() const -> IdentifierId {
    CARBON_DCHECK(kind() == TokenKind::Identifier);
    return IdentifierId(token_payload_);
  }
  auto set_ident_id(IdentifierId ident_id) -> void {
    CARBON_DCHECK(kind() == TokenKind::Identifier);
    token_payload_ = ident_id.index;
  }

  auto string_literal_id() const -> StringLiteralValueId {
    CARBON_DCHECK(kind() == TokenKind::StringLiteral);
    return StringLiteralValueId(token_payload_);
  }

  auto int_id() const -> IntId {
    CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
                  kind() == TokenKind::IntTypeLiteral ||
                  kind() == TokenKind::UnsignedIntTypeLiteral ||
                  kind() == TokenKind::FloatTypeLiteral);
    return IntId::MakeFromTokenPayload(token_payload_);
  }

  auto real_id() const -> RealId {
    CARBON_DCHECK(kind() == TokenKind::RealLiteral);
    return RealId(token_payload_);
  }

  auto closing_token_index() const -> TokenIndex {
    CARBON_DCHECK(kind().is_opening_symbol());
    return TokenIndex(token_payload_);
  }
  auto set_closing_token_index(TokenIndex closing_index) -> void {
    CARBON_DCHECK(kind().is_opening_symbol());
    token_payload_ = closing_index.index;
  }

  auto opening_token_index() const -> TokenIndex {
    CARBON_DCHECK(kind().is_closing_symbol());
    return TokenIndex(token_payload_);
  }
  auto set_opening_token_index(TokenIndex opening_index) -> void {
    CARBON_DCHECK(kind().is_closing_symbol());
    token_payload_ = opening_index.index;
  }

  auto error_length() const -> int {
    CARBON_DCHECK(kind() == TokenKind::Error);
    return token_payload_;
  }

  // Zero-based byte offset of the token within the file. This can be combined
  // with the buffer's line information to locate the line and column of the
  // token as well.
  auto byte_offset() const -> int32_t { return byte_offset_; }

  // Transforms the token into an error token of the given length but at its
  // original position and with the same whitespace adjacency.
  auto ResetAsError(int error_length) -> void {
    // Construct a fresh token to establish any needed invariants and replace
    // this token with it.
    TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
                    byte_offset());
    *this = error;
  }

 private:
  friend class Lexer;

  static constexpr int PayloadBits = 23;

  // Make sure we have enough payload bits to represent token-associated IDs.
  static_assert(PayloadBits >= IntId::TokenIdBits);
  static_assert(PayloadBits >= TokenIndex::Bits);

  // Constructor for a TokenKind that carries no payload, or where the payload
  // will be set later.
  //
  // Only used by the lexer which enforces only the correct kinds are used.
  //
  // When the payload is not being set, we leave it uninitialized. At least in
  // some cases, this will allow MSan to correctly detect erroneous attempts
  // to access the payload, as it works to track uninitialized memory
  // bit-for-bit specifically to handle complex cases like bitfields.
  TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
      : kind_(kind),
        has_leading_space_(has_leading_space),
        byte_offset_(byte_offset) {}

  // Constructor for a TokenKind that carries a payload.
  //
  // Only used by the lexer which enforces the correct kind and payload types.
  TokenInfo(TokenKind kind, bool has_leading_space, int payload,
            int32_t byte_offset)
      : kind_(kind),
        has_leading_space_(has_leading_space),
        token_payload_(payload),
        byte_offset_(byte_offset) {}

  // A bitfield that encodes the token's kind, the leading space flag, and the
  // remaining bits in a payload. These are encoded together as a bitfield for
  // density and because these are the hottest fields of tokens for consumers
  // after lexing.
  //
  // Payload values are typically ID types for which we create at most one per
  // token, so we ensure that `token_payload_` is large enough to fit any
  // token index. Stores to this field may overflow, but we produce an error
  // in `Lexer::Finalize` if the file has more than `TokenIndex::Max` tokens,
  // so this value never overflows if lexing succeeds.
  TokenKind kind_;
  static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits");
  bool has_leading_space_ : 1;
  unsigned token_payload_ : PayloadBits;

  // Separate storage for the byte offset, this is hot while lexing but then
  // generally cold.
  int32_t byte_offset_;
};

static_assert(sizeof(TokenInfo) == 8,
              "Expected `TokenInfo` to pack to an 8-byte structure.");

}  // namespace Carbon::Lex

#endif  // CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
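As a worked check of the packing described in the comments above: 8 bits of kind, 1 bit of leading-space flag, and 23 bits of payload share one 32-bit word, and the 32-bit byte offset fills the second word, for 8 bytes per token. A standalone mirror of that layout, with a stand-in for `TokenKind` and assuming Itanium-style bitfield packing as used by Clang and GCC (MSVC packs bitfields of differing types into separate units, so the assertion may fail there):

#include <cstdint>

// Stand-in for `TokenKind`'s 8-bit representation.
enum class KindSketch : uint8_t {};

struct TokenInfoLayoutSketch {
  KindSketch kind;                  // 8 bits
  bool has_leading_space : 1;       // 1 bit, packed after the kind byte
  unsigned token_payload : 23;      // 23 bits, completing the first word
  int32_t byte_offset;              // second 32-bit word
};

static_assert(sizeof(TokenInfoLayoutSketch) == 8,
              "8 + 1 + 23 bits pack into 4 bytes, plus a 4-byte offset");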
