|
| 1 | +// Part of the Carbon Language project, under the Apache License v2.0 with LLVM |
| 2 | +// Exceptions. See /LICENSE for license information. |
| 3 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 4 | + |
| 5 | +#ifndef CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_ |
| 6 | +#define CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_ |
| 7 | + |
| 8 | +#include "common/check.h" |
| 9 | +#include "toolchain/base/int.h" |
| 10 | +#include "toolchain/base/value_ids.h" |
| 11 | +#include "toolchain/lex/token_index.h" |
| 12 | +#include "toolchain/lex/token_kind.h" |
| 13 | + |
| 14 | +namespace Carbon::Lex { |
| 15 | + |
| 16 | +// Storage for the information about a specific token, as an implementation |
| 17 | +// detail of `TokenizedBuffer`. |
| 18 | +// |
| 19 | +// This provides a friendly accessor API to the carefully space-optimized |
| 20 | +// storage model of the information we associated with each token. |
| 21 | +// |
| 22 | +// There are four pieces of information stored here: |
| 23 | +// - The kind of the token. |
| 24 | +// - Whether that token has leading whitespace before it. |
| 25 | +// - A kind-specific payload that can be compressed into a small integer. |
| 26 | +// - This class provides dedicated accessors for each different form of |
| 27 | +// payload that check the kind and payload correspond correctly. |
| 28 | +// - A 32-bit byte offset of the token within the source text. |
| 29 | +// |
| 30 | +// These are compressed and stored in 8-bytes for each token. |
| 31 | +// |
| 32 | +// Note that while the class provides some limited setters for payloads and |
| 33 | +// mutating methods, setters on this type may be unexpectedly expensive due to |
| 34 | +// the bit-packed representation and should be avoided. As such, only the |
| 35 | +// minimal necessary setters are provided. |
| 36 | +// |
| 37 | +// TODO: It might be worth considering a struct-of-arrays data layout in order |
| 38 | +// to move the byte offset to a separate array from the rest as it is only hot |
| 39 | +// during lexing, and then cold during parsing and semantic analysis. However, |
| 40 | +// a trivial approach to that adds more overhead than it saves due to tracking |
| 41 | +// two separate vectors and their growth. Making this profitable would likely |
| 42 | +// at least require a highly specialized single vector that manages the growth |
| 43 | +// once and then provides separate storage areas for the two arrays. |
| 44 | +class TokenInfo { |
| 45 | + public: |
| 46 | + // The kind for this token. |
| 47 | + auto kind() const -> TokenKind { return kind_; } |
| 48 | + |
| 49 | + // Whether this token is preceded by whitespace. We only store the preceding |
| 50 | + // state, and look at the next token to check for trailing whitespace. |
| 51 | + auto has_leading_space() const -> bool { return has_leading_space_; } |
| 52 | + |
| 53 | + // A collection of methods to access the specific payload included with |
| 54 | + // particular kinds of tokens. Only the specific payload accessor below may |
| 55 | + // be used for an info entry of a token with a particular kind, and these |
| 56 | + // check that the kind is valid. Some tokens do not include a payload at all |
| 57 | + // and none of these methods may be called. |
| 58 | + auto ident_id() const -> IdentifierId { |
| 59 | + CARBON_DCHECK(kind() == TokenKind::Identifier); |
| 60 | + return IdentifierId(token_payload_); |
| 61 | + } |
| 62 | + auto set_ident_id(IdentifierId ident_id) -> void { |
| 63 | + CARBON_DCHECK(kind() == TokenKind::Identifier); |
| 64 | + token_payload_ = ident_id.index; |
| 65 | + } |
| 66 | + |
| 67 | + auto string_literal_id() const -> StringLiteralValueId { |
| 68 | + CARBON_DCHECK(kind() == TokenKind::StringLiteral); |
| 69 | + return StringLiteralValueId(token_payload_); |
| 70 | + } |
| 71 | + |
| 72 | + auto int_id() const -> IntId { |
| 73 | + CARBON_DCHECK(kind() == TokenKind::IntLiteral || |
| 74 | + kind() == TokenKind::IntTypeLiteral || |
| 75 | + kind() == TokenKind::UnsignedIntTypeLiteral || |
| 76 | + kind() == TokenKind::FloatTypeLiteral); |
| 77 | + return IntId::MakeFromTokenPayload(token_payload_); |
| 78 | + } |
| 79 | + |
| 80 | + auto real_id() const -> RealId { |
| 81 | + CARBON_DCHECK(kind() == TokenKind::RealLiteral); |
| 82 | + return RealId(token_payload_); |
| 83 | + } |
| 84 | + |
| 85 | + auto closing_token_index() const -> TokenIndex { |
| 86 | + CARBON_DCHECK(kind().is_opening_symbol()); |
| 87 | + return TokenIndex(token_payload_); |
| 88 | + } |
| 89 | + auto set_closing_token_index(TokenIndex closing_index) -> void { |
| 90 | + CARBON_DCHECK(kind().is_opening_symbol()); |
| 91 | + token_payload_ = closing_index.index; |
| 92 | + } |
| 93 | + |
| 94 | + auto opening_token_index() const -> TokenIndex { |
| 95 | + CARBON_DCHECK(kind().is_closing_symbol()); |
| 96 | + return TokenIndex(token_payload_); |
| 97 | + } |
| 98 | + auto set_opening_token_index(TokenIndex opening_index) -> void { |
| 99 | + CARBON_DCHECK(kind().is_closing_symbol()); |
| 100 | + token_payload_ = opening_index.index; |
| 101 | + } |
| 102 | + |
| 103 | + auto error_length() const -> int { |
| 104 | + CARBON_DCHECK(kind() == TokenKind::Error); |
| 105 | + return token_payload_; |
| 106 | + } |
| 107 | + |
| 108 | + // Zero-based byte offset of the token within the file. This can be combined |
| 109 | + // with the buffer's line information to locate the line and column of the |
| 110 | + // token as well. |
| 111 | + auto byte_offset() const -> int32_t { return byte_offset_; } |
| 112 | + |
| 113 | + // Transforms the token into an error token of the given length but at its |
| 114 | + // original position and with the same whitespace adjacency. |
| 115 | + auto ResetAsError(int error_length) -> void { |
| 116 | + // Construct a fresh token to establish any needed invariants and replace |
| 117 | + // this token with it. |
| 118 | + TokenInfo error(TokenKind::Error, has_leading_space(), error_length, |
| 119 | + byte_offset()); |
| 120 | + *this = error; |
| 121 | + } |
| 122 | + |
| 123 | + private: |
| 124 | + friend class Lexer; |
| 125 | + |
| 126 | + static constexpr int PayloadBits = 23; |
| 127 | + |
| 128 | + // Make sure we have enough payload bits to represent token-associated IDs. |
| 129 | + static_assert(PayloadBits >= IntId::TokenIdBits); |
| 130 | + static_assert(PayloadBits >= TokenIndex::Bits); |
| 131 | + |
| 132 | + // Constructor for a TokenKind that carries no payload, or where the payload |
| 133 | + // will be set later. |
| 134 | + // |
| 135 | + // Only used by the lexer which enforces only the correct kinds are used. |
| 136 | + // |
| 137 | + // When the payload is not being set, we leave it uninitialized. At least in |
| 138 | + // some cases, this will allow MSan to correctly detect erroneous attempts |
| 139 | + // to access the payload, as it works to track uninitialized memory |
| 140 | + // bit-for-bit specifically to handle complex cases like bitfields. |
| 141 | + TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset) |
| 142 | + : kind_(kind), |
| 143 | + has_leading_space_(has_leading_space), |
| 144 | + byte_offset_(byte_offset) {} |
| 145 | + |
| 146 | + // Constructor for a TokenKind that carries a payload. |
| 147 | + // |
| 148 | + // Only used by the lexer which enforces the correct kind and payload types. |
| 149 | + TokenInfo(TokenKind kind, bool has_leading_space, int payload, |
| 150 | + int32_t byte_offset) |
| 151 | + : kind_(kind), |
| 152 | + has_leading_space_(has_leading_space), |
| 153 | + token_payload_(payload), |
| 154 | + byte_offset_(byte_offset) {} |
| 155 | + |
| 156 | + // A bitfield that encodes the token's kind, the leading space flag, and the |
| 157 | + // remaining bits in a payload. These are encoded together as a bitfield for |
| 158 | + // density and because these are the hottest fields of tokens for consumers |
| 159 | + // after lexing. |
| 160 | + // |
| 161 | + // Payload values are typically ID types for which we create at most one per |
| 162 | + // token, so we ensure that `token_payload_` is large enough to fit any |
| 163 | + // token index. Stores to this field may overflow, but we produce an error |
| 164 | + // in `Lexer::Finalize` if the file has more than `TokenIndex::Max` tokens, |
| 165 | + // so this value never overflows if lexing succeeds. |
| 166 | + TokenKind kind_; |
| 167 | + static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits"); |
| 168 | + bool has_leading_space_ : 1; |
| 169 | + unsigned token_payload_ : PayloadBits; |
| 170 | + |
| 171 | + // Separate storage for the byte offset, this is hot while lexing but then |
| 172 | + // generally cold. |
| 173 | + int32_t byte_offset_; |
| 174 | +}; |
| 175 | + |
| 176 | +static_assert(sizeof(TokenInfo) == 8, |
| 177 | + "Expected `TokenInfo` to pack to an 8-byte structure."); |
| 178 | + |
| 179 | +} // namespace Carbon::Lex |
| 180 | + |
| 181 | +#endif // CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_ |
0 commit comments