Skip to content

Commit 6fb0bb4

Browse files
committed
feat: no state hashing during catch-up (state manager)
1 parent 0e4b3d1 commit 6fb0bb4

7 files changed

Lines changed: 327 additions & 9 deletions

File tree

rs/interfaces/state_manager/mocks/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ mock! {
5151

5252
fn list_state_hashes_to_certify(&self) -> Vec<(Height, CryptoHashOfPartialState)>;
5353

54+
fn list_state_heights_to_certify(&self) -> Vec<Height>;
55+
5456
fn deliver_state_certification(&self, certification: Certification);
5557

5658
fn remove_states_below(&self, height: Height);

rs/interfaces/state_manager/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,10 @@ pub trait StateManager: StateReader {
146146
/// ```
147147
fn list_state_hashes_to_certify(&self) -> Vec<(Height, CryptoHashOfPartialState)>;
148148

149+
/// Returns a list of heights for which the state manager optimistically requests
150+
/// a certification to be delivered via `state_manager.deliver_state_certification`.
151+
fn list_state_heights_to_certify(&self) -> Vec<Height>;
152+
149153
/// Delivers a `certification` corresponding to some state hash / height
150154
/// pair.
151155
///

rs/state_machine_tests/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -942,6 +942,10 @@ impl StateManager for StateMachineStateManager {
942942
self.deref().list_state_hashes_to_certify()
943943
}
944944

945+
// Pure delegation: forwards the call unchanged to the value obtained via
// `self.deref()` and returns its result as-is.
fn list_state_heights_to_certify(&self) -> Vec<Height> {
946+
self.deref().list_state_heights_to_certify()
947+
}
948+
945949
fn deliver_state_certification(&self, certification: Certification) {
946950
self.deref().deliver_state_certification(certification)
947951
}

rs/state_manager/src/lib.rs

Lines changed: 68 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ use std::sync::{
8383
};
8484
use std::time::{Duration, Instant, SystemTime};
8585
use std::{
86-
collections::{BTreeMap, BTreeSet, VecDeque},
86+
collections::{BTreeMap, BTreeSet, HashSet, VecDeque},
8787
sync::Mutex,
8888
};
8989
use tempfile::tempfile;
@@ -118,9 +118,15 @@ const CRITICAL_ERROR_REPLICATED_STATE_ALTERED_AFTER_CHECKPOINT: &str =
118118
const ARCHIVED_DIVERGED_CHECKPOINT_MAX_AGE: Duration = Duration::from_secs(30 * 24 * 60 * 60); // 30 days
119119

120120
/// The maximum number of consecutive rounds for which the optimization of
121-
/// skipping state cloning and certification metadata computation triggers.
121+
/// skipping state cloning and computing certification metadata triggers
122+
/// while catching up.
122123
const MAX_CONSECUTIVE_ROUNDS_WITHOUT_STATE_CLONING: u64 = 10;
123124

125+
/// The maximum number of future heights starting at tip height
126+
/// that the state manager optimistically asks consensus to certify
127+
/// in advance while catching up.
128+
const MAX_FUTURE_HEIGHTS_TO_CERTIFY: u64 = 20;
129+
124130
/// Write an overlay file this many rounds before each checkpoint.
125131
pub const NUM_ROUNDS_BEFORE_CHECKPOINT_TO_WRITE_OVERLAY: u64 = 50;
126132

@@ -765,6 +771,7 @@ impl StateSyncMetrics {
765771
type StatesMetadata = BTreeMap<Height, StateMetadata>;
766772

767773
type CertificationsMetadata = BTreeMap<Height, CertificationMetadata>;
774+
type Certifications = BTreeMap<Height, Certification>;
768775

769776
/// This struct bundles the root hash, manifest and meta-manifest.
770777
#[derive(Clone, Debug)]
@@ -886,6 +893,8 @@ impl StateSyncRefs {
886893
struct SharedState {
887894
/// Certifications metadata kept for all states
888895
certifications_metadata: CertificationsMetadata,
896+
/// Certifications delivered optimistically to optimize state hashing.
897+
certifications: Certifications,
889898
/// Metadata for each checkpoint
890899
states_metadata: StatesMetadata,
891900
/// A list of states present in the memory. This list is guaranteed to not be
@@ -1556,6 +1565,7 @@ impl StateManagerImpl {
15561565

15571566
let states = Arc::new(parking_lot::RwLock::new(SharedState {
15581567
certifications_metadata,
1568+
certifications: BTreeMap::new(),
15591569
states_metadata,
15601570
snapshots,
15611571
last_advertised: Self::INITIAL_STATE_HEIGHT,
@@ -2316,6 +2326,10 @@ impl StateManagerImpl {
23162326
.latest_certified_height
23172327
.set(latest_certified_height.get() as i64);
23182328

2329+
let mut certifications = states.certifications.split_off(&last_height_to_keep);
2330+
std::mem::swap(&mut certifications, &mut states.certifications);
2331+
self.deallocator_thread.send(Box::new(certifications));
2332+
23192333
let mut metadata_to_keep = states.states_metadata.split_off(&last_height_to_keep);
23202334

23212335
for h in checkpoint_heights_to_keep.iter() {
@@ -2711,6 +2725,8 @@ impl StateManager for StateManagerImpl {
27112725
states.certifications_metadata.get(&tip_height)
27122726
{
27132727
CryptoHashOfPartialState::from(tip_metadata.certified_state_hash.clone())
2728+
} else if let Some(tip_certification) = states.certifications.get(&tip_height) {
2729+
tip_certification.signed.content.hash.clone()
27142730
} else {
27152731
std::mem::drop(states);
27162732

@@ -2987,6 +3003,27 @@ impl StateManager for StateManagerImpl {
29873003
.collect()
29883004
}
29893005

3006+
// Lists the heights in the half-open range
// `tip_height..min(tip_height + MAX_FUTURE_HEIGHTS_TO_CERTIFY, latest_subnet_certified_height)`
// that have no entry in `states.certifications` yet.
//
// The range starts at the tip height itself and excludes its upper bound, so
// the result is empty whenever `latest_subnet_certified_height` is not above
// the tip height.
fn list_state_heights_to_certify(&self) -> Vec<Height> {
3007+
// Snapshot everything needed from the shared state under the read lock.
let states = self.states.read();
3008+
let tip_height = states.tip_height.get();
3009+
// Copy the keys out so that the membership test below does not need the lock.
let heights_with_certification: HashSet<_> =
3010+
states.certifications.keys().cloned().collect();
3011+
// Release the read guard before doing the remaining work.
drop(states);
3012+
3013+
let latest_subnet_certified_height =
3014+
self.latest_subnet_certified_height.load(Ordering::Relaxed);
3015+
// Candidate heights: at most `MAX_FUTURE_HEIGHTS_TO_CERTIFY` heights starting
// at the tip, capped (exclusively) by the latest subnet-certified height.
let state_heights = tip_height
3016+
..min(
3017+
tip_height + MAX_FUTURE_HEIGHTS_TO_CERTIFY,
3018+
latest_subnet_certified_height,
3019+
);
3020+
state_heights
3021+
.into_iter()
3022+
.map(Height::new)
3023+
// Skip heights whose certification is already stored.
.filter(|h| !heights_with_certification.contains(h))
3024+
.collect()
3025+
}
3026+
29903027
fn deliver_state_certification(&self, certification: Certification) {
29913028
let _timer = self
29923029
.metrics
@@ -3038,6 +3075,9 @@ impl StateManager for StateManagerImpl {
30383075
self.deallocator_thread.send(Box::new(tree));
30393076
}
30403077
}
3078+
} else {
3079+
let height = certification.height;
3080+
states.certifications.insert(height, certification);
30413081
}
30423082
}
30433083

@@ -3337,11 +3377,7 @@ impl StateManager for StateManagerImpl {
33373377

33383378
assert_tip_is_none(&states);
33393379

3340-
// It's possible that we already computed this state before. We
3341-
// validate that hashes agree to spot bugs causing non-determinism as
3342-
// early as possible.
3343-
if let Some(prev_metadata) = states.certifications_metadata.get(&height) {
3344-
let prev_hash = &prev_metadata.certified_state_hash;
3380+
let assert_prev_hash_matches = |prev_hash| {
33453381
let hash = &certification_metadata.certified_state_hash;
33463382
if prev_hash != hash {
33473383
if let Err(err) = self.state_layout.create_diverged_state_marker(height) {
@@ -3354,6 +3390,23 @@ impl StateManager for StateManagerImpl {
33543390
"Committed state @{height} with hash {hash:?} which is different from previously computed or delivered hash {prev_hash:?}"
33553391
);
33563392
}
3393+
};
3394+
3395+
// It's possible that we already computed this state before. We
3396+
// validate that hashes agree to spot bugs causing non-determinism as
3397+
// early as possible.
3398+
if let Some(prev_metadata) = states.certifications_metadata.get(&height) {
3399+
let prev_hash = &prev_metadata.certified_state_hash;
3400+
assert_prev_hash_matches(prev_hash);
3401+
}
3402+
3403+
// We reuse the certification delivered by consensus if possible.
3404+
// We also validate that hashes agree to spot bugs causing non-determinism as
3405+
// early as possible.
3406+
if let Some(certification) = states.certifications.get(&height) {
3407+
let prev_hash = &certification.signed.content.hash.clone().get();
3408+
assert_prev_hash_matches(prev_hash);
3409+
certification_metadata.certification = Some(certification.clone());
33573410
}
33583411

33593412
if !states
@@ -4093,6 +4146,9 @@ pub mod testing {
40934146
/// Testing only: Returns certification at a given height in `states.certifications_metadata`.
40944147
fn certifications_metadata_certification(&self, height: Height) -> Option<Certification>;
40954148

4149+
/// Testing only: Returns certifications in `states.certifications`.
4150+
fn certifications(&self) -> BTreeMap<Height, Certification>;
4151+
40964152
/// Testing only: Returns `fast_forward_height`.
40974153
fn fast_forward_height(&self) -> u64;
40984154
}
@@ -4175,6 +4231,11 @@ pub mod testing {
41754231
.clone()
41764232
}
41774233

4234+
// Testing only: takes the read lock on `self.states` and returns a clone of
// the entire `certifications` map (height -> certification).
fn certifications(&self) -> BTreeMap<Height, Certification> {
4235+
let states = self.states.read();
4236+
states.certifications.clone()
4237+
}
4238+
41784239
fn fast_forward_height(&self) -> u64 {
41794240
self.fast_forward_height.load(Ordering::Relaxed)
41804241
}

rs/state_manager/src/tip.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1679,6 +1679,7 @@ mod test {
16791679

16801680
let dummy_states = Arc::new(parking_lot::RwLock::new(SharedState {
16811681
certifications_metadata: Default::default(),
1682+
certifications: Default::default(),
16821683
states_metadata: Default::default(),
16831684
snapshots: Default::default(),
16841685
last_advertised: Height::new(0),

0 commit comments

Comments
 (0)