Skip to content

Commit e73c945

Browse files
committed
Implement collect_block for lazy scorers.
1 parent 2fba123 commit e73c945

File tree

4 files changed

+112
-8
lines changed

4 files changed

+112
-8
lines changed

src/collector/sort_key/sort_key_computer.rs

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
use std::cmp::Ordering;
22

3+
use crate::collector::ComparableDoc;
34
use crate::collector::sort_key::ReverseOrder;
5+
use crate::collector::top_score_collector::push_assuming_capacity;
46
use crate::{DocId, Order, Result, Score, SegmentReader};
57

68
/// A `SegmentSortKeyComputer` makes it possible to modify the default score
@@ -23,7 +25,8 @@ pub trait SegmentSortKeyComputer: 'static {
2325
/// Returns true if the `SegmentSortKeyComputer` is a good candidate for the lazy evaluation
/// optimization. See [`SegmentSortKeyComputer::accept_score_lazy`].
fn is_lazy() -> bool {
    // TODO: Without this, we don't currently have test coverage for laziness.
    // NOTE(review): this default forces *every* implementor onto the lazy path; the
    // TODO above says it exists only to get test coverage. Confirm this is not a
    // temporary hack that should be reverted to `false` before release.
    true
}
2831

2932
/// Implementing this method makes it possible to avoid computing
@@ -43,9 +46,9 @@ pub trait SegmentSortKeyComputer: 'static {
4346
threshold: &Self::SegmentSortKey,
4447
) -> Option<(std::cmp::Ordering, Self::SegmentSortKey)> {
4548
let excluded_ordering = if REVERSE_ORDER {
46-
Ordering::Greater
47-
} else {
4849
Ordering::Less
50+
} else {
51+
Ordering::Greater
4952
};
5053
let sort_key = self.sort_key(doc_id, score);
5154
let cmp = sort_key.partial_cmp(threshold).unwrap_or(excluded_ordering);
@@ -56,6 +59,36 @@ pub trait SegmentSortKeyComputer: 'static {
5659
}
5760
}
5861

62+
/// Similar to `accept_sort_key_lazy`, but pushes results directly into the given buffer.
63+
///
64+
/// The buffer must have at least enough capacity for `docs` matches, or this method will
65+
/// panic.
66+
fn accept_sort_key_block_lazy<const REVERSE_ORDER: bool>(
67+
&mut self,
68+
docs: &[DocId],
69+
threshold: &Self::SegmentSortKey,
70+
output: &mut Vec<ComparableDoc<Self::SegmentSortKey, DocId, REVERSE_ORDER>>,
71+
) {
72+
let excluded_ordering = if REVERSE_ORDER {
73+
Ordering::Less
74+
} else {
75+
Ordering::Greater
76+
};
77+
for &doc in docs {
78+
let sort_key = self.sort_key(doc, 0.0);
79+
let cmp = sort_key.partial_cmp(threshold).unwrap_or(excluded_ordering);
80+
if cmp != excluded_ordering {
81+
push_assuming_capacity(
82+
ComparableDoc {
83+
sort_key,
84+
doc,
85+
},
86+
output,
87+
);
88+
}
89+
}
90+
}
91+
5992
/// Convert a segment level sort key into the global sort key.
6093
fn convert_segment_sort_key(&self, sort_key: Self::SegmentSortKey) -> Self::SortKey;
6194
}

src/collector/sort_key_top_collector.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,11 @@ where TSegmentSortKeyComputer: 'static + SegmentSortKeyComputer
102102
.collect_lazy(doc, score, &mut self.segment_sort_key_computer);
103103
}
104104

105+
/// Block-collection counterpart of `collect`: forwards the whole block of matched
/// docs to the segment collector's lazy block path.
fn collect_block(&mut self, docs: &[DocId]) {
    self.segment_collector
        .collect_block_lazy(docs, &mut self.segment_sort_key_computer);
}
109+
105110
fn harvest(self) -> Self::Fruit {
106111
let segment_hits: Vec<(TSegmentSortKeyComputer::SegmentSortKey, DocAddress)> =
107112
self.segment_collector.harvest();

src/collector/top_collector.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,15 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
202202
self.topn_computer.push(feature, doc);
203203
}
204204

205+
/// Block-collection counterpart of `collect_lazy`: hands the whole block of docs to
/// `TopNComputer::push_block_lazy`, which computes sort keys via `segment_scorer`.
#[inline]
pub fn collect_block_lazy(
    &mut self,
    docs: &[DocId],
    segment_scorer: &mut impl SegmentSortKeyComputer<SegmentSortKey = T>,
) {
    self.topn_computer.push_block_lazy(docs, segment_scorer);
}
213+
205214
#[inline]
206215
pub fn collect_lazy(
207216
&mut self,

src/collector/top_score_collector.rs

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -693,7 +693,9 @@ where
693693
/// Create a new `TopNComputer`.
694694
/// Internally it will allocate a buffer of size `2 * top_n`.
695695
pub fn new(top_n: usize) -> Self {
696-
let vec_cap = top_n.max(1) * 2;
696+
// We ensure that there is always enough space to include an entire block in the buffer if
697+
// need be, so that `push_block_lazy` can avoid checking capacity inside its loop.
698+
let vec_cap = (top_n.max(1) * 2) + crate::COLLECT_BLOCK_BUFFER_LEN;
697699
TopNComputer {
698700
buffer: Vec::with_capacity(vec_cap),
699701
top_n,
@@ -775,6 +777,12 @@ where TScore: PartialOrd + Clone
775777
else {
776778
return;
777779
};
780+
781+
if self.buffer.len() == self.buffer.capacity() {
782+
let median = self.truncate_top_n();
783+
self.threshold = Some(median);
784+
}
785+
778786
push_assuming_capacity(
779787
ComparableDoc {
780788
sort_key: feature,
@@ -789,13 +797,62 @@ where TScore: PartialOrd + Clone
789797
self.push(feature, doc);
790798
return;
791799
}
800+
801+
// Pushes a whole block of docs, computing sort keys through `score_tweaker` and
// pruning against the current threshold (the median kept by `truncate_top_n`).
#[inline(always)]
pub(crate) fn push_block_lazy<
    TSegmentSortKeyComputer: SegmentSortKeyComputer<SegmentSortKey = TScore>,
>(
    &mut self,
    docs: &[DocId],
    score_tweaker: &mut TSegmentSortKeyComputer,
) {
    // If the addition of this block might push us over capacity, start by truncating:
    // the buffer is allocated with capacity `2 * top_n + COLLECT_BLOCK_BUFFER_LEN`
    // (see `TopNComputer::new`), so after truncating down to `top_n` elements there is
    // always room for the entire block (although some of the block might be eliminated
    // by the threshold checks below).
    if self.buffer.len() + docs.len() > self.buffer.capacity() {
        let median = self.truncate_top_n();
        self.threshold = Some(median);
    }

    if let Some(last_median) = self.threshold.clone() {
        if TSegmentSortKeyComputer::is_lazy() {
            // We validated at the top of the method that we have capacity.
            score_tweaker.accept_sort_key_block_lazy::<REVERSE_ORDER>(docs, &last_median, &mut self.buffer);
            return;
        }

        // Eagerly push, with a threshold to compare to.
        for &doc in docs {
            // Block collection has no per-doc score available, hence the dummy 0.0.
            let sort_key = score_tweaker.sort_key(doc, 0.0);

            // NOTE(review): keys incomparable to the threshold (e.g. NaN) are *kept*
            // here (`>`/`<` evaluate false), whereas the lazy path above excludes them
            // via `partial_cmp(..).unwrap_or(excluded)` — confirm the divergence is
            // intended.
            if !REVERSE_ORDER && sort_key > last_median {
                continue;
            }
            if REVERSE_ORDER && sort_key < last_median {
                continue;
            }

            // We validated at the top of the method that we have capacity.
            let comparable_doc = ComparableDoc { doc, sort_key };
            push_assuming_capacity(comparable_doc, &mut self.buffer);
        }
    } else {
        // Eagerly push, without a threshold to compare to.
        for &doc in docs {
            let sort_key = score_tweaker.sort_key(doc, 0.0);
            // We validated at the top of the method that we have capacity.
            let comparable_doc = ComparableDoc { doc, sort_key };
            push_assuming_capacity(comparable_doc, &mut self.buffer);
        }
    }
}
792849
}
793850

794851
// Push an element provided there is enough capacity to do so.
795852
//
796853
// Panics if there is not enough capacity to add an element.
797854
#[inline(always)]
798-
fn push_assuming_capacity<T>(el: T, buf: &mut Vec<T>) {
855+
pub fn push_assuming_capacity<T>(el: T, buf: &mut Vec<T>) {
799856
let prev_len = buf.len();
800857
assert!(prev_len < buf.capacity());
801858
// This is mimicking the current (non-stabilized) implementation in std.
@@ -1509,11 +1566,11 @@ mod tests {
15091566
#[test]
15101567
fn test_top_field_collect_string_prop(
15111568
order in prop_oneof!(Just(Order::Desc), Just(Order::Asc)),
1512-
limit in 1..256_usize,
1513-
offset in 0..256_usize,
1569+
limit in 1..32_usize,
1570+
offset in 0..32_usize,
15141571
segments_terms in
15151572
proptest::collection::vec(
1516-
proptest::collection::vec(0..32_u8, 1..32_usize),
1573+
proptest::collection::vec(0..64_u8, 1..256_usize),
15171574
0..8_usize,
15181575
)
15191576
) {

0 commit comments

Comments
 (0)