Skip to content

Commit 0903e92

Browse files
agourlaycoszio
andauthored
Fix UnindexedField infra to handle lookup&range index requirements (#6496)
* Fix UnindexedField infra to handle lookup&range index requirements * more generic approach * better test * restore * small unit test for sanity * review nits * Do not recommend parametrized index --------- Co-authored-by: Luis Cossío <[email protected]>
1 parent b92a322 commit 0903e92

2 files changed

Lines changed: 261 additions & 71 deletions

File tree

lib/collection/src/problems/unindexed_field.rs

Lines changed: 155 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,13 @@ use http::{HeaderMap, HeaderValue, Method, Uri};
88
use issues::{Action, Code, ImmediateSolution, Issue, Solution};
99
use itertools::Itertools;
1010
use segment::common::operation_error::OperationError;
11-
use segment::data_types::index::{TextIndexParams, TextIndexType, TokenizerType};
1211
use segment::index::query_optimization::rescore_formula::parsed_formula::VariableId;
1312
use segment::json_path::JsonPath;
1413
use segment::types::{
1514
AnyVariants, Condition, FieldCondition, Filter, Match, MatchValue, PayloadFieldSchema,
1615
PayloadKeyType, PayloadSchemaParams, PayloadSchemaType, RangeInterface, UuidPayloadType,
1716
};
18-
use strum::IntoEnumIterator as _;
17+
use strum::{EnumIter, IntoEnumIterator as _};
1918

2019
use crate::operations::universal_query::formula::ExpressionInternal;
2120
#[derive(Debug)]
@@ -162,33 +161,33 @@ impl Issue for UnindexedField {
162161
}
163162

164163
/// Suggest any index, let user choose depending on their data type
165-
fn all_indexes() -> impl Iterator<Item = PayloadFieldSchema> {
166-
PayloadSchemaType::iter().map(PayloadFieldSchema::FieldType)
164+
fn all_indexes() -> impl Iterator<Item = FieldIndexType> {
165+
FieldIndexType::iter()
167166
}
168167

169-
fn infer_schema_from_match_value(value: &MatchValue) -> Vec<PayloadFieldSchema> {
168+
fn infer_index_from_match_value(value: &MatchValue) -> Vec<FieldIndexType> {
170169
match &value.value {
171170
segment::types::ValueVariants::String(string) => {
172171
let mut inferred = Vec::new();
173172

174173
if UuidPayloadType::parse_str(string).is_ok() {
175-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid))
174+
inferred.push(FieldIndexType::UuidMatch)
176175
}
177176

178-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Keyword));
177+
inferred.push(FieldIndexType::KeywordMatch);
179178

180179
inferred
181180
}
182181
segment::types::ValueVariants::Integer(_integer) => {
183-
vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Integer)]
182+
vec![FieldIndexType::IntMatch]
184183
}
185184
segment::types::ValueVariants::Bool(_boolean) => {
186-
vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Bool)]
185+
vec![FieldIndexType::BoolMatch]
187186
}
188187
}
189188
}
190189

191-
fn infer_schema_from_any_variants(value: &AnyVariants) -> Vec<PayloadFieldSchema> {
190+
fn infer_index_from_any_variants(value: &AnyVariants) -> Vec<FieldIndexType> {
192191
match value {
193192
AnyVariants::Strings(strings) => {
194193
let mut inferred = Vec::new();
@@ -197,20 +196,20 @@ fn infer_schema_from_any_variants(value: &AnyVariants) -> Vec<PayloadFieldSchema
197196
.iter()
198197
.all(|s| UuidPayloadType::parse_str(s).is_ok())
199198
{
200-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid))
199+
inferred.push(FieldIndexType::UuidMatch)
201200
}
202201

203-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Keyword));
202+
inferred.push(FieldIndexType::KeywordMatch);
204203

205204
inferred
206205
}
207206
AnyVariants::Integers(_integers) => {
208-
vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Integer)]
207+
vec![FieldIndexType::IntMatch]
209208
}
210209
}
211210
}
212211

213-
fn infer_schema_from_field_condition(field_condition: &FieldCondition) -> Vec<PayloadFieldSchema> {
212+
fn infer_index_from_field_condition(field_condition: &FieldCondition) -> Vec<FieldIndexType> {
214213
let FieldCondition {
215214
key: _key,
216215
r#match,
@@ -223,47 +222,36 @@ fn infer_schema_from_field_condition(field_condition: &FieldCondition) -> Vec<Pa
223222
is_null,
224223
} = field_condition;
225224

226-
let mut inferred = Vec::new();
225+
let mut required_indexes = Vec::new();
227226

228227
if let Some(r#match) = r#match {
229-
inferred.extend(match r#match {
230-
Match::Value(match_value) => infer_schema_from_match_value(match_value),
231-
Match::Text(_match_text) => {
232-
vec![PayloadFieldSchema::FieldParams(PayloadSchemaParams::Text(
233-
TextIndexParams {
234-
r#type: TextIndexType::Text,
235-
tokenizer: TokenizerType::default(),
236-
min_token_len: None,
237-
max_token_len: None,
238-
lowercase: None,
239-
on_disk: None,
240-
},
241-
))]
242-
}
243-
Match::Any(match_any) => infer_schema_from_any_variants(&match_any.any),
244-
Match::Except(match_except) => infer_schema_from_any_variants(&match_except.except),
228+
required_indexes.extend(match r#match {
229+
Match::Value(match_value) => infer_index_from_match_value(match_value),
230+
Match::Text(_match_text) => vec![FieldIndexType::Text],
231+
Match::Any(match_any) => infer_index_from_any_variants(&match_any.any),
232+
Match::Except(match_except) => infer_index_from_any_variants(&match_except.except),
245233
})
246234
}
247235
if let Some(range_interface) = range {
248236
match range_interface {
249237
RangeInterface::DateTime(_) => {
250-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Datetime));
238+
required_indexes.push(FieldIndexType::DatetimeRange);
251239
}
252240
RangeInterface::Float(_) => {
253-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Float));
254-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Integer));
241+
required_indexes.push(FieldIndexType::FloatRange);
242+
required_indexes.push(FieldIndexType::IntRange);
255243
}
256244
}
257245
}
258246
if geo_bounding_box.is_some() || geo_radius.is_some() || geo_polygon.is_some() {
259-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Geo));
247+
required_indexes.push(FieldIndexType::Geo);
260248
}
261249
if values_count.is_some() || is_empty.is_some() || is_null.is_some() {
262250
// Any index will do, let user choose depending on their data type
263-
inferred.extend(all_indexes());
251+
required_indexes.extend(all_indexes());
264252
}
265253

266-
inferred
254+
required_indexes
267255
}
268256

269257
pub struct IssueExtractor<'a> {
@@ -359,12 +347,12 @@ impl<'a> Extractor<'a> {
359347

360348
fn update_from_condition(&mut self, nested_prefix: Option<&JsonPath>, condition: &Condition) {
361349
let key;
362-
let inferred;
350+
let required_index;
363351

364352
match condition {
365353
Condition::Field(field_condition) => {
366354
key = &field_condition.key;
367-
inferred = infer_schema_from_field_condition(field_condition);
355+
required_index = infer_index_from_field_condition(field_condition);
368356
}
369357
Condition::Filter(filter) => {
370358
self.update_from_filter(nested_prefix, filter);
@@ -377,14 +365,14 @@ impl<'a> Extractor<'a> {
377365
);
378366
return;
379367
}
380-
// Any index will suffice
368+
// Any index will suffice to get the satellite null index
381369
Condition::IsEmpty(is_empty) => {
382370
key = &is_empty.is_empty.key;
383-
inferred = all_indexes().collect();
371+
required_index = all_indexes().collect();
384372
}
385373
Condition::IsNull(is_null) => {
386374
key = &is_null.is_null.key;
387-
inferred = all_indexes().collect();
375+
required_index = all_indexes().collect();
388376
}
389377
// No index needed
390378
Condition::HasId(_) => return,
@@ -394,35 +382,26 @@ impl<'a> Extractor<'a> {
394382

395383
let full_key = JsonPath::extend_or_new(nested_prefix, key);
396384

397-
if self.needs_index(&full_key, &inferred) {
385+
if self.needs_index(&full_key, &required_index) {
386+
let schemas = required_index
387+
.into_iter()
388+
.map(PayloadSchemaType::from)
389+
.map(PayloadFieldSchema::FieldType);
398390
self.unindexed_schema
399391
.entry(full_key)
400392
.or_default()
401-
.extend(inferred);
393+
.extend(schemas);
402394
}
403395
}
404396

405-
fn needs_index(&self, key: &JsonPath, inferred: &[PayloadFieldSchema]) -> bool {
397+
fn needs_index(&self, key: &JsonPath, required_indexes: &[FieldIndexType]) -> bool {
406398
match self.payload_schema.get(key) {
407399
Some(index_info) => {
408-
let index_info_kind = index_info.kind();
409-
410-
let already_indexed = inferred
400+
// check if the index present has the right capabilities
401+
let index_field_types = schema_capabilities(index_info);
402+
let already_indexed = required_indexes
411403
.iter()
412-
// TODO(strict-mode):
413-
// Use better comparisons for parametrized indexes. An idea is to make the inferring step
414-
// also output valid parametrized indexes and compare those instead of just the kind (index type)
415-
//
416-
// The only reason why it would be needed is because integer index can be parametrized
417-
// with just lookup or just range, so it is possible to make a false negative here. E.g.
418-
//
419-
// condition: MatchValue
420-
// inferred: FieldType(Integer)
421-
// index_info: FieldParams(IntegerIndex(range))
422-
//
423-
// In this case, we would assume that the field is indexed correctly when it is not
424-
.map(PayloadFieldSchema::kind)
425-
.any(|inferred| inferred == index_info_kind);
404+
.any(|required| index_field_types.contains(required));
426405

427406
!already_indexed
428407
}
@@ -432,7 +411,7 @@ impl<'a> Extractor<'a> {
432411

433412
pub fn update_from_expression(&mut self, expression: &ExpressionInternal) {
434413
let key;
435-
let inferred;
414+
let required_index;
436415

437416
match expression {
438417
ExpressionInternal::Constant(_) => return,
@@ -447,9 +426,10 @@ impl<'a> Extractor<'a> {
447426
VariableId::Score(_) => return,
448427
VariableId::Payload(json_path) => {
449428
key = json_path;
450-
inferred = vec![
451-
PayloadFieldSchema::FieldType(PayloadSchemaType::Integer),
452-
PayloadFieldSchema::FieldType(PayloadSchemaType::Float),
429+
required_index = vec![
430+
FieldIndexType::IntMatch,
431+
FieldIndexType::IntRange,
432+
FieldIndexType::FloatRange,
453433
];
454434
}
455435
VariableId::Condition(_) => return,
@@ -461,12 +441,12 @@ impl<'a> Extractor<'a> {
461441
}
462442
ExpressionInternal::GeoDistance { origin: _, to } => {
463443
key = to.clone();
464-
inferred = vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Geo)];
444+
required_index = vec![FieldIndexType::Geo];
465445
}
466446
ExpressionInternal::Datetime(_) => return,
467447
ExpressionInternal::DatetimeKey(variable) => {
468448
key = variable.clone();
469-
inferred = vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Datetime)];
449+
required_index = vec![FieldIndexType::DatetimeRange];
470450
}
471451
ExpressionInternal::Mult(expression_internals) => {
472452
for expr in expression_internals {
@@ -533,11 +513,116 @@ impl<'a> Extractor<'a> {
533513
}
534514
}
535515

536-
if self.needs_index(&key, &inferred) {
516+
if self.needs_index(&key, &required_index) {
517+
let schemas = required_index
518+
.into_iter()
519+
.map(PayloadSchemaType::from)
520+
.map(PayloadFieldSchema::FieldType);
537521
self.unindexed_schema
538522
.entry(key)
539523
.or_default()
540-
.extend(inferred);
524+
.extend(schemas);
541525
}
542526
}
543527
}
528+
529+
/// All types of internal indexes
530+
#[derive(Debug, Eq, PartialEq, EnumIter, Hash)]
531+
enum FieldIndexType {
532+
IntMatch,
533+
IntRange,
534+
KeywordMatch,
535+
FloatRange,
536+
Text,
537+
BoolMatch,
538+
UuidMatch,
539+
UuidRange,
540+
DatetimeRange,
541+
Geo,
542+
}
543+
544+
fn schema_capabilities(value: &PayloadFieldSchema) -> HashSet<FieldIndexType> {
545+
let mut index_types = HashSet::new();
546+
match value {
547+
PayloadFieldSchema::FieldType(payload_schema_type) => match payload_schema_type {
548+
PayloadSchemaType::Keyword => index_types.insert(FieldIndexType::KeywordMatch),
549+
PayloadSchemaType::Integer => {
550+
index_types.insert(FieldIndexType::IntMatch);
551+
index_types.insert(FieldIndexType::IntRange)
552+
}
553+
PayloadSchemaType::Uuid => {
554+
index_types.insert(FieldIndexType::UuidMatch);
555+
index_types.insert(FieldIndexType::UuidRange)
556+
}
557+
PayloadSchemaType::Bool => index_types.insert(FieldIndexType::BoolMatch),
558+
PayloadSchemaType::Float => index_types.insert(FieldIndexType::FloatRange),
559+
PayloadSchemaType::Geo => index_types.insert(FieldIndexType::Geo),
560+
PayloadSchemaType::Text => index_types.insert(FieldIndexType::Text),
561+
PayloadSchemaType::Datetime => index_types.insert(FieldIndexType::DatetimeRange),
562+
},
563+
PayloadFieldSchema::FieldParams(payload_schema_params) => match payload_schema_params {
564+
PayloadSchemaParams::Keyword(_) => index_types.insert(FieldIndexType::KeywordMatch),
565+
PayloadSchemaParams::Integer(integer_index_params) => {
566+
if integer_index_params.lookup == Some(true) {
567+
index_types.insert(FieldIndexType::IntMatch);
568+
}
569+
if integer_index_params.range == Some(true) {
570+
index_types.insert(FieldIndexType::IntRange);
571+
}
572+
debug_assert!(
573+
!index_types.is_empty(),
574+
"lookup or range must be true for Integer payload index"
575+
);
576+
// unifying match arm types
577+
true
578+
}
579+
PayloadSchemaParams::Uuid(_) => {
580+
index_types.insert(FieldIndexType::UuidMatch);
581+
index_types.insert(FieldIndexType::UuidRange)
582+
}
583+
PayloadSchemaParams::Bool(_) => index_types.insert(FieldIndexType::BoolMatch),
584+
PayloadSchemaParams::Float(_) => index_types.insert(FieldIndexType::FloatRange),
585+
PayloadSchemaParams::Geo(_) => index_types.insert(FieldIndexType::Geo),
586+
PayloadSchemaParams::Text(_) => index_types.insert(FieldIndexType::Text),
587+
PayloadSchemaParams::Datetime(_) => index_types.insert(FieldIndexType::DatetimeRange),
588+
},
589+
};
590+
index_types
591+
}
592+
593+
impl From<FieldIndexType> for PayloadSchemaType {
594+
fn from(val: FieldIndexType) -> Self {
595+
match val {
596+
FieldIndexType::IntMatch => PayloadSchemaType::Integer,
597+
FieldIndexType::IntRange => PayloadSchemaType::Integer,
598+
FieldIndexType::KeywordMatch => PayloadSchemaType::Keyword,
599+
FieldIndexType::FloatRange => PayloadSchemaType::Float,
600+
FieldIndexType::Text => PayloadSchemaType::Text,
601+
FieldIndexType::BoolMatch => PayloadSchemaType::Bool,
602+
FieldIndexType::UuidMatch => PayloadSchemaType::Uuid,
603+
FieldIndexType::UuidRange => PayloadSchemaType::Uuid,
604+
FieldIndexType::DatetimeRange => PayloadSchemaType::Datetime,
605+
FieldIndexType::Geo => PayloadSchemaType::Geo,
606+
}
607+
}
608+
}
609+
610+
#[cfg(test)]
611+
mod tests {
612+
use segment::data_types::index::IntegerIndexParams;
613+
614+
use super::*;
615+
616+
#[test]
617+
fn integer_index_capacities() {
618+
let params = PayloadSchemaParams::Integer(IntegerIndexParams {
619+
lookup: Some(true),
620+
range: Some(true),
621+
..Default::default()
622+
});
623+
let schema = PayloadFieldSchema::FieldParams(params);
624+
let index_types = schema_capabilities(&schema);
625+
assert!(index_types.contains(&FieldIndexType::IntMatch));
626+
assert!(index_types.contains(&FieldIndexType::IntRange));
627+
}
628+
}

0 commit comments

Comments
 (0)