Skip to content

Commit c5ed6bc

Browse files
committed
more generic approach
1 parent af248be commit c5ed6bc

2 files changed

Lines changed: 133 additions & 117 deletions

File tree

lib/collection/src/problems/unindexed_field.rs

Lines changed: 133 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@ use http::{HeaderMap, HeaderValue, Method, Uri};
88
use issues::{Action, Code, ImmediateSolution, Issue, Solution};
99
use itertools::Itertools;
1010
use segment::common::operation_error::OperationError;
11-
use segment::data_types::index::{TextIndexParams, TextIndexType, TokenizerType};
11+
use segment::data_types::index::IntegerIndexParams;
1212
use segment::index::query_optimization::rescore_formula::parsed_formula::VariableId;
1313
use segment::json_path::JsonPath;
1414
use segment::types::{
1515
AnyVariants, Condition, FieldCondition, Filter, Match, MatchValue, PayloadFieldSchema,
1616
PayloadKeyType, PayloadSchemaParams, PayloadSchemaType, RangeInterface, UuidPayloadType,
1717
};
18-
use strum::IntoEnumIterator as _;
18+
use strum::{EnumIter, IntoEnumIterator as _};
1919

2020
use crate::operations::universal_query::formula::ExpressionInternal;
2121
#[derive(Debug)]
@@ -162,33 +162,33 @@ impl Issue for UnindexedField {
162162
}
163163

164164
/// Suggest any index, let user choose depending on their data type
165-
fn all_indexes() -> impl Iterator<Item = PayloadFieldSchema> {
166-
PayloadSchemaType::iter().map(PayloadFieldSchema::FieldType)
165+
fn all_indexes() -> impl Iterator<Item = FieldIndexType> {
166+
FieldIndexType::iter()
167167
}
168168

169-
fn infer_schema_from_match_value(value: &MatchValue) -> Vec<PayloadFieldSchema> {
169+
fn infer_index_from_match_value(value: &MatchValue) -> Vec<FieldIndexType> {
170170
match &value.value {
171171
segment::types::ValueVariants::String(string) => {
172172
let mut inferred = Vec::new();
173173

174174
if UuidPayloadType::parse_str(string).is_ok() {
175-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid))
175+
inferred.push(FieldIndexType::UuidMatch)
176176
}
177177

178-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Keyword));
178+
inferred.push(FieldIndexType::KeywordMatch);
179179

180180
inferred
181181
}
182182
segment::types::ValueVariants::Integer(_integer) => {
183-
vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Integer)]
183+
vec![FieldIndexType::IntMatch]
184184
}
185185
segment::types::ValueVariants::Bool(_boolean) => {
186-
vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Bool)]
186+
vec![FieldIndexType::BoolMatch]
187187
}
188188
}
189189
}
190190

191-
fn infer_schema_from_any_variants(value: &AnyVariants) -> Vec<PayloadFieldSchema> {
191+
fn infer_index_from_any_variants(value: &AnyVariants) -> Vec<FieldIndexType> {
192192
match value {
193193
AnyVariants::Strings(strings) => {
194194
let mut inferred = Vec::new();
@@ -197,20 +197,20 @@ fn infer_schema_from_any_variants(value: &AnyVariants) -> Vec<PayloadFieldSchema
197197
.iter()
198198
.all(|s| UuidPayloadType::parse_str(s).is_ok())
199199
{
200-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid))
200+
inferred.push(FieldIndexType::UuidMatch)
201201
}
202202

203-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Keyword));
203+
inferred.push(FieldIndexType::KeywordMatch);
204204

205205
inferred
206206
}
207207
AnyVariants::Integers(_integers) => {
208-
vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Integer)]
208+
vec![FieldIndexType::IntMatch]
209209
}
210210
}
211211
}
212212

213-
fn infer_schema_from_field_condition(field_condition: &FieldCondition) -> Vec<PayloadFieldSchema> {
213+
fn infer_index_from_field_condition(field_condition: &FieldCondition) -> Vec<FieldIndexType> {
214214
let FieldCondition {
215215
key: _key,
216216
r#match,
@@ -223,47 +223,36 @@ fn infer_schema_from_field_condition(field_condition: &FieldCondition) -> Vec<Pa
223223
is_null,
224224
} = field_condition;
225225

226-
let mut inferred = Vec::new();
226+
let mut required_indexes = Vec::new();
227227

228228
if let Some(r#match) = r#match {
229-
inferred.extend(match r#match {
230-
Match::Value(match_value) => infer_schema_from_match_value(match_value),
231-
Match::Text(_match_text) => {
232-
vec![PayloadFieldSchema::FieldParams(PayloadSchemaParams::Text(
233-
TextIndexParams {
234-
r#type: TextIndexType::Text,
235-
tokenizer: TokenizerType::default(),
236-
min_token_len: None,
237-
max_token_len: None,
238-
lowercase: None,
239-
on_disk: None,
240-
},
241-
))]
242-
}
243-
Match::Any(match_any) => infer_schema_from_any_variants(&match_any.any),
244-
Match::Except(match_except) => infer_schema_from_any_variants(&match_except.except),
229+
required_indexes.extend(match r#match {
230+
Match::Value(match_value) => infer_index_from_match_value(match_value),
231+
Match::Text(_match_text) => vec![FieldIndexType::Text],
232+
Match::Any(match_any) => infer_index_from_any_variants(&match_any.any),
233+
Match::Except(match_except) => infer_index_from_any_variants(&match_except.except),
245234
})
246235
}
247236
if let Some(range_interface) = range {
248237
match range_interface {
249238
RangeInterface::DateTime(_) => {
250-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Datetime));
239+
required_indexes.push(FieldIndexType::DatetimeRange);
251240
}
252241
RangeInterface::Float(_) => {
253-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Float));
254-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Integer));
242+
required_indexes.push(FieldIndexType::FloatRange);
243+
required_indexes.push(FieldIndexType::IntRange);
255244
}
256245
}
257246
}
258247
if geo_bounding_box.is_some() || geo_radius.is_some() || geo_polygon.is_some() {
259-
inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Geo));
248+
required_indexes.push(FieldIndexType::Geo);
260249
}
261250
if values_count.is_some() || is_empty.is_some() || is_null.is_some() {
262251
// Any index will do, let user choose depending on their data type
263-
inferred.extend(all_indexes());
252+
required_indexes.extend(all_indexes());
264253
}
265254

266-
inferred
255+
required_indexes
267256
}
268257

269258
pub struct IssueExtractor<'a> {
@@ -309,8 +298,6 @@ impl<'a> IssueExtractor<'a> {
309298
pub struct Extractor<'a> {
310299
payload_schema: &'a HashMap<PayloadKeyType, PayloadFieldSchema>,
311300
unindexed_schema: HashMap<PayloadKeyType, Vec<PayloadFieldSchema>>,
312-
needs_match_index: bool,
313-
needs_range_index: bool,
314301
}
315302

316303
impl<'a> Extractor<'a> {
@@ -322,8 +309,6 @@ impl<'a> Extractor<'a> {
322309
let mut extractor = Self {
323310
payload_schema,
324311
unindexed_schema: HashMap::new(),
325-
needs_match_index: false,
326-
needs_range_index: false,
327312
};
328313

329314
extractor.update_from_filter(None, filter);
@@ -336,8 +321,6 @@ impl<'a> Extractor<'a> {
336321
Self {
337322
payload_schema,
338323
unindexed_schema: HashMap::new(),
339-
needs_match_index: false,
340-
needs_range_index: false,
341324
}
342325
}
343326

@@ -365,14 +348,12 @@ impl<'a> Extractor<'a> {
365348

366349
fn update_from_condition(&mut self, nested_prefix: Option<&JsonPath>, condition: &Condition) {
367350
let key;
368-
let inferred;
351+
let required_index;
369352

370353
match condition {
371354
Condition::Field(field_condition) => {
372355
key = &field_condition.key;
373-
self.update_needs_match(field_condition.r#match.is_some());
374-
self.update_needs_range(field_condition.range.is_some());
375-
inferred = infer_schema_from_field_condition(field_condition);
356+
required_index = infer_index_from_field_condition(field_condition);
376357
}
377358
Condition::Filter(filter) => {
378359
self.update_from_filter(nested_prefix, filter);
@@ -388,11 +369,11 @@ impl<'a> Extractor<'a> {
388369
// Any index will suffice
389370
Condition::IsEmpty(is_empty) => {
390371
key = &is_empty.is_empty.key;
391-
inferred = all_indexes().collect();
372+
required_index = all_indexes().collect();
392373
}
393374
Condition::IsNull(is_null) => {
394375
key = &is_null.is_null.key;
395-
inferred = all_indexes().collect();
376+
required_index = all_indexes().collect();
396377
}
397378
// No index needed
398379
Condition::HasId(_) => return,
@@ -402,46 +383,23 @@ impl<'a> Extractor<'a> {
402383

403384
let full_key = JsonPath::extend_or_new(nested_prefix, key);
404385

405-
if self.needs_index(&full_key, &inferred) {
386+
if self.needs_index(&full_key, &required_index) {
387+
let schemas = required_index.into_iter().map(PayloadFieldSchema::from);
406388
self.unindexed_schema
407389
.entry(full_key)
408390
.or_default()
409-
.extend(inferred);
391+
.extend(schemas);
410392
}
411393
}
412394

413-
/// Capture that an index with Match capability is required.
414-
/// Does not toggle off if has been set once already.
415-
fn update_needs_match(&mut self, is_match: bool) {
416-
if !self.needs_match_index {
417-
self.needs_match_index = is_match
418-
}
419-
}
420-
421-
/// Capture that an index with Match capability is required.
422-
/// Does not toggle off if has been set once already.
423-
fn update_needs_range(&mut self, is_range: bool) {
424-
if !self.needs_range_index {
425-
self.needs_range_index = is_range
426-
}
427-
}
428-
429-
fn needs_index(&self, key: &JsonPath, inferred: &[PayloadFieldSchema]) -> bool {
395+
fn needs_index(&self, key: &JsonPath, required_indexes: &[FieldIndexType]) -> bool {
430396
match self.payload_schema.get(key) {
431397
Some(index_info) => {
432-
if self.needs_match_index && !index_info.supports_match() {
433-
return true;
434-
}
435-
436-
if self.needs_range_index && !index_info.supports_range() {
437-
return true;
438-
}
439-
440-
let index_info_kind = index_info.kind();
441-
let already_indexed = inferred
398+
// check if the index present has the right capabilities
399+
let index_field_types = schema_capacities(index_info);
400+
let already_indexed = required_indexes
442401
.iter()
443-
.map(PayloadFieldSchema::kind)
444-
.any(|inferred| inferred == index_info_kind);
402+
.any(|required| index_field_types.contains(required));
445403

446404
!already_indexed
447405
}
@@ -451,7 +409,7 @@ impl<'a> Extractor<'a> {
451409

452410
pub fn update_from_expression(&mut self, expression: &ExpressionInternal) {
453411
let key;
454-
let inferred;
412+
let required_index;
455413

456414
match expression {
457415
ExpressionInternal::Constant(_) => return,
@@ -466,10 +424,7 @@ impl<'a> Extractor<'a> {
466424
VariableId::Score(_) => return,
467425
VariableId::Payload(json_path) => {
468426
key = json_path;
469-
inferred = vec![
470-
PayloadFieldSchema::FieldType(PayloadSchemaType::Integer),
471-
PayloadFieldSchema::FieldType(PayloadSchemaType::Float),
472-
];
427+
required_index = vec![FieldIndexType::IntMatch, FieldIndexType::FloatRange];
473428
}
474429
VariableId::Condition(_) => return,
475430
}
@@ -480,12 +435,12 @@ impl<'a> Extractor<'a> {
480435
}
481436
ExpressionInternal::GeoDistance { origin: _, to } => {
482437
key = to.clone();
483-
inferred = vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Geo)];
438+
required_index = vec![FieldIndexType::Geo];
484439
}
485440
ExpressionInternal::Datetime(_) => return,
486441
ExpressionInternal::DatetimeKey(variable) => {
487442
key = variable.clone();
488-
inferred = vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Datetime)];
443+
required_index = vec![FieldIndexType::DatetimeRange];
489444
}
490445
ExpressionInternal::Mult(expression_internals) => {
491446
for expr in expression_internals {
@@ -552,11 +507,100 @@ impl<'a> Extractor<'a> {
552507
}
553508
}
554509

555-
if self.needs_index(&key, &inferred) {
510+
if self.needs_index(&key, &required_index) {
511+
let schemas = required_index.into_iter().map(PayloadFieldSchema::from);
556512
self.unindexed_schema
557513
.entry(key)
558514
.or_default()
559-
.extend(inferred);
515+
.extend(schemas);
516+
}
517+
}
518+
}
519+
520+
/// All types of internal indexes
521+
#[derive(Debug, Eq, PartialEq, EnumIter, Hash)]
522+
enum FieldIndexType {
523+
IntMatch,
524+
IntRange,
525+
KeywordMatch,
526+
FloatRange,
527+
Text,
528+
BoolMatch,
529+
UuidMatch,
530+
UuidRange,
531+
DatetimeRange,
532+
Geo,
533+
}
534+
535+
fn schema_capacities(value: &PayloadFieldSchema) -> HashSet<FieldIndexType> {
536+
let mut index_types = HashSet::new();
537+
match value {
538+
PayloadFieldSchema::FieldType(payload_schema_type) => match payload_schema_type {
539+
PayloadSchemaType::Keyword => index_types.insert(FieldIndexType::KeywordMatch),
540+
PayloadSchemaType::Integer => index_types.insert(FieldIndexType::IntMatch),
541+
PayloadSchemaType::Uuid => index_types.insert(FieldIndexType::UuidMatch),
542+
PayloadSchemaType::Bool => index_types.insert(FieldIndexType::BoolMatch),
543+
PayloadSchemaType::Float => index_types.insert(FieldIndexType::FloatRange),
544+
PayloadSchemaType::Geo => index_types.insert(FieldIndexType::Geo),
545+
PayloadSchemaType::Text => index_types.insert(FieldIndexType::Text),
546+
PayloadSchemaType::Datetime => index_types.insert(FieldIndexType::DatetimeRange),
547+
},
548+
PayloadFieldSchema::FieldParams(payload_schema_params) => match payload_schema_params {
549+
PayloadSchemaParams::Keyword(_) => index_types.insert(FieldIndexType::KeywordMatch),
550+
PayloadSchemaParams::Integer(integer_index_params) => {
551+
if integer_index_params.lookup == Some(true) {
552+
index_types.insert(FieldIndexType::IntMatch);
553+
}
554+
if integer_index_params.range == Some(true) {
555+
index_types.insert(FieldIndexType::IntRange);
556+
}
557+
debug_assert!(
558+
!index_types.is_empty(),
559+
"lookup or range must be true for Integer payload index"
560+
);
561+
// unifying match arm types
562+
true
563+
}
564+
PayloadSchemaParams::Uuid(_) => index_types.insert(FieldIndexType::UuidMatch),
565+
PayloadSchemaParams::Bool(_) => index_types.insert(FieldIndexType::BoolMatch),
566+
PayloadSchemaParams::Float(_) => index_types.insert(FieldIndexType::FloatRange),
567+
PayloadSchemaParams::Geo(_) => index_types.insert(FieldIndexType::Geo),
568+
PayloadSchemaParams::Text(_) => index_types.insert(FieldIndexType::Text),
569+
PayloadSchemaParams::Datetime(_) => index_types.insert(FieldIndexType::DatetimeRange),
570+
},
571+
};
572+
index_types
573+
}
574+
575+
impl From<FieldIndexType> for PayloadFieldSchema {
576+
fn from(val: FieldIndexType) -> Self {
577+
match val {
578+
FieldIndexType::IntMatch => {
579+
let params = IntegerIndexParams {
580+
lookup: Some(true),
581+
..Default::default()
582+
};
583+
PayloadFieldSchema::FieldParams(PayloadSchemaParams::Integer(params))
584+
}
585+
FieldIndexType::IntRange => {
586+
let params = IntegerIndexParams {
587+
range: Some(true),
588+
..Default::default()
589+
};
590+
PayloadFieldSchema::FieldParams(PayloadSchemaParams::Integer(params))
591+
}
592+
FieldIndexType::KeywordMatch => {
593+
PayloadFieldSchema::FieldType(PayloadSchemaType::Keyword)
594+
}
595+
FieldIndexType::FloatRange => PayloadFieldSchema::FieldType(PayloadSchemaType::Float),
596+
FieldIndexType::Text => PayloadFieldSchema::FieldType(PayloadSchemaType::Text),
597+
FieldIndexType::BoolMatch => PayloadFieldSchema::FieldType(PayloadSchemaType::Bool),
598+
FieldIndexType::UuidMatch => PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid),
599+
FieldIndexType::UuidRange => PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid),
600+
FieldIndexType::DatetimeRange => {
601+
PayloadFieldSchema::FieldType(PayloadSchemaType::Datetime)
602+
}
603+
FieldIndexType::Geo => PayloadFieldSchema::FieldType(PayloadSchemaType::Geo),
560604
}
561605
}
562606
}

0 commit comments

Comments
 (0)