From c23b19edeb4cfe32eabaa9801088e0e3c5fd6b86 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 1 Oct 2025 04:46:03 +0000 Subject: [PATCH 01/33] Initial wiring Signed-off-by: Allen Samuels --- src/index_schema.cc | 215 +++++++++++++++++++++++++++---------- src/index_schema.h | 2 + src/index_schema.proto | 2 + src/indexes/index_base.h | 3 +- src/indexes/numeric.cc | 9 ++ src/indexes/numeric.h | 2 + src/indexes/tag.cc | 8 ++ src/indexes/tag.h | 2 + src/indexes/vector_base.cc | 10 ++ src/indexes/vector_base.h | 3 + src/rdb_section.proto | 20 +++- testing/common.h | 4 + 12 files changed, 218 insertions(+), 62 deletions(-) diff --git a/src/index_schema.cc b/src/index_schema.cc index 74f4ec166..eb653c135 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -59,6 +59,19 @@ namespace valkey_search { LogLevel GetLogSeverity(bool ok) { return ok ? DEBUG : WARNING; } +static auto config_rdb_write_v2 = + vmsdk::config::BooleanBuilder("rdb_write_v2", false).Hidden().Build(); +static auto config_rdb_read_v2 = + vmsdk::config::BooleanBuilder("rdb_read_v2", false).Hidden().Build(); + +static bool RDBReadV2() { + return dynamic_cast(*config_rdb_read_v2).GetValue(); +} + +static bool RDBWriteV2() { + return dynamic_cast(*config_rdb_write_v2).GetValue(); +} + IndexSchema::BackfillJob::BackfillJob(ValkeyModuleCtx *ctx, absl::string_view name, int db_num) : cursor(vmsdk::MakeUniqueValkeyScanCursor()) { @@ -821,10 +834,34 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { rdb_section->set_allocated_index_schema_contents( index_schema_proto.release()); - /* Each attribute has one index content and vector indices also have one - * key-to-id mapping */ - size_t supplemental_count = - GetAttributeCount() + + /* + The legacy V1 format. + + Each non-vector attribute has one SUPPLEMENTAL_CONTENT_INDEX_CONTENT segment. + That segment consists of a header and no additional content. + + Vector attributes have two segments. + One SUPPLEMENTAL_CONTENT_INDEX_CONTENT which has a header and the saved vector index data. + One SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP segment + + The new V2 format is a superset of the V1 format. + + Each non-vector attribute has two segments. + one SUPPLEMENTAL_CONTENT_INDEX_CONTENT segment which is a header only. (Same as V1) + one SUPPLEMENTAL_CONTENT_INDEX_EXTENSION segment which has saved index data + + Vector attributes are same as V1. + + V2 also has a segment with SUPPLEMENTAL_CONTENT_MUTATION_QUEUE which is the + header and the contents of the mutation queue. + + */ + size_t supplemental_count = + RDBWriteV2() + // For V2, we write two sections for each attribute plus the mutation queue + ? 2 * GetAttributeCount() + 1 + // For V1, we write one section for each attribute plus an additional section for each vector attribute + : GetAttributeCount() + std::count_if(attributes_.begin(), attributes_.end(), [](const auto &attribute) { return IsVectorIndex(attribute.second.GetIndex()); @@ -852,9 +889,8 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { << "IO error while saving supplemental content for index content for " "index name: " << this->name_ << " attribute: " << attribute.first << " to RDB"; - RDBChunkOutputStream index_chunked_out(rdb); VMSDK_RETURN_IF_ERROR( - attribute.second.GetIndex()->SaveIndex(std::move(index_chunked_out))) + attribute.second.GetIndex()->SaveIndex(RDBChunkOutputStream(rdb))) << "IO error while saving Index contents (index name: " << this->name_ << ", attribute: " << attribute.first << ") to RDB"; @@ -871,15 +907,65 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { << "IO error while saving supplemental content for key to ID mapping " "for index name: " << this->name_ << " attribute: " << attribute.first << " to RDB"; - RDBChunkOutputStream key_to_id_chunked_out(rdb); VMSDK_RETURN_IF_ERROR( dynamic_cast( attribute.second.GetIndex().get()) - ->SaveTrackedKeys(std::move(key_to_id_chunked_out))) + ->SaveTrackedKeys(RDBChunkOutputStream(rdb))) << "IO error while saving Key to ID mapping (index name: " << this->name_ << ", attribute: " << attribute.first << ") to RDB"; + } else if (RDBWriteV2()) { + // + // V2 add index content extension + // + auto index_content_extension = + std::make_unique(); + index_content_extension->set_type( + data_model::SUPPLEMENTAL_CONTENT_INDEX_EXTENSION); + index_content_extension->mutable_index_content_header()->set_allocated_attribute( + attribute.second.ToProto().release()); + auto index_content_extension_str = index_content_extension->SerializeAsString(); + VMSDK_RETURN_IF_ERROR(rdb->SaveStringBuffer(index_content_extension_str)) + << "IO error while saving supplemental content for index content extension for " + "index name: " + << this->name_ << " attribute: " << attribute.first << " to RDB"; + VMSDK_RETURN_IF_ERROR( + attribute.second.GetIndex()->SaveIndexExtension(RDBChunkOutputStream(rdb))) + << "IO error while saving Index content extension (index name: " << this->name_ + << ", attribute: " << attribute.first << ") to RDB"; } } + + if (RDBWriteV2()) { + // + // Save the Mutation Queue + // + auto mutation_queue = std::make_unique(); + mutation_queue->set_type(data_model::SUPPLEMENTAL_CONTENT_MUTATION_QUEUE); + auto mutation_queue_str = mutation_queue->SerializeAsString(); + VMSDK_RETURN_IF_ERROR(rdb->SaveStringBuffer(mutation_queue_str)) + << "IO error while saving mutation queue for index name:" << this->name_; + VMSDK_RETURN_IF_ERROR(this->SaveMutationQueue(RDBChunkOutputStream(rdb))) + << "IO error while saving mutation queue for index name: " << this->name_; + } + + return absl::OkStatus(); +} + +absl::Status IndexSchema::SaveMutationQueue(RDBChunkOutputStream chunked_out) const { + return absl::OkStatus(); +} + +absl::Status IndexSchema::LoadMutationQueue(SupplementalContentChunkIter iter) { + return absl::OkStatus(); +} + +// We need to iterate over the chunks to consume them +static absl::Status SkipSupplementalContent(SupplementalContentIter &supplemental_iter) { + auto chunk_it = supplemental_iter.IterateChunks(); + while (chunk_it.HasNext()) { + VMSDK_ASSIGN_OR_RETURN([[maybe_unused]] auto chunk_result, + chunk_it.Next()); + } return absl::OkStatus(); } @@ -902,58 +988,69 @@ absl::StatusOr> IndexSchema::LoadFromRDB( // Supplemental content will include indices and any content for them while (supplemental_iter.HasNext()) { VMSDK_ASSIGN_OR_RETURN(auto supplemental_content, supplemental_iter.Next()); - if (ABSL_PREDICT_TRUE(!skip_loading_index_data) && - supplemental_content->type() == - data_model::SupplementalContentType:: - SUPPLEMENTAL_CONTENT_INDEX_CONTENT) { - auto &attribute = - supplemental_content->index_content_header().attribute(); - VMSDK_ASSIGN_OR_RETURN(std::shared_ptr index, - IndexFactory(ctx, index_schema.get(), attribute, - supplemental_iter.IterateChunks())); - VMSDK_RETURN_IF_ERROR(index_schema->AddIndex( - attribute.alias(), attribute.identifier(), index)); - } else if (ABSL_PREDICT_TRUE(!skip_loading_index_data) && - supplemental_content->type() == - data_model::SupplementalContentType:: - SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP) { - auto &attribute = - supplemental_content->key_to_id_map_header().attribute(); - VMSDK_ASSIGN_OR_RETURN(auto index, - index_schema->GetIndex(attribute.alias()), - _ << "Key to ID mapping for " << attribute.alias() - << " found before index definition."); - if (!IsVectorIndex(index)) { - return absl::InternalError( - absl::StrFormat("Key to ID mapping found for non vector index " - "(index: %s, attribute: %s)", - index_schema->GetName(), attribute.alias())); - } - auto vector_index = dynamic_cast(index.get()); - VMSDK_RETURN_IF_ERROR(vector_index->LoadTrackedKeys( - ctx, &index_schema->GetAttributeDataType(), - supplemental_iter.IterateChunks())); + if (skip_loading_index_data) { + VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); } else { - if (ABSL_PREDICT_FALSE(skip_loading_index_data) && - (supplemental_content->type() == - data_model::SupplementalContentType:: - SUPPLEMENTAL_CONTENT_INDEX_CONTENT || - supplemental_content->type() == - data_model::SupplementalContentType:: - SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP)) { - VMSDK_LOG(NOTICE, ctx) << "Skipping supplemental content type: " - << data_model::SupplementalContentType_Name( - supplemental_content->type()); - } else { - VMSDK_LOG(NOTICE, ctx) << "Unknown supplemental content type: " - << data_model::SupplementalContentType_Name( - supplemental_content->type()); - } - // We need to iterate over the chunks to consume them - [[maybe_unused]] auto chunk_it = supplemental_iter.IterateChunks(); - while (chunk_it.HasNext()) { - VMSDK_ASSIGN_OR_RETURN([[maybe_unused]] auto chunk_result, - chunk_it.Next()); + switch (supplemental_content->type()) { + case data_model::SupplementalContentType::SUPPLEMENTAL_CONTENT_INDEX_CONTENT: { + auto &attribute = + supplemental_content->index_content_header().attribute(); + VMSDK_ASSIGN_OR_RETURN(std::shared_ptr index, + IndexFactory(ctx, index_schema.get(), attribute, + supplemental_iter.IterateChunks())); + VMSDK_RETURN_IF_ERROR(index_schema->AddIndex(attribute.alias(), attribute.identifier(), index)); + break; + } + case data_model::SupplementalContentType::SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP: { + auto &attribute = + supplemental_content->key_to_id_map_header().attribute(); + VMSDK_ASSIGN_OR_RETURN(auto index, + index_schema->GetIndex(attribute.alias()), + _ << "Key to ID mapping for " << attribute.alias() + << " found before index definition."); + if (!IsVectorIndex(index)) { + return absl::InternalError( + absl::StrFormat("Key to ID mapping found for non vector index " + "(index: %s, attribute: %s)", + index_schema->GetName(), attribute.alias())); + } + auto vector_index = dynamic_cast(index.get()); + VMSDK_RETURN_IF_ERROR(vector_index->LoadTrackedKeys( + ctx, &index_schema->GetAttributeDataType(), + supplemental_iter.IterateChunks())); + break; + } + case data_model::SupplementalContentType::SUPPLEMENTAL_CONTENT_INDEX_EXTENSION: { + if (!RDBReadV2()) { + VMSDK_LOG(NOTICE, ctx) << "Skipping supplemental index extension for attribute " + << supplemental_content->index_content_header().attribute(); + VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); + } else { + auto &attribute = supplemental_content->index_content_header().attribute(); + VMSDK_ASSIGN_OR_RETURN(auto index, + index_schema->GetIndex(attribute.alias()), + _ << "Index extension for " << attribute.alias() + << " found before index definition."); + VMSDK_RETURN_IF_ERROR(index->LoadIndexExtension(supplemental_iter.IterateChunks())); + } + } + case data_model::SupplementalContentType::SUPPLEMENTAL_CONTENT_MUTATION_QUEUE: { + if (!RDBReadV2()) { + VMSDK_LOG(NOTICE, ctx) << "Skipping supplemental mutation queue"; + VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); + } else { + if (index_schema) { + VMSDK_RETURN_IF_ERROR(index_schema->LoadMutationQueue(supplemental_iter.IterateChunks())); + } else { + return absl::InternalError("Supplemental section mutation queue out of order"); + } + } + } + default: + VMSDK_LOG(NOTICE, ctx) << "Unknown supplemental content type: " + << supplemental_content->type(); + VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); + break; } } } diff --git a/src/index_schema.h b/src/index_schema.h index d5fbd0319..14c7a9887 100644 --- a/src/index_schema.h +++ b/src/index_schema.h @@ -134,6 +134,8 @@ class IndexSchema : public KeyspaceEventSubscription, int GetAttributeCount() const { return attributes_.size(); } virtual absl::Status RDBSave(SafeRDB *rdb) const; + absl::Status SaveMutationQueue(RDBChunkOutputStream chunked_out) const; + absl::Status LoadMutationQueue(SupplementalContentChunkIter iter); static absl::StatusOr> LoadFromRDB( ValkeyModuleCtx *ctx, vmsdk::ThreadPool *mutations_thread_pool, diff --git a/src/index_schema.proto b/src/index_schema.proto index 7b03db821..bb5be7deb 100644 --- a/src/index_schema.proto +++ b/src/index_schema.proto @@ -29,6 +29,8 @@ message IndexSchema { repeated Attribute attributes = 6; Stats stats = 7; optional uint32 db_num = 8; + // V2 save/restore will have this field populated + optional bool backfill_completed = 9; } enum AttributeIndexType { diff --git a/src/indexes/index_base.h b/src/indexes/index_base.h index 6db739c3a..d6f2ef4ef 100644 --- a/src/indexes/index_base.h +++ b/src/indexes/index_base.h @@ -56,7 +56,8 @@ class IndexBase { virtual bool IsTracked(const InternedStringPtr& key) const = 0; IndexerType GetIndexerType() const { return indexer_type_; } virtual absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const = 0; - + virtual absl::Status SaveIndexExtension(RDBChunkOutputStream chunked_out) const = 0; + virtual absl::Status LoadIndexExtension(SupplementalContentChunkIter chunked_out) = 0; virtual std::unique_ptr ToProto() const = 0; virtual void ForEachTrackedKey( absl::AnyInvocable fn) const {} diff --git a/src/indexes/numeric.cc b/src/indexes/numeric.cc index 450a8105c..ba29f0129 100644 --- a/src/indexes/numeric.cc +++ b/src/indexes/numeric.cc @@ -259,4 +259,13 @@ uint64_t Numeric::GetRecordCount() const { return tracked_keys_.size(); } +absl::Status Numeric::SaveIndexExtension(RDBChunkOutputStream output) const { + return absl::OkStatus(); +} + +absl::Status Numeric::LoadIndexExtension(SupplementalContentChunkIter chunked_out) { + return absl::OkStatus(); +} + + } // namespace valkey_search::indexes diff --git a/src/indexes/numeric.h b/src/indexes/numeric.h index b83984a11..cbbb8ff6d 100644 --- a/src/indexes/numeric.h +++ b/src/indexes/numeric.h @@ -97,6 +97,8 @@ class Numeric : public IndexBase { absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { return absl::OkStatus(); } + absl::Status SaveIndexExtension(RDBChunkOutputStream chunked_out) const override; + absl::Status LoadIndexExtension(SupplementalContentChunkIter chunked_out) override; inline void ForEachTrackedKey( absl::AnyInvocable fn) const override { absl::MutexLock lock(&index_mutex_); diff --git a/src/indexes/tag.cc b/src/indexes/tag.cc index af0865237..5d7b43e8f 100644 --- a/src/indexes/tag.cc +++ b/src/indexes/tag.cc @@ -345,4 +345,12 @@ uint64_t Tag::GetRecordCount() const { return tracked_tags_by_keys_.size(); } +absl::Status Tag::SaveIndexExtension(RDBChunkOutputStream output) const { + return absl::OkStatus(); +} + +absl::Status Tag::LoadIndexExtension(SupplementalContentChunkIter chunked_out) { + return absl::OkStatus(); +} + } // namespace valkey_search::indexes diff --git a/src/indexes/tag.h b/src/indexes/tag.h index 74da9730b..266ff432c 100644 --- a/src/indexes/tag.h +++ b/src/indexes/tag.h @@ -47,6 +47,8 @@ class Tag : public IndexBase { absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { return absl::OkStatus(); } + absl::Status SaveIndexExtension(RDBChunkOutputStream chunked_out) const override; + absl::Status LoadIndexExtension(SupplementalContentChunkIter chunked_out) override; inline void ForEachTrackedKey( absl::AnyInvocable fn) const override { diff --git a/src/indexes/vector_base.cc b/src/indexes/vector_base.cc index b71785523..27f768bd5 100644 --- a/src/indexes/vector_base.cc +++ b/src/indexes/vector_base.cc @@ -391,6 +391,16 @@ absl::Status VectorBase::SaveIndex(RDBChunkOutputStream chunked_out) const { return absl::OkStatus(); } +absl::Status VectorBase::SaveIndexExtension(RDBChunkOutputStream chunked_out) const { + CHECK(false); +} + +absl::Status VectorBase::LoadIndexExtension(SupplementalContentChunkIter chunked_out) { + CHECK(false); +} + + + absl::Status VectorBase::SaveTrackedKeys( RDBChunkOutputStream chunked_out) const { absl::ReaderMutexLock lock(&key_to_metadata_mutex_); diff --git a/src/indexes/vector_base.h b/src/indexes/vector_base.h index c66b77cd9..505201350 100644 --- a/src/indexes/vector_base.h +++ b/src/indexes/vector_base.h @@ -125,6 +125,9 @@ class VectorBase : public IndexBase, public hnswlib::VectorTracker { bool GetNormalize() const { return normalize_; } std::unique_ptr ToProto() const override; absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override; + absl::Status SaveIndexExtension(RDBChunkOutputStream chunked_out) const override; + absl::Status LoadIndexExtension(SupplementalContentChunkIter chunked_out) override; + absl::Status SaveTrackedKeys(RDBChunkOutputStream chunked_out) const ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); absl::Status LoadTrackedKeys(ValkeyModuleCtx* ctx, diff --git a/src/rdb_section.proto b/src/rdb_section.proto index ceca67fc2..598be6d4b 100644 --- a/src/rdb_section.proto +++ b/src/rdb_section.proto @@ -38,8 +38,18 @@ message RDBSection { enum SupplementalContentType { SUPPLEMENTAL_CONTENT_UNSPECIFIED = 0; - SUPPLEMENTAL_CONTENT_INDEX_CONTENT = 1; - SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP = 2; + + // For Vector Indexes, this is the contents of the index. For Non-vector indexes this is empty (just a header) + SUPPLEMENTAL_CONTENT_INDEX_CONTENT = 1; // Header: IndexContentHeader + + // For HNSW Indexes, this contains HNSWLib internal ID to key mapping + SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP = 2; // Header: KeyToIDMappingHeader + + // V2. Only for non-vector indexes. Contents of the index + SUPPLEMENTAL_CONTENT_INDEX_EXTENSION = 3; // V2 extension for index contents, Header: IndexContentHeader + + // V2. Saved Mutation Queue + SUPPLEMENTAL_CONTENT_MUTATION_QUEUE = 4; // V2 saved mutation queue, Header: MutationQueueHeader } message IndexContentHeader { @@ -50,11 +60,17 @@ message KeyToIDMappingHeader { Attribute attribute = 1; } +// V2 Saved Mutation Queue +message MutationQueueHeader { + +} + message SupplementalContentHeader { SupplementalContentType type = 1; oneof header { IndexContentHeader index_content_header = 2; KeyToIDMappingHeader key_to_id_map_header = 3; + MutationQueueHeader mutation_queue_header = 4; // V2 }; } diff --git a/testing/common.h b/testing/common.h index 3b9ca012c..3fdb3741e 100644 --- a/testing/common.h +++ b/testing/common.h @@ -99,6 +99,10 @@ class MockIndex : public indexes::IndexBase { MOCK_METHOD(int, RespondWithInfo, (ValkeyModuleCtx * ctx), (const, override)); MOCK_METHOD(absl::Status, SaveIndex, (RDBChunkOutputStream chunked_out), (const, override)); + MOCK_METHOD(absl::Status, SaveIndexExtension, (RDBChunkOutputStream chunked_out), + (const, override)); + MOCK_METHOD(absl::Status, LoadIndexExtension, (SupplementalContentChunkIter chunked_out), + (override)); MOCK_METHOD((void), ForEachTrackedKey, (absl::AnyInvocable fn), (const, override)); From ab61e93f7a760324c6aa91d6852d47396cdc09c8 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 1 Oct 2025 04:57:39 +0000 Subject: [PATCH 02/33] formating Signed-off-by: Allen Samuels --- src/index_schema.cc | 149 +++++++++++++++++++++---------------- src/indexes/index_base.h | 6 +- src/indexes/numeric.cc | 4 +- src/indexes/numeric.h | 6 +- src/indexes/tag.h | 6 +- src/indexes/vector_base.cc | 8 +- src/indexes/vector_base.h | 6 +- testing/common.h | 8 +- 8 files changed, 111 insertions(+), 82 deletions(-) diff --git a/src/index_schema.cc b/src/index_schema.cc index eb653c135..44143aaad 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -61,15 +61,16 @@ LogLevel GetLogSeverity(bool ok) { return ok ? DEBUG : WARNING; } static auto config_rdb_write_v2 = vmsdk::config::BooleanBuilder("rdb_write_v2", false).Hidden().Build(); -static auto config_rdb_read_v2 = +static auto config_rdb_read_v2 = vmsdk::config::BooleanBuilder("rdb_read_v2", false).Hidden().Build(); static bool RDBReadV2() { - return dynamic_cast(*config_rdb_read_v2).GetValue(); + return dynamic_cast(*config_rdb_read_v2).GetValue(); } static bool RDBWriteV2() { - return dynamic_cast(*config_rdb_write_v2).GetValue(); + return dynamic_cast(*config_rdb_write_v2) + .GetValue(); } IndexSchema::BackfillJob::BackfillJob(ValkeyModuleCtx *ctx, @@ -836,33 +837,37 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { /* The legacy V1 format. - - Each non-vector attribute has one SUPPLEMENTAL_CONTENT_INDEX_CONTENT segment. - That segment consists of a header and no additional content. - + + Each non-vector attribute has one SUPPLEMENTAL_CONTENT_INDEX_CONTENT + segment. That segment consists of a header and no additional content. + Vector attributes have two segments. - One SUPPLEMENTAL_CONTENT_INDEX_CONTENT which has a header and the saved vector index data. - One SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP segment + One SUPPLEMENTAL_CONTENT_INDEX_CONTENT which has a header and the saved + vector index data. One SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP segment The new V2 format is a superset of the V1 format. Each non-vector attribute has two segments. - one SUPPLEMENTAL_CONTENT_INDEX_CONTENT segment which is a header only. (Same as V1) - one SUPPLEMENTAL_CONTENT_INDEX_EXTENSION segment which has saved index data - + one SUPPLEMENTAL_CONTENT_INDEX_CONTENT segment which is a header only. + (Same as V1) one SUPPLEMENTAL_CONTENT_INDEX_EXTENSION segment which has + saved index data + Vector attributes are same as V1. V2 also has a segment with SUPPLEMENTAL_CONTENT_MUTATION_QUEUE which is the header and the contents of the mutation queue. - + */ - size_t supplemental_count = - RDBWriteV2() - // For V2, we write two sections for each attribute plus the mutation queue - ? 2 * GetAttributeCount() + 1 - // For V1, we write one section for each attribute plus an additional section for each vector attribute - : GetAttributeCount() + - std::count_if(attributes_.begin(), attributes_.end(), + size_t supplemental_count = + RDBWriteV2() + // For V2, we write two sections for each attribute plus the mutation + // queue + ? 2 * GetAttributeCount() + 1 + // For V1, we write one section for each attribute plus an additional + // section for each vector attribute + : GetAttributeCount() + + std::count_if( + attributes_.begin(), attributes_.end(), [](const auto &attribute) { return IsVectorIndex(attribute.second.GetIndex()); }); @@ -907,10 +912,9 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { << "IO error while saving supplemental content for key to ID mapping " "for index name: " << this->name_ << " attribute: " << attribute.first << " to RDB"; - VMSDK_RETURN_IF_ERROR( - dynamic_cast( - attribute.second.GetIndex().get()) - ->SaveTrackedKeys(RDBChunkOutputStream(rdb))) + VMSDK_RETURN_IF_ERROR(dynamic_cast( + attribute.second.GetIndex().get()) + ->SaveTrackedKeys(RDBChunkOutputStream(rdb))) << "IO error while saving Key to ID mapping (index name: " << this->name_ << ", attribute: " << attribute.first << ") to RDB"; } else if (RDBWriteV2()) { @@ -921,17 +925,19 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { std::make_unique(); index_content_extension->set_type( data_model::SUPPLEMENTAL_CONTENT_INDEX_EXTENSION); - index_content_extension->mutable_index_content_header()->set_allocated_attribute( - attribute.second.ToProto().release()); - auto index_content_extension_str = index_content_extension->SerializeAsString(); + index_content_extension->mutable_index_content_header() + ->set_allocated_attribute(attribute.second.ToProto().release()); + auto index_content_extension_str = + index_content_extension->SerializeAsString(); VMSDK_RETURN_IF_ERROR(rdb->SaveStringBuffer(index_content_extension_str)) - << "IO error while saving supplemental content for index content extension for " - "index name: " + << "IO error while saving supplemental content for index content " + "extension for " + "index name: " << this->name_ << " attribute: " << attribute.first << " to RDB"; - VMSDK_RETURN_IF_ERROR( - attribute.second.GetIndex()->SaveIndexExtension(RDBChunkOutputStream(rdb))) - << "IO error while saving Index content extension (index name: " << this->name_ - << ", attribute: " << attribute.first << ") to RDB"; + VMSDK_RETURN_IF_ERROR(attribute.second.GetIndex()->SaveIndexExtension( + RDBChunkOutputStream(rdb))) + << "IO error while saving Index content extension (index name: " + << this->name_ << ", attribute: " << attribute.first << ") to RDB"; } } @@ -939,20 +945,24 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { // // Save the Mutation Queue // - auto mutation_queue = std::make_unique(); + auto mutation_queue = + std::make_unique(); mutation_queue->set_type(data_model::SUPPLEMENTAL_CONTENT_MUTATION_QUEUE); auto mutation_queue_str = mutation_queue->SerializeAsString(); VMSDK_RETURN_IF_ERROR(rdb->SaveStringBuffer(mutation_queue_str)) - << "IO error while saving mutation queue for index name:" << this->name_; + << "IO error while saving mutation queue for index name:" + << this->name_; VMSDK_RETURN_IF_ERROR(this->SaveMutationQueue(RDBChunkOutputStream(rdb))) - << "IO error while saving mutation queue for index name: " << this->name_; + << "IO error while saving mutation queue for index name: " + << this->name_; } return absl::OkStatus(); } -absl::Status IndexSchema::SaveMutationQueue(RDBChunkOutputStream chunked_out) const { - return absl::OkStatus(); +absl::Status IndexSchema::SaveMutationQueue( + RDBChunkOutputStream chunked_out) const { + return absl::OkStatus(); } absl::Status IndexSchema::LoadMutationQueue(SupplementalContentChunkIter iter) { @@ -960,11 +970,11 @@ absl::Status IndexSchema::LoadMutationQueue(SupplementalContentChunkIter iter) { } // We need to iterate over the chunks to consume them -static absl::Status SkipSupplementalContent(SupplementalContentIter &supplemental_iter) { +static absl::Status SkipSupplementalContent( + SupplementalContentIter &supplemental_iter) { auto chunk_it = supplemental_iter.IterateChunks(); while (chunk_it.HasNext()) { - VMSDK_ASSIGN_OR_RETURN([[maybe_unused]] auto chunk_result, - chunk_it.Next()); + VMSDK_ASSIGN_OR_RETURN([[maybe_unused]] auto chunk_result, chunk_it.Next()); } return absl::OkStatus(); } @@ -992,22 +1002,26 @@ absl::StatusOr> IndexSchema::LoadFromRDB( VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); } else { switch (supplemental_content->type()) { - case data_model::SupplementalContentType::SUPPLEMENTAL_CONTENT_INDEX_CONTENT: { + case data_model::SupplementalContentType:: + SUPPLEMENTAL_CONTENT_INDEX_CONTENT: { auto &attribute = supplemental_content->index_content_header().attribute(); - VMSDK_ASSIGN_OR_RETURN(std::shared_ptr index, - IndexFactory(ctx, index_schema.get(), attribute, - supplemental_iter.IterateChunks())); - VMSDK_RETURN_IF_ERROR(index_schema->AddIndex(attribute.alias(), attribute.identifier(), index)); + VMSDK_ASSIGN_OR_RETURN( + std::shared_ptr index, + IndexFactory(ctx, index_schema.get(), attribute, + supplemental_iter.IterateChunks())); + VMSDK_RETURN_IF_ERROR(index_schema->AddIndex( + attribute.alias(), attribute.identifier(), index)); break; } - case data_model::SupplementalContentType::SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP: { + case data_model::SupplementalContentType:: + SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP: { auto &attribute = supplemental_content->key_to_id_map_header().attribute(); - VMSDK_ASSIGN_OR_RETURN(auto index, - index_schema->GetIndex(attribute.alias()), - _ << "Key to ID mapping for " << attribute.alias() - << " found before index definition."); + VMSDK_ASSIGN_OR_RETURN( + auto index, index_schema->GetIndex(attribute.alias()), + _ << "Key to ID mapping for " << attribute.alias() + << " found before index definition."); if (!IsVectorIndex(index)) { return absl::InternalError( absl::StrFormat("Key to ID mapping found for non vector index " @@ -1020,35 +1034,42 @@ absl::StatusOr> IndexSchema::LoadFromRDB( supplemental_iter.IterateChunks())); break; } - case data_model::SupplementalContentType::SUPPLEMENTAL_CONTENT_INDEX_EXTENSION: { + case data_model::SupplementalContentType:: + SUPPLEMENTAL_CONTENT_INDEX_EXTENSION: { if (!RDBReadV2()) { - VMSDK_LOG(NOTICE, ctx) << "Skipping supplemental index extension for attribute " - << supplemental_content->index_content_header().attribute(); + VMSDK_LOG(NOTICE, ctx) + << "Skipping supplemental index extension for attribute " + << supplemental_content->index_content_header().attribute(); VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); } else { - auto &attribute = supplemental_content->index_content_header().attribute(); - VMSDK_ASSIGN_OR_RETURN(auto index, - index_schema->GetIndex(attribute.alias()), - _ << "Index extension for " << attribute.alias() - << " found before index definition."); - VMSDK_RETURN_IF_ERROR(index->LoadIndexExtension(supplemental_iter.IterateChunks())); + auto &attribute = + supplemental_content->index_content_header().attribute(); + VMSDK_ASSIGN_OR_RETURN( + auto index, index_schema->GetIndex(attribute.alias()), + _ << "Index extension for " << attribute.alias() + << " found before index definition."); + VMSDK_RETURN_IF_ERROR( + index->LoadIndexExtension(supplemental_iter.IterateChunks())); } } - case data_model::SupplementalContentType::SUPPLEMENTAL_CONTENT_MUTATION_QUEUE: { + case data_model::SupplementalContentType:: + SUPPLEMENTAL_CONTENT_MUTATION_QUEUE: { if (!RDBReadV2()) { VMSDK_LOG(NOTICE, ctx) << "Skipping supplemental mutation queue"; VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); } else { if (index_schema) { - VMSDK_RETURN_IF_ERROR(index_schema->LoadMutationQueue(supplemental_iter.IterateChunks())); + VMSDK_RETURN_IF_ERROR(index_schema->LoadMutationQueue( + supplemental_iter.IterateChunks())); } else { - return absl::InternalError("Supplemental section mutation queue out of order"); + return absl::InternalError( + "Supplemental section mutation queue out of order"); } } } default: VMSDK_LOG(NOTICE, ctx) << "Unknown supplemental content type: " - << supplemental_content->type(); + << supplemental_content->type(); VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); break; } diff --git a/src/indexes/index_base.h b/src/indexes/index_base.h index d6f2ef4ef..9c5a62882 100644 --- a/src/indexes/index_base.h +++ b/src/indexes/index_base.h @@ -56,8 +56,10 @@ class IndexBase { virtual bool IsTracked(const InternedStringPtr& key) const = 0; IndexerType GetIndexerType() const { return indexer_type_; } virtual absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const = 0; - virtual absl::Status SaveIndexExtension(RDBChunkOutputStream chunked_out) const = 0; - virtual absl::Status LoadIndexExtension(SupplementalContentChunkIter chunked_out) = 0; + virtual absl::Status SaveIndexExtension( + RDBChunkOutputStream chunked_out) const = 0; + virtual absl::Status LoadIndexExtension( + SupplementalContentChunkIter chunked_out) = 0; virtual std::unique_ptr ToProto() const = 0; virtual void ForEachTrackedKey( absl::AnyInvocable fn) const {} diff --git a/src/indexes/numeric.cc b/src/indexes/numeric.cc index ba29f0129..521439963 100644 --- a/src/indexes/numeric.cc +++ b/src/indexes/numeric.cc @@ -263,9 +263,9 @@ absl::Status Numeric::SaveIndexExtension(RDBChunkOutputStream output) const { return absl::OkStatus(); } -absl::Status Numeric::LoadIndexExtension(SupplementalContentChunkIter chunked_out) { +absl::Status Numeric::LoadIndexExtension( + SupplementalContentChunkIter chunked_out) { return absl::OkStatus(); } - } // namespace valkey_search::indexes diff --git a/src/indexes/numeric.h b/src/indexes/numeric.h index cbbb8ff6d..eb4d34c36 100644 --- a/src/indexes/numeric.h +++ b/src/indexes/numeric.h @@ -97,8 +97,10 @@ class Numeric : public IndexBase { absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { return absl::OkStatus(); } - absl::Status SaveIndexExtension(RDBChunkOutputStream chunked_out) const override; - absl::Status LoadIndexExtension(SupplementalContentChunkIter chunked_out) override; + absl::Status SaveIndexExtension( + RDBChunkOutputStream chunked_out) const override; + absl::Status LoadIndexExtension( + SupplementalContentChunkIter chunked_out) override; inline void ForEachTrackedKey( absl::AnyInvocable fn) const override { absl::MutexLock lock(&index_mutex_); diff --git a/src/indexes/tag.h b/src/indexes/tag.h index 266ff432c..fe1526ea8 100644 --- a/src/indexes/tag.h +++ b/src/indexes/tag.h @@ -47,8 +47,10 @@ class Tag : public IndexBase { absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { return absl::OkStatus(); } - absl::Status SaveIndexExtension(RDBChunkOutputStream chunked_out) const override; - absl::Status LoadIndexExtension(SupplementalContentChunkIter chunked_out) override; + absl::Status SaveIndexExtension( + RDBChunkOutputStream chunked_out) const override; + absl::Status LoadIndexExtension( + SupplementalContentChunkIter chunked_out) override; inline void ForEachTrackedKey( absl::AnyInvocable fn) const override { diff --git a/src/indexes/vector_base.cc b/src/indexes/vector_base.cc index 27f768bd5..9af4eccb0 100644 --- a/src/indexes/vector_base.cc +++ b/src/indexes/vector_base.cc @@ -391,16 +391,16 @@ absl::Status VectorBase::SaveIndex(RDBChunkOutputStream chunked_out) const { return absl::OkStatus(); } -absl::Status VectorBase::SaveIndexExtension(RDBChunkOutputStream chunked_out) const { +absl::Status VectorBase::SaveIndexExtension( + RDBChunkOutputStream chunked_out) const { CHECK(false); } -absl::Status VectorBase::LoadIndexExtension(SupplementalContentChunkIter chunked_out) { +absl::Status VectorBase::LoadIndexExtension( + SupplementalContentChunkIter chunked_out) { CHECK(false); } - - absl::Status VectorBase::SaveTrackedKeys( RDBChunkOutputStream chunked_out) const { absl::ReaderMutexLock lock(&key_to_metadata_mutex_); diff --git a/src/indexes/vector_base.h b/src/indexes/vector_base.h index 505201350..64e3d5963 100644 --- a/src/indexes/vector_base.h +++ b/src/indexes/vector_base.h @@ -125,8 +125,10 @@ class VectorBase : public IndexBase, public hnswlib::VectorTracker { bool GetNormalize() const { return normalize_; } std::unique_ptr ToProto() const override; absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override; - absl::Status SaveIndexExtension(RDBChunkOutputStream chunked_out) const override; - absl::Status LoadIndexExtension(SupplementalContentChunkIter chunked_out) override; + absl::Status SaveIndexExtension( + RDBChunkOutputStream chunked_out) const override; + absl::Status LoadIndexExtension( + SupplementalContentChunkIter chunked_out) override; absl::Status SaveTrackedKeys(RDBChunkOutputStream chunked_out) const ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); diff --git a/testing/common.h b/testing/common.h index 3fdb3741e..022facdf9 100644 --- a/testing/common.h +++ b/testing/common.h @@ -99,10 +99,10 @@ class MockIndex : public indexes::IndexBase { MOCK_METHOD(int, RespondWithInfo, (ValkeyModuleCtx * ctx), (const, override)); MOCK_METHOD(absl::Status, SaveIndex, (RDBChunkOutputStream chunked_out), (const, override)); - MOCK_METHOD(absl::Status, SaveIndexExtension, (RDBChunkOutputStream chunked_out), - (const, override)); - MOCK_METHOD(absl::Status, LoadIndexExtension, (SupplementalContentChunkIter chunked_out), - (override)); + MOCK_METHOD(absl::Status, SaveIndexExtension, + (RDBChunkOutputStream chunked_out), (const, override)); + MOCK_METHOD(absl::Status, LoadIndexExtension, + (SupplementalContentChunkIter chunked_out), (override)); MOCK_METHOD((void), ForEachTrackedKey, (absl::AnyInvocable fn), (const, override)); From 01162045becc3da76e705da6569b5d5d0d4657e2 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Fri, 3 Oct 2025 03:08:36 +0000 Subject: [PATCH 03/33] Code complete Signed-off-by: Allen Samuels --- src/index_schema.cc | 176 ++++++++++++++++++------------ src/index_schema.h | 6 +- src/indexes/index_base.h | 3 +- src/indexes/numeric.cc | 70 +++++++++++- src/indexes/numeric.h | 6 +- src/indexes/tag.cc | 66 ++++++++++- src/indexes/tag.h | 3 +- src/indexes/vector_base.cc | 3 +- src/indexes/vector_base.h | 3 +- src/rdb_section.proto | 2 +- src/rdb_serialization.h | 29 +++++ testing/common.h | 27 +++-- testing/rdb_serialization_test.cc | 72 ++++++------ 13 files changed, 323 insertions(+), 143 deletions(-) diff --git a/src/index_schema.cc b/src/index_schema.cc index 44143aaad..0bd2644d2 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -46,6 +46,7 @@ #include "src/valkey_search_options.h" #include "src/vector_externalizer.h" #include "vmsdk/src/blocked_client.h" +#include "vmsdk/src/info.h" #include "vmsdk/src/log.h" #include "vmsdk/src/managed_pointers.h" #include "vmsdk/src/status/status_macros.h" @@ -59,6 +60,9 @@ namespace valkey_search { LogLevel GetLogSeverity(bool ok) { return ok ? DEBUG : WARNING; } +// +// Controls and stats for V2 RDB file +// static auto config_rdb_write_v2 = vmsdk::config::BooleanBuilder("rdb_write_v2", false).Hidden().Build(); static auto config_rdb_read_v2 = @@ -73,6 +77,22 @@ static bool RDBWriteV2() { .GetValue(); } +static vmsdk::info_field::Integer rdb_load_sections( + "rdb_stats", "rdb_load_sections", + vmsdk::info_field::IntegerBuilder().Dev()); + +static vmsdk::info_field::Integer rdb_load_mutation_entries( + "rdb_stats", "rdb_load_mutation_entries", + vmsdk::info_field::IntegerBuilder().Dev()); + +static vmsdk::info_field::Integer rdb_save_sections( + "rdb_stats", "rdb_save_sections", + vmsdk::info_field::IntegerBuilder().Dev()); + +static vmsdk::info_field::Integer rdb_save_mutation_entries( + "rdb_stats", "rdb_save_mutation_entries", + vmsdk::info_field::IntegerBuilder().Dev()); + IndexSchema::BackfillJob::BackfillJob(ValkeyModuleCtx *ctx, absl::string_view name, int db_num) : cursor(vmsdk::MakeUniqueValkeyScanCursor()) { @@ -828,6 +848,19 @@ std::unique_ptr IndexSchema::ToProto() const { return index_schema_proto; } +static absl::Status SaveSupplementalSection( + SafeRDB *rdb, data_model::SupplementalContentType type, + std::function init, + absl::AnyInvocable write_section) { + rdb_save_sections.Increment(); + auto header = std::make_unique(); + header->set_type(type); + init(*header); + auto header_str = header->SerializeAsString(); + VMSDK_RETURN_IF_ERROR(rdb->SaveStringBuffer(header_str)); + return write_section(RDBChunkOutputStream(rdb)); +} + absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { auto index_schema_proto = ToProto(); auto rdb_section = std::make_unique(); @@ -883,61 +916,39 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { // serialized index schema proto above. We store here again to avoid any // dependencies on the ordering of multiple attributes. // We could remove the duplication in the future. - auto index_content_supp = - std::make_unique(); - index_content_supp->set_type( - data_model::SUPPLEMENTAL_CONTENT_INDEX_CONTENT); - index_content_supp->mutable_index_content_header()->set_allocated_attribute( - attribute.second.ToProto().release()); - auto index_content_supp_str = index_content_supp->SerializeAsString(); - VMSDK_RETURN_IF_ERROR(rdb->SaveStringBuffer(index_content_supp_str)) - << "IO error while saving supplemental content for index content for " - "index name: " - << this->name_ << " attribute: " << attribute.first << " to RDB"; - VMSDK_RETURN_IF_ERROR( - attribute.second.GetIndex()->SaveIndex(RDBChunkOutputStream(rdb))) - << "IO error while saving Index contents (index name: " << this->name_ - << ", attribute: " << attribute.first << ") to RDB"; + VMSDK_RETURN_IF_ERROR(SaveSupplementalSection( + rdb, data_model::SUPPLEMENTAL_CONTENT_INDEX_CONTENT, + [&](auto &header) { + header.mutable_index_content_header()->set_allocated_attribute( + attribute.second.ToProto().release()); + }, + std::bind_front(&indexes::IndexBase::SaveIndex, + attribute.second.GetIndex()))); // Key to ID mapping is stored as a separate chunked supplemental content // for vector indexes. if (IsVectorIndex(attribute.second.GetIndex())) { - auto key_to_id_supp = - std::make_unique(); - key_to_id_supp->set_type(data_model::SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP); - key_to_id_supp->mutable_key_to_id_map_header()->set_allocated_attribute( - attribute.second.ToProto().release()); - auto key_to_id_supp_str = key_to_id_supp->SerializeAsString(); - VMSDK_RETURN_IF_ERROR(rdb->SaveStringBuffer(key_to_id_supp_str)) - << "IO error while saving supplemental content for key to ID mapping " - "for index name: " - << this->name_ << " attribute: " << attribute.first << " to RDB"; - VMSDK_RETURN_IF_ERROR(dynamic_cast( - attribute.second.GetIndex().get()) - ->SaveTrackedKeys(RDBChunkOutputStream(rdb))) - << "IO error while saving Key to ID mapping (index name: " - << this->name_ << ", attribute: " << attribute.first << ") to RDB"; + VMSDK_RETURN_IF_ERROR(SaveSupplementalSection( + rdb, data_model::SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP, + [&](auto &header) { + header.mutable_key_to_id_map_header()->set_allocated_attribute( + attribute.second.ToProto().release()); + }, + std::bind_front(&indexes::VectorBase::SaveTrackedKeys, + dynamic_cast( + attribute.second.GetIndex().get())))); } else if (RDBWriteV2()) { // // V2 add index content extension // - auto index_content_extension = - std::make_unique(); - index_content_extension->set_type( - data_model::SUPPLEMENTAL_CONTENT_INDEX_EXTENSION); - index_content_extension->mutable_index_content_header() - ->set_allocated_attribute(attribute.second.ToProto().release()); - auto index_content_extension_str = - index_content_extension->SerializeAsString(); - VMSDK_RETURN_IF_ERROR(rdb->SaveStringBuffer(index_content_extension_str)) - << "IO error while saving supplemental content for index content " - "extension for " - "index name: " - << this->name_ << " attribute: " << attribute.first << " to RDB"; - VMSDK_RETURN_IF_ERROR(attribute.second.GetIndex()->SaveIndexExtension( - RDBChunkOutputStream(rdb))) - << "IO error while saving Index content extension (index name: " - << this->name_ << ", attribute: " << attribute.first << ") to RDB"; + VMSDK_RETURN_IF_ERROR(SaveSupplementalSection( + rdb, data_model::SUPPLEMENTAL_CONTENT_INDEX_EXTENSION, + [&](auto &header) { + header.mutable_index_content_header()->set_allocated_attribute( + attribute.second.ToProto().release()); + }, + std::bind_front(&indexes::IndexBase::SaveIndexExtension, + attribute.second.GetIndex()))); } } @@ -945,27 +956,37 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { // // Save the Mutation Queue // - auto mutation_queue = - std::make_unique(); - mutation_queue->set_type(data_model::SUPPLEMENTAL_CONTENT_MUTATION_QUEUE); - auto mutation_queue_str = mutation_queue->SerializeAsString(); - VMSDK_RETURN_IF_ERROR(rdb->SaveStringBuffer(mutation_queue_str)) - << "IO error while saving mutation queue for index name:" - << this->name_; - VMSDK_RETURN_IF_ERROR(this->SaveMutationQueue(RDBChunkOutputStream(rdb))) - << "IO error while saving mutation queue for index name: " - << this->name_; + VMSDK_RETURN_IF_ERROR(SaveSupplementalSection( + rdb, data_model::SUPPLEMENTAL_CONTENT_MUTATION_QUEUE, + [&](auto &header) { + header.mutable_mutation_queue_header()->set_backfilling( + backfill_job_.Get().has_value()); + }, + std::bind_front(&IndexSchema::SaveMutationQueue, this))); } return absl::OkStatus(); } -absl::Status IndexSchema::SaveMutationQueue( - RDBChunkOutputStream chunked_out) const { +absl::Status IndexSchema::SaveMutationQueue(RDBChunkOutputStream out) const { + VMSDK_RETURN_IF_ERROR(out.SaveObject(tracked_mutated_records_.size())); + for (const auto &[key, value] : tracked_mutated_records_) { + VMSDK_RETURN_IF_ERROR(out.SaveString(key->Str())); + rdb_save_mutation_entries.Increment(); + } return absl::OkStatus(); } -absl::Status IndexSchema::LoadMutationQueue(SupplementalContentChunkIter iter) { +absl::Status IndexSchema::LoadMutationQueue(ValkeyModuleCtx *ctx, + RDBChunkInputStream input) { + VMSDK_ASSIGN_OR_RETURN(size_t count, input.LoadObject()); + for (size_t i = 0; i < count; ++i) { + VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); + auto keyname = vmsdk::MakeUniqueValkeyString(keyname_str); + ProcessKeyspaceNotification(ctx, keyname.get(), false); + rdb_load_mutation_entries.Increment(); + } + loaded_v2_ = true; return absl::OkStatus(); } @@ -998,6 +1019,7 @@ absl::StatusOr> IndexSchema::LoadFromRDB( // Supplemental content will include indices and any content for them while (supplemental_iter.HasNext()) { VMSDK_ASSIGN_OR_RETURN(auto supplemental_content, supplemental_iter.Next()); + rdb_load_sections.Increment(); if (skip_loading_index_data) { VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); } else { @@ -1020,13 +1042,10 @@ absl::StatusOr> IndexSchema::LoadFromRDB( supplemental_content->key_to_id_map_header().attribute(); VMSDK_ASSIGN_OR_RETURN( auto index, index_schema->GetIndex(attribute.alias()), - _ << "Key to ID mapping for " << attribute.alias() - << " found before index definition."); + _ << "Key to ID mapping found before index definition."); if (!IsVectorIndex(index)) { return absl::InternalError( - absl::StrFormat("Key to ID mapping found for non vector index " - "(index: %s, attribute: %s)", - index_schema->GetName(), attribute.alias())); + "Key to ID mapping found for non vector index "); } auto vector_index = dynamic_cast(index.get()); VMSDK_RETURN_IF_ERROR(vector_index->LoadTrackedKeys( @@ -1038,18 +1057,16 @@ absl::StatusOr> IndexSchema::LoadFromRDB( SUPPLEMENTAL_CONTENT_INDEX_EXTENSION: { if (!RDBReadV2()) { VMSDK_LOG(NOTICE, ctx) - << "Skipping supplemental index extension for attribute " - << supplemental_content->index_content_header().attribute(); + << "Skipping supplemental index extension for attribute "; VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); } else { auto &attribute = supplemental_content->index_content_header().attribute(); VMSDK_ASSIGN_OR_RETURN( auto index, index_schema->GetIndex(attribute.alias()), - _ << "Index extension for " << attribute.alias() - << " found before index definition."); - VMSDK_RETURN_IF_ERROR( - index->LoadIndexExtension(supplemental_iter.IterateChunks())); + _ << "Index extension found before index definition."); + VMSDK_RETURN_IF_ERROR(index->LoadIndexExtension( + RDBChunkInputStream(supplemental_iter.IterateChunks()))); } } case data_model::SupplementalContentType:: @@ -1060,7 +1077,13 @@ absl::StatusOr> IndexSchema::LoadFromRDB( } else { if (index_schema) { VMSDK_RETURN_IF_ERROR(index_schema->LoadMutationQueue( - supplemental_iter.IterateChunks())); + ctx, RDBChunkInputStream(supplemental_iter.IterateChunks()))); + bool backfilling = + supplemental_content->mutation_queue_header().backfilling(); + if (!backfilling) { + VMSDK_LOG(DEBUG, ctx) << "Backfill suppressed."; + index_schema->backfill_job_.Get() = std::nullopt; + } } else { return absl::InternalError( "Supplemental section mutation queue out of order"); @@ -1101,6 +1124,15 @@ void IndexSchema::OnSwapDB(ValkeyModuleSwapDbInfo *swap_db_info) { } void IndexSchema::OnLoadingEnded(ValkeyModuleCtx *ctx) { + if (loaded_v2_) { + VMSDK_LOG(NOTICE, ctx) << "New format RDB load completed, " + << " Mutation Queue contains " + << tracked_mutated_records_.size() << " entries." + << (backfill_job_.Get().has_value() + ? " Backfill still required." + : " Backfill not needed."); + return; + } // Clean up any potentially stale index entries that can arise from pending // record deletions being lost during RDB save. vmsdk::StopWatch stop_watch; diff --git a/src/index_schema.h b/src/index_schema.h index 14c7a9887..d0a2e1330 100644 --- a/src/index_schema.h +++ b/src/index_schema.h @@ -134,8 +134,9 @@ class IndexSchema : public KeyspaceEventSubscription, int GetAttributeCount() const { return attributes_.size(); } virtual absl::Status RDBSave(SafeRDB *rdb) const; - absl::Status SaveMutationQueue(RDBChunkOutputStream chunked_out) const; - absl::Status LoadMutationQueue(SupplementalContentChunkIter iter); + absl::Status SaveMutationQueue(RDBChunkOutputStream output) const; + absl::Status LoadMutationQueue(ValkeyModuleCtx *ctx, + RDBChunkInputStream input); static absl::StatusOr> LoadFromRDB( ValkeyModuleCtx *ctx, vmsdk::ThreadPool *mutations_thread_pool, @@ -192,6 +193,7 @@ class IndexSchema : public KeyspaceEventSubscription, std::unique_ptr attribute_data_type_; std::string name_; uint32_t db_num_{0}; + bool loaded_v2_{false}; vmsdk::ThreadPool *mutations_thread_pool_{nullptr}; InternedStringMap tracked_mutated_records_ diff --git a/src/indexes/index_base.h b/src/indexes/index_base.h index 9c5a62882..9368eb3be 100644 --- a/src/indexes/index_base.h +++ b/src/indexes/index_base.h @@ -58,8 +58,7 @@ class IndexBase { virtual absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const = 0; virtual absl::Status SaveIndexExtension( RDBChunkOutputStream chunked_out) const = 0; - virtual absl::Status LoadIndexExtension( - SupplementalContentChunkIter chunked_out) = 0; + virtual absl::Status LoadIndexExtension(RDBChunkInputStream chunked_in) = 0; virtual std::unique_ptr ToProto() const = 0; virtual void ForEachTrackedKey( absl::AnyInvocable fn) const {} diff --git a/src/indexes/numeric.cc b/src/indexes/numeric.cc index 521439963..04fe67e9e 100644 --- a/src/indexes/numeric.cc +++ b/src/indexes/numeric.cc @@ -45,7 +45,11 @@ Numeric::Numeric(const data_model::NumericIndex& numeric_index_proto) absl::StatusOr Numeric::AddRecord(const InternedStringPtr& key, absl::string_view data) { - auto value = ParseNumber(data); + return AddRecordInternal(key, ParseNumber(data)); +} + +absl::StatusOr Numeric::AddRecordInternal(const InternedStringPtr& key, + std::optional value) { absl::MutexLock lock(&index_mutex_); if (!value.has_value()) { untracked_keys_.insert(key); @@ -260,11 +264,71 @@ uint64_t Numeric::GetRecordCount() const { } absl::Status Numeric::SaveIndexExtension(RDBChunkOutputStream output) const { + size_t untracked_key_count = untracked_keys_.size(); + size_t tracked_key_count = tracked_keys_.size(); + VMSDK_RETURN_IF_ERROR(output.SaveObject(untracked_key_count)); + VMSDK_RETURN_IF_ERROR(output.SaveObject(tracked_key_count)); + + for (const auto& key_ptr : untracked_keys_) { + VMSDK_RETURN_IF_ERROR(output.SaveString(key_ptr->Str())); + // + // Ensure no duplicate keys (which would cause a load error) + // + if (tracked_keys_.contains(key_ptr)) { + DCHECK(false); + return absl::InternalError("Numeric field save duplicate key"); + } + } + + for (const auto& [key_ptr, value] : tracked_keys_) { + VMSDK_RETURN_IF_ERROR(output.SaveString(key_ptr->Str())); + VMSDK_RETURN_IF_ERROR(output.SaveObject(value)); + } + return absl::OkStatus(); } -absl::Status Numeric::LoadIndexExtension( - SupplementalContentChunkIter chunked_out) { +absl::Status Numeric::LoadIndexExtension(RDBChunkInputStream input) { + VMSDK_ASSIGN_OR_RETURN(size_t untracked_key_count, input.LoadObject(), + _ << "RDB: Numeric load error on untracked_key_count"); + VMSDK_ASSIGN_OR_RETURN(size_t tracked_key_count, input.LoadObject(), + _ << "RDB: Numeric load error on tracked_key_count"); + + for (size_t i = 0; i < untracked_key_count; ++i) { + VMSDK_ASSIGN_OR_RETURN( + auto key, input.LoadString(), + _ << "RDB: Numeric load key error on key number " << i); + auto key_ptr = StringInternStore::Instance().Intern(key); + VMSDK_ASSIGN_OR_RETURN( + auto tracked, AddRecordInternal(key_ptr, std::nullopt), + _ << "RDB: Numeric load error on insert for key number " << i); + if (tracked) { + return absl::InternalError(absl::StrCat( + "RDB: Numeric field reload error on untracked Key number: ", i)); + } + } + + for (size_t i = 0; i < tracked_key_count; ++i) { + VMSDK_ASSIGN_OR_RETURN( + auto key, input.LoadString(), + _ << "RDB: Numeric load error on tracked key number " << i); + auto key_ptr = StringInternStore::Instance().Intern(key); + VMSDK_ASSIGN_OR_RETURN( + auto value, input.LoadObject(), + _ << "RDB: Numeric load error on value for key number " << i); + VMSDK_ASSIGN_OR_RETURN( + auto tracked, AddRecordInternal(key_ptr, value), + _ << "RDB: Numeric load error on insert for key number " << i); + if (!tracked) { + return absl::InternalError(absl::StrCat( + "RDB: Numeric field reload error on tracked key number ", i)); + } + } + + if (!input.AtEnd()) { + return absl::InternalError("RDB: Numeric Extra data records found"); + } + return absl::OkStatus(); } diff --git a/src/indexes/numeric.h b/src/indexes/numeric.h index eb4d34c36..83bb11e07 100644 --- a/src/indexes/numeric.h +++ b/src/indexes/numeric.h @@ -99,8 +99,7 @@ class Numeric : public IndexBase { } absl::Status SaveIndexExtension( RDBChunkOutputStream chunked_out) const override; - absl::Status LoadIndexExtension( - SupplementalContentChunkIter chunked_out) override; + absl::Status LoadIndexExtension(RDBChunkInputStream chunked_in) override; inline void ForEachTrackedKey( absl::AnyInvocable fn) const override { absl::MutexLock lock(&index_mutex_); @@ -169,6 +168,9 @@ class Numeric : public IndexBase { bool negate) const ABSL_NO_THREAD_SAFETY_ANALYSIS; private: + absl::StatusOr AddRecordInternal(const InternedStringPtr& key, + std::optional data); + mutable absl::Mutex index_mutex_; InternedStringMap tracked_keys_ ABSL_GUARDED_BY(index_mutex_); // untracked keys is needed to support negate filtering diff --git a/src/indexes/tag.cc b/src/indexes/tag.cc index 5d7b43e8f..34d44ac31 100644 --- a/src/indexes/tag.cc +++ b/src/indexes/tag.cc @@ -82,7 +82,7 @@ absl::StatusOr> Tag::ParseSearchTags( if (tag.back() == '*') { if (!IsValidPrefix(tag)) { return absl::InvalidArgumentError( - absl::StrCat("Tag string `", tag, "` ends with multiple *.")); + absl::StrCat("RDB: Tag string `", tag, "` ends with multiple *.")); } // Prefix tags that are shorter than min length are ignored. if (tag.length() <= kDefaultMinPrefixLength) { @@ -346,10 +346,72 @@ uint64_t Tag::GetRecordCount() const { } absl::Status Tag::SaveIndexExtension(RDBChunkOutputStream output) const { + size_t untracked_key_count = untracked_keys_.size(); + size_t tracked_key_count = tracked_tags_by_keys_.size(); + VMSDK_RETURN_IF_ERROR(output.SaveObject(untracked_key_count)); + VMSDK_RETURN_IF_ERROR(output.SaveObject(tracked_key_count)); + + for (const auto& key_ptr : untracked_keys_) { + VMSDK_RETURN_IF_ERROR(output.SaveString(key_ptr->Str())); + // + // Ensure no duplicate keys (which would cause a load error) + // + if (tracked_tags_by_keys_.contains(key_ptr)) { + DCHECK(false); + return absl::InternalError("RDB: Tag field save duplicate key"); + } + } + + for (const auto& [key_ptr, value] : tracked_tags_by_keys_) { + VMSDK_RETURN_IF_ERROR(output.SaveString(key_ptr->Str())); + VMSDK_RETURN_IF_ERROR(output.SaveString(value.raw_tag_string->Str())); + } + return absl::OkStatus(); } -absl::Status Tag::LoadIndexExtension(SupplementalContentChunkIter chunked_out) { +absl::Status Tag::LoadIndexExtension(RDBChunkInputStream input) { + VMSDK_ASSIGN_OR_RETURN(size_t untracked_key_count, input.LoadObject(), + _ << "RDB: Tag load failure on untracked_key_count"); + VMSDK_ASSIGN_OR_RETURN(size_t tracked_key_count, input.LoadObject(), + _ << "RDB: Tag load failure on tracked_key_count"); + + for (size_t i = 0; i < untracked_key_count; ++i) { + VMSDK_ASSIGN_OR_RETURN( + auto key, input.LoadString(), + _ << "RDB: Tag Load failure on untracked key number " << i); + auto key_ptr = StringInternStore::Instance().Intern(key); + VMSDK_ASSIGN_OR_RETURN( + auto tracked, AddRecord(key_ptr, ""), + _ << "RDB: Tag load failure on untracked key number " << i); + if (tracked) { + return absl::InternalError(absl::StrCat( + "RDB: Tag field reload error on untracked Key number ", i)); + } + } + + for (size_t i = 0; i < tracked_key_count; ++i) { + VMSDK_ASSIGN_OR_RETURN( + auto key, input.LoadString(), + _ << "RDB: Tag load failure on tracked key number " << i); + VMSDK_ASSIGN_OR_RETURN( + auto value, input.LoadString(), + _ << "RDB: Tag load failure on tracked key/value number " << i); + auto key_ptr = StringInternStore::Instance().Intern(key); + VMSDK_ASSIGN_OR_RETURN( + auto tracked, AddRecord(key_ptr, value), + _ << "RDB: Tag load AddrRecord failure on tracked key/value number " + << i); + if (!tracked) { + return absl::InternalError(absl::StrCat( + "RDB: Tag field reload error on tracked Key number ", i)); + } + } + + if (!input.AtEnd()) { + return absl::InternalError("RDB: Tag extra data records"); + } + return absl::OkStatus(); } diff --git a/src/indexes/tag.h b/src/indexes/tag.h index fe1526ea8..45a17d5fd 100644 --- a/src/indexes/tag.h +++ b/src/indexes/tag.h @@ -49,8 +49,7 @@ class Tag : public IndexBase { } absl::Status SaveIndexExtension( RDBChunkOutputStream chunked_out) const override; - absl::Status LoadIndexExtension( - SupplementalContentChunkIter chunked_out) override; + absl::Status LoadIndexExtension(RDBChunkInputStream chunked_in) override; inline void ForEachTrackedKey( absl::AnyInvocable fn) const override { diff --git a/src/indexes/vector_base.cc b/src/indexes/vector_base.cc index 9af4eccb0..2714f9aa8 100644 --- a/src/indexes/vector_base.cc +++ b/src/indexes/vector_base.cc @@ -396,8 +396,7 @@ absl::Status VectorBase::SaveIndexExtension( CHECK(false); } -absl::Status VectorBase::LoadIndexExtension( - SupplementalContentChunkIter chunked_out) { +absl::Status VectorBase::LoadIndexExtension(RDBChunkInputStream chunked_in) { CHECK(false); } diff --git a/src/indexes/vector_base.h b/src/indexes/vector_base.h index 64e3d5963..fc2b0086c 100644 --- a/src/indexes/vector_base.h +++ b/src/indexes/vector_base.h @@ -127,8 +127,7 @@ class VectorBase : public IndexBase, public hnswlib::VectorTracker { absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override; absl::Status SaveIndexExtension( RDBChunkOutputStream chunked_out) const override; - absl::Status LoadIndexExtension( - SupplementalContentChunkIter chunked_out) override; + absl::Status LoadIndexExtension(RDBChunkInputStream chunked_in) override; absl::Status SaveTrackedKeys(RDBChunkOutputStream chunked_out) const ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); diff --git a/src/rdb_section.proto b/src/rdb_section.proto index 598be6d4b..51ea732fd 100644 --- a/src/rdb_section.proto +++ b/src/rdb_section.proto @@ -62,7 +62,7 @@ message KeyToIDMappingHeader { // V2 Saved Mutation Queue message MutationQueueHeader { - + bool backfilling = 1; } message SupplementalContentHeader { diff --git a/src/rdb_serialization.h b/src/rdb_serialization.h index 624436cd1..a19c0b251 100644 --- a/src/rdb_serialization.h +++ b/src/rdb_serialization.h @@ -10,6 +10,7 @@ #include #include +#include #include "absl/log/check.h" #include "absl/status/status.h" @@ -20,6 +21,7 @@ #include "third_party/hnswlib/iostream.h" #include "vmsdk/src/log.h" #include "vmsdk/src/managed_pointers.h" +#include "vmsdk/src/status/status_macros.h" #include "vmsdk/src/valkey_module_api/valkey_module.h" namespace valkey_search { @@ -306,6 +308,24 @@ class RDBChunkInputStream : public hnswlib::InputStream { RDBChunkInputStream &operator=(RDBChunkInputStream &other) = delete; absl::StatusOr> LoadChunk() override; + absl::StatusOr LoadString() { + VMSDK_ASSIGN_OR_RETURN(auto str, LoadChunk()); + return *str.release(); + } + + template ::value && + std::is_standard_layout::value, + bool> = true> + absl::StatusOr LoadObject() { + VMSDK_ASSIGN_OR_RETURN(auto buffer, LoadChunk()); + if (buffer->size() != sizeof(T)) { + return absl::InternalError("Mismatched size protocol error"); + } + return *reinterpret_cast(buffer->data()); + } + + bool AtEnd() const { return !iter_.HasNext(); } + private: SupplementalContentChunkIter iter_; }; @@ -336,6 +356,15 @@ class RDBChunkOutputStream : public hnswlib::OutputStream { } } absl::Status SaveChunk(const char *data, size_t len) override; + absl::Status SaveString(const std::string_view s) { + return SaveChunk(s.data(), s.size()); + } + template ::value && + std::is_standard_layout::value, + bool> = true> + absl::Status SaveObject(const T &object) { + return this->SaveChunk(reinterpret_cast(&object), sizeof(T)); + } absl::Status Close(); private: diff --git a/testing/common.h b/testing/common.h index 022facdf9..bad578c4b 100644 --- a/testing/common.h +++ b/testing/common.h @@ -102,7 +102,7 @@ class MockIndex : public indexes::IndexBase { MOCK_METHOD(absl::Status, SaveIndexExtension, (RDBChunkOutputStream chunked_out), (const, override)); MOCK_METHOD(absl::Status, LoadIndexExtension, - (SupplementalContentChunkIter chunked_out), (override)); + (RDBChunkInputStream chunked_in), (override)); MOCK_METHOD((void), ForEachTrackedKey, (absl::AnyInvocable fn), (const, override)); @@ -241,21 +241,20 @@ class MockIndexSchema : public IndexSchema { : IndexSchema(ctx, index_schema_proto, std::move(attribute_data_type), mutations_thread_pool, reload) { ON_CALL(*this, OnLoadingEnded(testing::_)) - .WillByDefault(testing::Invoke([this](ValkeyModuleCtx* ctx) { + .WillByDefault([this](ValkeyModuleCtx* ctx) { return IndexSchema::OnLoadingEnded(ctx); - })); + }); ON_CALL(*this, OnSwapDB(testing::_)) - .WillByDefault( - testing::Invoke([this](ValkeyModuleSwapDbInfo* swap_db_info) { - return IndexSchema::OnSwapDB(swap_db_info); - })); - ON_CALL(*this, RDBSave(testing::_)) - .WillByDefault(testing::Invoke( - [this](SafeRDB* rdb) { return IndexSchema::RDBSave(rdb); })); + .WillByDefault([this](ValkeyModuleSwapDbInfo* swap_db_info) { + return IndexSchema::OnSwapDB(swap_db_info); + }); + ON_CALL(*this, RDBSave(testing::_)).WillByDefault([this](SafeRDB* rdb) { + return IndexSchema::RDBSave(rdb); + }); ON_CALL(*this, GetIdentifier(testing::_)) - .WillByDefault(testing::Invoke([](absl::string_view attribute_name) { + .WillByDefault([](absl::string_view attribute_name) { return std::string(attribute_name); - })); + }); } MOCK_METHOD(void, OnLoadingEnded, (ValkeyModuleCtx * ctx), (override)); MOCK_METHOD(void, OnSwapDB, (ValkeyModuleSwapDbInfo * swap_db_info), @@ -327,10 +326,10 @@ class MockThreadPool : public vmsdk::ThreadPool { MockThreadPool(const std::string& name, size_t num_threads) : vmsdk::ThreadPool(name, num_threads) { ON_CALL(*this, Schedule(testing::_, testing::_)) - .WillByDefault(testing::Invoke( + .WillByDefault( [this](absl::AnyInvocable task, Priority priority) { return ThreadPool::Schedule(std::move(task), priority); - })); + }); } MOCK_METHOD(bool, Schedule, (absl::AnyInvocable task, Priority priority), (override)); diff --git a/testing/rdb_serialization_test.cc b/testing/rdb_serialization_test.cc index 1770d2636..1e571bf73 100644 --- a/testing/rdb_serialization_test.cc +++ b/testing/rdb_serialization_test.cc @@ -298,16 +298,14 @@ TEST_F(RDBSerializationTest, RegisterModuleTypeHappyPath) { *kMockValkeyModule, CreateDataType(&fake_ctx_, testing::StrEq(kValkeySearchModuleTypeName), kCurrentEncVer, testing::_)) - .WillOnce(testing::Invoke( - [](ValkeyModuleCtx* ctx, const char* name, int encver, - ValkeyModuleTypeMethods* type_methods) -> ValkeyModuleType* { - EXPECT_EQ(type_methods->aux_load, AuxLoadCallback); - EXPECT_EQ(type_methods->aux_save2, AuxSaveCallback); - EXPECT_EQ(type_methods->aux_save, nullptr); - EXPECT_EQ(type_methods->aux_save_triggers, - VALKEYMODULE_AUX_AFTER_RDB); - return (ValkeyModuleType*)0xBAADF00D; - })); + .WillOnce([](ValkeyModuleCtx* ctx, const char* name, int encver, + ValkeyModuleTypeMethods* type_methods) -> ValkeyModuleType* { + EXPECT_EQ(type_methods->aux_load, AuxLoadCallback); + EXPECT_EQ(type_methods->aux_save2, AuxSaveCallback); + EXPECT_EQ(type_methods->aux_save, nullptr); + EXPECT_EQ(type_methods->aux_save_triggers, VALKEYMODULE_AUX_AFTER_RDB); + return (ValkeyModuleType*)0xBAADF00D; + }); VMSDK_EXPECT_OK(RegisterModuleType(&fake_ctx_)); } @@ -316,11 +314,10 @@ TEST_F(RDBSerializationTest, RegisterModuleTypeReturnNullptr) { *kMockValkeyModule, CreateDataType(&fake_ctx_, testing::StrEq(kValkeySearchModuleTypeName), kCurrentEncVer, testing::_)) - .WillOnce(testing::Invoke( - [](ValkeyModuleCtx* ctx, const char* name, int encver, - ValkeyModuleTypeMethods* type_methods) -> ValkeyModuleType* { - return nullptr; - })); + .WillOnce([](ValkeyModuleCtx* ctx, const char* name, int encver, + ValkeyModuleTypeMethods* type_methods) -> ValkeyModuleType* { + return nullptr; + }); EXPECT_EQ(RegisterModuleType(&fake_ctx_).code(), absl::StatusCode::kInternal); } @@ -394,13 +391,13 @@ TEST_F(RDBSerializationTest, PerformRDBSaveTwoRDBSection) { .Times(0); EXPECT_CALL(*test_cb.mock_callbacks, save(testing::_, testing::_, testing::_)) - .WillOnce(testing::Invoke( + .WillOnce( [i](ValkeyModuleCtx* ctx, SafeRDB* rdb, int when) -> absl::Status { EXPECT_EQ(when, VALKEYMODULE_AUX_BEFORE_RDB); std::string save_value = absl::StrCat("test-string-", i); VMSDK_EXPECT_OK(rdb->SaveStringBuffer(save_value)); return absl::OkStatus(); - })); + }); EXPECT_CALL(*test_cb.mock_callbacks, section_count(&fake_ctx_, VALKEYMODULE_AUX_BEFORE_RDB)) .WillOnce(testing::Return(1)); @@ -442,10 +439,10 @@ TEST_F(RDBSerializationTest, PerformRDBSaveSectionSaveFail) { EXPECT_CALL(*test_cb.mock_callbacks, load(testing::_, testing::_, testing::_)) .Times(0); EXPECT_CALL(*test_cb.mock_callbacks, save(testing::_, testing::_, testing::_)) - .WillOnce(testing::Invoke( + .WillOnce( [](ValkeyModuleCtx* ctx, SafeRDB* rdb, int when) -> absl::Status { return absl::InternalError("test error"); - })); + }); EXPECT_CALL(*test_cb.mock_callbacks, section_count(&fake_ctx_, VALKEYMODULE_AUX_BEFORE_RDB)) .WillOnce(testing::Return(1)); @@ -471,13 +468,13 @@ TEST_F(RDBSerializationTest, PerformRDBSaveTwoRDBSectionOneEmpty) { if (i == 0) { EXPECT_CALL(*test_cb.mock_callbacks, save(testing::_, testing::_, testing::_)) - .WillOnce(testing::Invoke([i](ValkeyModuleCtx* ctx, SafeRDB* rdb, - int when) -> absl::Status { + .WillOnce([i](ValkeyModuleCtx* ctx, SafeRDB* rdb, + int when) -> absl::Status { EXPECT_EQ(when, VALKEYMODULE_AUX_BEFORE_RDB); std::string save_value = absl::StrCat("test-string-", i); VMSDK_EXPECT_OK(rdb->SaveStringBuffer(save_value)); return absl::OkStatus(); - })); + }); EXPECT_CALL(*test_cb.mock_callbacks, minimum_semantic_version(testing::_, testing::_)) .WillOnce(testing::Return(0x0100ff)); @@ -571,17 +568,15 @@ TEST_F(RDBSerializationTest, PerformRDBLoadRDBSectionRegistered) { auto test_cb = GenerateRDBSectionCallbacks(); EXPECT_CALL(*test_cb.mock_callbacks, load(testing::_, testing::_, testing::_)) - .WillOnce( - testing::Invoke([](ValkeyModuleCtx* ctx, - std::unique_ptr section, - SupplementalContentIter&& iter) { - EXPECT_EQ(section->type(), data_model::RDB_SECTION_INDEX_SCHEMA); - EXPECT_EQ( - section->index_schema_contents().subscribed_key_prefixes_size(), - 1); - EXPECT_FALSE(iter.HasNext()); - return absl::OkStatus(); - })); + .WillOnce([](ValkeyModuleCtx* ctx, + std::unique_ptr section, + SupplementalContentIter&& iter) { + EXPECT_EQ(section->type(), data_model::RDB_SECTION_INDEX_SCHEMA); + EXPECT_EQ( + section->index_schema_contents().subscribed_key_prefixes_size(), 1); + EXPECT_FALSE(iter.HasNext()); + return absl::OkStatus(); + }); EXPECT_CALL(*test_cb.mock_callbacks, save(testing::_, testing::_, testing::_)) .Times(0); EXPECT_CALL(*test_cb.mock_callbacks, @@ -608,12 +603,11 @@ TEST_F(RDBSerializationTest, PerformRDBLoadRDBSectionCallbackFailure) { auto test_cb = GenerateRDBSectionCallbacks(); EXPECT_CALL(*test_cb.mock_callbacks, load(testing::_, testing::_, testing::_)) - .WillOnce( - testing::Invoke([](ValkeyModuleCtx* ctx, - std::unique_ptr section, - SupplementalContentIter&& iter) { - return absl::InternalError("test"); - })); + .WillOnce([](ValkeyModuleCtx* ctx, + std::unique_ptr section, + SupplementalContentIter&& iter) { + return absl::InternalError("test"); + }); EXPECT_CALL(*test_cb.mock_callbacks, save(testing::_, testing::_, testing::_)) .Times(0); EXPECT_CALL(*test_cb.mock_callbacks, From 230682e818d10ac6c273dc08122e491920438a33 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Sat, 4 Oct 2025 17:59:31 +0000 Subject: [PATCH 04/33] Finished testing Signed-off-by: Allen Samuels --- integration/indexes.py | 26 +++++--- integration/valkey_search_test_case.py | 5 +- src/index_schema.cc | 91 +++++++++++++++++--------- src/indexes/numeric.cc | 14 ++++ src/indexes/numeric.h | 1 + src/indexes/tag.cc | 14 ++++ src/indexes/tag.h | 1 + src/indexes/vector_base.cc | 2 + src/metrics.h | 4 +- src/rdb_serialization.h | 15 +++++ src/valkey_search.cc | 12 ++++ testing/ft_search_test.cc | 4 +- testing/query/fanout_test.cc | 4 +- testing/rdb_serialization_test.cc | 4 +- vmsdk/src/utils.cc | 13 ++++ vmsdk/src/utils.h | 1 + 16 files changed, 162 insertions(+), 49 deletions(-) diff --git a/integration/indexes.py b/integration/indexes.py index c7ceb6f76..8522745d2 100644 --- a/integration/indexes.py +++ b/integration/indexes.py @@ -10,18 +10,19 @@ import logging, json import struct from enum import Enum +from util import waiters class KeyDataType(Enum): HASH = 1 JSON = 2 - - def float_to_bytes(flt: list[float]) -> bytes: return struct.pack(f"<{len(flt)}f", *flt) class Field: + name: str + alias: Union[str, None] def __init__(self, name: str, alias: Union[str, None]): self.name = name self.alias = alias if alias else name @@ -142,7 +143,7 @@ def __init__( self.prefixes = prefixes self.type = type - def create(self, client: valkey.client): + def create(self, client: valkey.client, wait_for_backfill = False): cmd = ( [ "FT.CREATE", @@ -159,21 +160,26 @@ def create(self, client: valkey.client): cmd += f.create(self.type) print(f"Creating Index: {cmd}") client.execute_command(*cmd) + if wait_for_backfill: + waiters.wait_for_true( + lambda: not self.info(client).backfill_in_progress + ) def load_data(self, client: valkey.client, rows: int, start_index: int = 0): - print("Loading data to ", client) for i in range(start_index, rows): data = self.make_data(i) - if self.type == KeyDataType.HASH: - client.hset(self.keyname(i), mapping=data) - else: - client.execute_command("JSON.SET", self.keyname(i), "$", json.dumps(data)) + self.write_data(client, i, data) + + def write_data(self, client: valkey.client, i:int, data: dict[str, str | bytes]): + if self.type == "HASH": + client.hset(self.keyname(i), mapping=data) + else: + client.execute_command("JSON.SET", self.keyname(i), "$", json.dumps(data)) def load_data_with_ttl(self, client: valkey.client, rows: int, ttl_ms: int, start_index: int = 0): for i in range(start_index, rows): data = self.make_data(i) - key = self.keyname(i) - client.hset(key, mapping=data) + self.write_data(client, i, data) client.pexpire(key, ttl_ms) def keyname(self, row: int) -> str: diff --git a/integration/valkey_search_test_case.py b/integration/valkey_search_test_case.py index 99dc02391..672f35ccd 100644 --- a/integration/valkey_search_test_case.py +++ b/integration/valkey_search_test_case.py @@ -147,6 +147,9 @@ def get_config_file_lines(self, testdir, port) -> List[str]: See ValkeySearchTestCaseBase.get_config_file_lines & ValkeySearchClusterTestCase.get_config_file_lines for example usage.""" raise NotImplementedError + + def append_startup_args(self, args: dict[str, str]) -> dict[str, str]: + return args def start_server( self, @@ -177,7 +180,7 @@ def start_server( server, client = self.create_server( testdir=testdir, server_path=server_path, - args={"logfile": logfile}, + args=self.append_startup_args({"logfile": logfile}), port=port, conf_file=conf_file, ) diff --git a/src/index_schema.cc b/src/index_schema.cc index 0bd2644d2..0d67db2b7 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -46,6 +46,7 @@ #include "src/valkey_search_options.h" #include "src/vector_externalizer.h" #include "vmsdk/src/blocked_client.h" +#include "vmsdk/src/debug.h" #include "vmsdk/src/info.h" #include "vmsdk/src/log.h" #include "vmsdk/src/managed_pointers.h" @@ -64,9 +65,9 @@ LogLevel GetLogSeverity(bool ok) { return ok ? DEBUG : WARNING; } // Controls and stats for V2 RDB file // static auto config_rdb_write_v2 = - vmsdk::config::BooleanBuilder("rdb_write_v2", false).Hidden().Build(); + vmsdk::config::BooleanBuilder("rdb-write-v2", false).Dev().Build(); static auto config_rdb_read_v2 = - vmsdk::config::BooleanBuilder("rdb_read_v2", false).Hidden().Build(); + vmsdk::config::BooleanBuilder("rdb-read-v2", false).Dev().Build(); static bool RDBReadV2() { return dynamic_cast(*config_rdb_read_v2).GetValue(); @@ -81,6 +82,10 @@ static vmsdk::info_field::Integer rdb_load_sections( "rdb_stats", "rdb_load_sections", vmsdk::info_field::IntegerBuilder().Dev()); +static vmsdk::info_field::Integer rdb_load_skipped_sections( + "rdb_stats", "rdb_load_skipped_sections", + vmsdk::info_field::IntegerBuilder().Dev()); + static vmsdk::info_field::Integer rdb_load_mutation_entries( "rdb_stats", "rdb_load_mutation_entries", vmsdk::info_field::IntegerBuilder().Dev()); @@ -89,19 +94,12 @@ static vmsdk::info_field::Integer rdb_save_sections( "rdb_stats", "rdb_save_sections", vmsdk::info_field::IntegerBuilder().Dev()); -static vmsdk::info_field::Integer rdb_save_mutation_entries( - "rdb_stats", "rdb_save_mutation_entries", - vmsdk::info_field::IntegerBuilder().Dev()); - IndexSchema::BackfillJob::BackfillJob(ValkeyModuleCtx *ctx, absl::string_view name, int db_num) : cursor(vmsdk::MakeUniqueValkeyScanCursor()) { scan_ctx = vmsdk::MakeUniqueValkeyDetachedThreadSafeContext(ctx); ValkeyModule_SelectDb(scan_ctx.get(), db_num); db_size = ValkeyModule_DbSize(scan_ctx.get()); - VMSDK_LOG(NOTICE, ctx) << "Starting backfill for index schema in DB " - << db_num << ": " << name << " (size: " << db_size - << ")"; } absl::StatusOr> IndexFactory( @@ -432,10 +430,18 @@ void IndexSchema::ProcessKeyspaceNotification(ValkeyModuleCtx *ctx, if (added) { switch (attribute_data_type_->ToProto()) { case data_model::ATTRIBUTE_DATA_TYPE_HASH: - Metrics::GetStats().ingest_hash_keys++; + if (from_backfill) { + Metrics::GetStats().backfill_hash_keys++; + } else { + Metrics::GetStats().ingest_hash_keys++; + } break; case data_model::ATTRIBUTE_DATA_TYPE_JSON: - Metrics::GetStats().ingest_json_keys++; + if (from_backfill) { + Metrics::GetStats().backfill_json_keys++; + } else { + Metrics::GetStats().ingest_json_keys++; + } break; default: CHECK(false); @@ -596,6 +602,7 @@ void IndexSchema::ScheduleMutation(bool from_backfill, [from_backfill, weak_index_schema = GetWeakPtr(), ctx = detached_ctx_.get(), delay_capturer = CreateQueueDelayCapturer(), key_str = std::move(key), blocking_counter]() mutable { + PAUSEPOINT("block_mutation_queue"); auto index_schema = weak_index_schema.lock(); if (ABSL_PREDICT_FALSE(!index_schema)) { return; @@ -682,6 +689,8 @@ void IndexSchema::BackfillScanCallback(ValkeyModuleCtx *ctx, } } +CONTROLLED_BOOLEAN(StopBackfill, false); + uint32_t IndexSchema::PerformBackfill(ValkeyModuleCtx *ctx, uint32_t batch_size) { auto &backfill_job = backfill_job_.Get(); @@ -689,6 +698,11 @@ uint32_t IndexSchema::PerformBackfill(ValkeyModuleCtx *ctx, return 0; } + if (StopBackfill.GetValue()) { + VMSDK_LOG_EVERY_N_SEC(NOTICE, ctx, 1) << "Backfill stopped by request"; + return 0; + } + backfill_job->paused_by_oom = false; // We need to ensure the DB size is monotonically increasing, since it could @@ -855,6 +869,8 @@ static absl::Status SaveSupplementalSection( rdb_save_sections.Increment(); auto header = std::make_unique(); header->set_type(type); + VMSDK_LOG(NOTICE, nullptr) << "Writing supplemental section type " + << data_model::SupplementalContentType_Name(type); init(*header); auto header_str = header->SerializeAsString(); VMSDK_RETURN_IF_ERROR(rdb->SaveStringBuffer(header_str)); @@ -912,6 +928,8 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { << " in DB: " << this->db_num_ << " to RDB"; for (auto &attribute : attributes_) { + VMSDK_LOG(NOTICE, nullptr) + << "Starting to save attribute: " << attribute.second.GetAlias(); // Note that the serialized attribute proto is also stored as part of the // serialized index schema proto above. We store here again to avoid any // dependencies on the ordering of multiple attributes. @@ -938,9 +956,6 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { dynamic_cast( attribute.second.GetIndex().get())))); } else if (RDBWriteV2()) { - // - // V2 add index content extension - // VMSDK_RETURN_IF_ERROR(SaveSupplementalSection( rdb, data_model::SUPPLEMENTAL_CONTENT_INDEX_EXTENSION, [&](auto &header) { @@ -953,14 +968,14 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { } if (RDBWriteV2()) { - // - // Save the Mutation Queue - // VMSDK_RETURN_IF_ERROR(SaveSupplementalSection( rdb, data_model::SUPPLEMENTAL_CONTENT_MUTATION_QUEUE, [&](auto &header) { header.mutable_mutation_queue_header()->set_backfilling( - backfill_job_.Get().has_value()); + IsBackfillInProgress()); + VMSDK_LOG(WARNING, nullptr) + << "RDB: Saving Mutation Queue Backfill = " + << header.mutation_queue_header().backfilling(); }, std::bind_front(&IndexSchema::SaveMutationQueue, this))); } @@ -969,10 +984,11 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { } absl::Status IndexSchema::SaveMutationQueue(RDBChunkOutputStream out) const { + VMSDK_LOG(NOTICE, nullptr) << "Writing Mutation Queue, records = " + << tracked_mutated_records_.size(); VMSDK_RETURN_IF_ERROR(out.SaveObject(tracked_mutated_records_.size())); for (const auto &[key, value] : tracked_mutated_records_) { VMSDK_RETURN_IF_ERROR(out.SaveString(key->Str())); - rdb_save_mutation_entries.Increment(); } return absl::OkStatus(); } @@ -980,11 +996,11 @@ absl::Status IndexSchema::SaveMutationQueue(RDBChunkOutputStream out) const { absl::Status IndexSchema::LoadMutationQueue(ValkeyModuleCtx *ctx, RDBChunkInputStream input) { VMSDK_ASSIGN_OR_RETURN(size_t count, input.LoadObject()); + rdb_load_mutation_entries.Increment(count); for (size_t i = 0; i < count; ++i) { VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); auto keyname = vmsdk::MakeUniqueValkeyString(keyname_str); ProcessKeyspaceNotification(ctx, keyname.get(), false); - rdb_load_mutation_entries.Increment(); } loaded_v2_ = true; return absl::OkStatus(); @@ -992,7 +1008,10 @@ absl::Status IndexSchema::LoadMutationQueue(ValkeyModuleCtx *ctx, // We need to iterate over the chunks to consume them static absl::Status SkipSupplementalContent( - SupplementalContentIter &supplemental_iter) { + SupplementalContentIter &supplemental_iter, std::string_view reason) { + rdb_load_skipped_sections.Increment(); + VMSDK_LOG(NOTICE, nullptr) + << "Skipping supplemental content section (" << reason << ")"; auto chunk_it = supplemental_iter.IterateChunks(); while (chunk_it.HasNext()) { VMSDK_ASSIGN_OR_RETURN([[maybe_unused]] auto chunk_result, chunk_it.Next()); @@ -1021,13 +1040,16 @@ absl::StatusOr> IndexSchema::LoadFromRDB( VMSDK_ASSIGN_OR_RETURN(auto supplemental_content, supplemental_iter.Next()); rdb_load_sections.Increment(); if (skip_loading_index_data) { - VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); + VMSDK_RETURN_IF_ERROR( + SkipSupplementalContent(supplemental_iter, "due to configuration")); } else { switch (supplemental_content->type()) { case data_model::SupplementalContentType:: SUPPLEMENTAL_CONTENT_INDEX_CONTENT: { auto &attribute = supplemental_content->index_content_header().attribute(); + VMSDK_LOG(NOTICE, nullptr) + << "Loading Index Content for attribute: " << attribute.alias(); VMSDK_ASSIGN_OR_RETURN( std::shared_ptr index, IndexFactory(ctx, index_schema.get(), attribute, @@ -1040,6 +1062,9 @@ absl::StatusOr> IndexSchema::LoadFromRDB( SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP: { auto &attribute = supplemental_content->key_to_id_map_header().attribute(); + VMSDK_LOG(NOTICE, nullptr) + << "Loading Key to ID Map Content for attribute: " + << attribute.alias(); VMSDK_ASSIGN_OR_RETURN( auto index, index_schema->GetIndex(attribute.alias()), _ << "Key to ID mapping found before index definition."); @@ -1055,25 +1080,29 @@ absl::StatusOr> IndexSchema::LoadFromRDB( } case data_model::SupplementalContentType:: SUPPLEMENTAL_CONTENT_INDEX_EXTENSION: { + auto &attribute = + supplemental_content->index_content_header().attribute(); + VMSDK_LOG(NOTICE, nullptr) + << "Loading Index Content Extension for attribute: " + << attribute.alias(); if (!RDBReadV2()) { - VMSDK_LOG(NOTICE, ctx) - << "Skipping supplemental index extension for attribute "; - VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); + VMSDK_RETURN_IF_ERROR( + SkipSupplementalContent(supplemental_iter, attribute.alias())); } else { - auto &attribute = - supplemental_content->index_content_header().attribute(); VMSDK_ASSIGN_OR_RETURN( auto index, index_schema->GetIndex(attribute.alias()), _ << "Index extension found before index definition."); VMSDK_RETURN_IF_ERROR(index->LoadIndexExtension( RDBChunkInputStream(supplemental_iter.IterateChunks()))); } + break; } case data_model::SupplementalContentType:: SUPPLEMENTAL_CONTENT_MUTATION_QUEUE: { + VMSDK_LOG(NOTICE, nullptr) << "Loading Mutation Queue"; if (!RDBReadV2()) { - VMSDK_LOG(NOTICE, ctx) << "Skipping supplemental mutation queue"; - VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); + VMSDK_RETURN_IF_ERROR( + SkipSupplementalContent(supplemental_iter, "mutation queue")); } else { if (index_schema) { VMSDK_RETURN_IF_ERROR(index_schema->LoadMutationQueue( @@ -1089,11 +1118,13 @@ absl::StatusOr> IndexSchema::LoadFromRDB( "Supplemental section mutation queue out of order"); } } + break; } default: VMSDK_LOG(NOTICE, ctx) << "Unknown supplemental content type: " << supplemental_content->type(); - VMSDK_RETURN_IF_ERROR(SkipSupplementalContent(supplemental_iter)); + VMSDK_RETURN_IF_ERROR( + SkipSupplementalContent(supplemental_iter, "unknown type")); break; } } diff --git a/src/indexes/numeric.cc b/src/indexes/numeric.cc index 04fe67e9e..ba632941b 100644 --- a/src/indexes/numeric.cc +++ b/src/indexes/numeric.cc @@ -25,8 +25,17 @@ #include "src/indexes/index_base.h" #include "src/query/predicate.h" #include "src/utils/string_interning.h" +#include "vmsdk/src/info.h" #include "vmsdk/src/valkey_module_api/valkey_module.h" +static vmsdk::info_field::Integer rdb_load_numbers_tracked( + "rdb_stats", "rdb_load_numbers_tracked", + vmsdk::info_field::IntegerBuilder().Dev()); + +static vmsdk::info_field::Integer rdb_load_numbers_untracked( + "rdb_stats", "rdb_load_numbers_untracked", + vmsdk::info_field::IntegerBuilder().Dev()); + namespace valkey_search::indexes { namespace { std::optional ParseNumber(absl::string_view data) { @@ -268,6 +277,8 @@ absl::Status Numeric::SaveIndexExtension(RDBChunkOutputStream output) const { size_t tracked_key_count = tracked_keys_.size(); VMSDK_RETURN_IF_ERROR(output.SaveObject(untracked_key_count)); VMSDK_RETURN_IF_ERROR(output.SaveObject(tracked_key_count)); + VMSDK_LOG(NOTICE, nullptr) << "Saving Numeric tracked: " << tracked_key_count + << " Untracked:" << untracked_key_count; for (const auto& key_ptr : untracked_keys_) { VMSDK_RETURN_IF_ERROR(output.SaveString(key_ptr->Str())); @@ -294,6 +305,9 @@ absl::Status Numeric::LoadIndexExtension(RDBChunkInputStream input) { VMSDK_ASSIGN_OR_RETURN(size_t tracked_key_count, input.LoadObject(), _ << "RDB: Numeric load error on tracked_key_count"); + rdb_load_numbers_untracked.Increment(untracked_key_count); + rdb_load_numbers_tracked.Increment(tracked_key_count); + for (size_t i = 0; i < untracked_key_count; ++i) { VMSDK_ASSIGN_OR_RETURN( auto key, input.LoadString(), diff --git a/src/indexes/numeric.h b/src/indexes/numeric.h index 83bb11e07..073a79d11 100644 --- a/src/indexes/numeric.h +++ b/src/indexes/numeric.h @@ -95,6 +95,7 @@ class Numeric : public IndexBase { int RespondWithInfo(ValkeyModuleCtx* ctx) const override; bool IsTracked(const InternedStringPtr& key) const override; absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { + VMSDK_LOG(NOTICE, nullptr) << "Saving Numeric Empty"; return absl::OkStatus(); } absl::Status SaveIndexExtension( diff --git a/src/indexes/tag.cc b/src/indexes/tag.cc index 34d44ac31..2e8563d6e 100644 --- a/src/indexes/tag.cc +++ b/src/indexes/tag.cc @@ -26,8 +26,17 @@ #include "src/query/predicate.h" #include "src/utils/patricia_tree.h" #include "src/utils/string_interning.h" +#include "vmsdk/src/info.h" #include "vmsdk/src/valkey_module_api/valkey_module.h" +static vmsdk::info_field::Integer rdb_load_tags_tracked( + "rdb_stats", "rdb_load_tags_tracked", + vmsdk::info_field::IntegerBuilder().Dev()); + +static vmsdk::info_field::Integer rdb_load_tags_untracked( + "rdb_stats", "rdb_load_tags_untracked", + vmsdk::info_field::IntegerBuilder().Dev()); + namespace valkey_search::indexes { // For performance reasons, a minimum term length is enforced. The default is 2, @@ -350,6 +359,8 @@ absl::Status Tag::SaveIndexExtension(RDBChunkOutputStream output) const { size_t tracked_key_count = tracked_tags_by_keys_.size(); VMSDK_RETURN_IF_ERROR(output.SaveObject(untracked_key_count)); VMSDK_RETURN_IF_ERROR(output.SaveObject(tracked_key_count)); + VMSDK_LOG(NOTICE, nullptr) << "Saving Tag tracked: " << tracked_key_count + << " Untracked:" << untracked_key_count; for (const auto& key_ptr : untracked_keys_) { VMSDK_RETURN_IF_ERROR(output.SaveString(key_ptr->Str())); @@ -376,6 +387,9 @@ absl::Status Tag::LoadIndexExtension(RDBChunkInputStream input) { VMSDK_ASSIGN_OR_RETURN(size_t tracked_key_count, input.LoadObject(), _ << "RDB: Tag load failure on tracked_key_count"); + rdb_load_tags_untracked.Increment(untracked_key_count); + rdb_load_tags_tracked.Increment(tracked_key_count); + for (size_t i = 0; i < untracked_key_count; ++i) { VMSDK_ASSIGN_OR_RETURN( auto key, input.LoadString(), diff --git a/src/indexes/tag.h b/src/indexes/tag.h index 45a17d5fd..565102e5d 100644 --- a/src/indexes/tag.h +++ b/src/indexes/tag.h @@ -45,6 +45,7 @@ class Tag : public IndexBase { int RespondWithInfo(ValkeyModuleCtx* ctx) const override; bool IsTracked(const InternedStringPtr& key) const override; absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { + VMSDK_LOG(NOTICE, nullptr) << "Saving Tag empty"; return absl::OkStatus(); } absl::Status SaveIndexExtension( diff --git a/src/indexes/vector_base.cc b/src/indexes/vector_base.cc index 2714f9aa8..e89a4c9b0 100644 --- a/src/indexes/vector_base.cc +++ b/src/indexes/vector_base.cc @@ -402,6 +402,8 @@ absl::Status VectorBase::LoadIndexExtension(RDBChunkInputStream chunked_in) { absl::Status VectorBase::SaveTrackedKeys( RDBChunkOutputStream chunked_out) const { + VMSDK_LOG(NOTICE, nullptr) + << "Save Vector, tracked_keys " << tracked_metadata_by_key_.size(); absl::ReaderMutexLock lock(&key_to_metadata_mutex_); for (const auto &[key, metadata] : tracked_metadata_by_key_) { data_model::TrackedKeyMetadata metadata_pb; diff --git a/src/metrics.h b/src/metrics.h index 65c054e15..6cf5e08b6 100644 --- a/src/metrics.h +++ b/src/metrics.h @@ -75,9 +75,9 @@ class Metrics { // Global ingestion stats (counts across all indexes) std::atomic ingest_hash_keys{0}; - std::atomic ingest_hash_blocked{0}; + std::atomic backfill_hash_keys{0}; std::atomic ingest_json_keys{0}; - std::atomic ingest_json_blocked{0}; + std::atomic backfill_json_keys{0}; std::atomic ingest_field_vector{0}; std::atomic ingest_field_numeric{0}; std::atomic ingest_field_tag{0}; diff --git a/src/rdb_serialization.h b/src/rdb_serialization.h index a19c0b251..073fb2b78 100644 --- a/src/rdb_serialization.h +++ b/src/rdb_serialization.h @@ -19,6 +19,7 @@ #include "absl/strings/string_view.h" #include "src/rdb_section.pb.h" #include "third_party/hnswlib/iostream.h" +#include "type_conversions.h" #include "vmsdk/src/log.h" #include "vmsdk/src/managed_pointers.h" #include "vmsdk/src/status/status_macros.h" @@ -71,6 +72,10 @@ inline std::string HumanReadableSemanticVersion(uint64_t semantic_version) { semantic_version & 0xFF); } +//#define DBG(name, x) \ +// VMSDK_LOG(WARNING, nullptr) << "SafeRDB " << name << " : " << x +#define DBG(name, x) + /* SafeRDB wraps a ValkeyModuleIO object and performs IO error checking, * returning absl::StatusOr to force error handling on the caller side. */ class SafeRDB { @@ -79,6 +84,7 @@ class SafeRDB { virtual absl::StatusOr LoadSizeT() { auto val = ValkeyModule_LoadUnsigned(rdb_); + DBG("LoadSizeT", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_LoadUnsigned failed"); } @@ -87,6 +93,7 @@ class SafeRDB { virtual absl::StatusOr LoadUnsigned() { auto val = ValkeyModule_LoadUnsigned(rdb_); + DBG("LoadUnSigned", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_LoadUnsigned failed"); } @@ -95,6 +102,7 @@ class SafeRDB { virtual absl::StatusOr LoadSigned() { auto val = ValkeyModule_LoadSigned(rdb_); + DBG("LoadSigned", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_LoadSigned failed"); } @@ -103,6 +111,7 @@ class SafeRDB { virtual absl::StatusOr LoadDouble() { auto val = ValkeyModule_LoadDouble(rdb_); + DBG("LoadDouble", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_LoadDouble failed"); } @@ -111,6 +120,7 @@ class SafeRDB { virtual absl::StatusOr LoadString() { auto str = vmsdk::UniqueValkeyString(ValkeyModule_LoadString(rdb_)); + DBG("Load string", vmsdk::StringToHex(vmsdk::ToStringView(str.get()))); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_LoadString failed"); } @@ -120,6 +130,7 @@ class SafeRDB { virtual absl::Status SaveSizeT(size_t val) { ValkeyModule_SaveUnsigned(rdb_, val); + DBG("SaveSizeT", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_SaveUnsigned failed"); } @@ -128,6 +139,7 @@ class SafeRDB { virtual absl::Status SaveUnsigned(unsigned int val) { ValkeyModule_SaveUnsigned(rdb_, val); + DBG("SaveUnsigned", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_SaveUnsigned failed"); } @@ -136,6 +148,7 @@ class SafeRDB { virtual absl::Status SaveSigned(int val) { ValkeyModule_SaveSigned(rdb_, val); + DBG("SaveSigned", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_SaveSigned failed"); } @@ -144,6 +157,7 @@ class SafeRDB { virtual absl::Status SaveDouble(double val) { ValkeyModule_SaveDouble(rdb_, val); + DBG("SaveDouble", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_SaveDouble failed"); } @@ -152,6 +166,7 @@ class SafeRDB { virtual absl::Status SaveStringBuffer(absl::string_view buf) { ValkeyModule_SaveStringBuffer(rdb_, buf.data(), buf.size()); + DBG("SaveString", vmsdk::StringToHex(buf)); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_SaveStringBuffer failed"); } diff --git a/src/valkey_search.cc b/src/valkey_search.cc index 934d3534f..217a75a53 100644 --- a/src/valkey_search.cc +++ b/src/valkey_search.cc @@ -142,6 +142,12 @@ static vmsdk::info_field::Integer ingest_hash_keys( return Metrics::GetStats().ingest_hash_keys; })); +static vmsdk::info_field::Integer backfill_hash_keys( + "global_ingestion", "backfill_hash_keys", + vmsdk::info_field::IntegerBuilder().Dev().Computed([]() -> long long { + return Metrics::GetStats().backfill_hash_keys; + })); + static vmsdk::info_field::Integer ingest_hash_blocked( "global_ingestion", "ingest_hash_blocked", vmsdk::info_field::IntegerBuilder().Dev().Computed([]() -> long long { @@ -155,6 +161,12 @@ static vmsdk::info_field::Integer ingest_json_keys( return Metrics::GetStats().ingest_json_keys; })); +static vmsdk::info_field::Integer backfill_json_keys( + "global_ingestion", "backfill_json_keys", + vmsdk::info_field::IntegerBuilder().Dev().Computed([]() -> long long { + return Metrics::GetStats().backfill_json_keys; + })); + static vmsdk::info_field::Integer ingest_json_blocked( "global_ingestion", "ingest_json_blocked", vmsdk::info_field::IntegerBuilder().Dev().Computed([]() -> long long { diff --git a/testing/ft_search_test.cc b/testing/ft_search_test.cc index 901f2be06..9c5570606 100644 --- a/testing/ft_search_test.cc +++ b/testing/ft_search_test.cc @@ -530,7 +530,7 @@ TEST_P(FTSearchTest, FTSearchTests) { GetClient(testing::StrEq(absl::StrCat("127.0.0.1:", coord_port)))) .WillRepeatedly(testing::Return(mock_client)); EXPECT_CALL(*mock_client, SearchIndexPartition(testing::_, testing::_)) - .WillRepeatedly(testing::Invoke( + .WillRepeatedly( [&](std::unique_ptr request, coordinator::SearchIndexPartitionCallback done) { @@ -538,7 +538,7 @@ TEST_P(FTSearchTest, FTSearchTests) { // nothing. coordinator::SearchIndexPartitionResponse response; done(grpc::Status::OK, response); - })); + }); } } } diff --git a/testing/query/fanout_test.cc b/testing/query/fanout_test.cc index 941b4606c..81073909c 100644 --- a/testing/query/fanout_test.cc +++ b/testing/query/fanout_test.cc @@ -406,7 +406,7 @@ TEST_P(FanoutTest, TestFanout) { GetClient(target.target.address)) .WillOnce(Return(mock_client)); EXPECT_CALL(*mock_client, SearchIndexPartition(testing::_, testing::_)) - .WillOnce(Invoke( + .WillOnce( [target, search_parameters](auto request, auto callback) mutable { search_parameters.mutable_limit()->set_first_index( request->limit().first_index()); @@ -437,7 +437,7 @@ TEST_P(FanoutTest, TestFanout) { } } callback(grpc::Status::OK, response); - })); + }); } } diff --git a/testing/rdb_serialization_test.cc b/testing/rdb_serialization_test.cc index 1e571bf73..6a9c9d2d8 100644 --- a/testing/rdb_serialization_test.cc +++ b/testing/rdb_serialization_test.cc @@ -354,12 +354,12 @@ TEST_F(RDBSerializationTest, PerformRDBSaveOneRDBSection) { EXPECT_CALL(*test_cb.mock_callbacks, load(testing::_, testing::_, testing::_)) .Times(0); EXPECT_CALL(*test_cb.mock_callbacks, save(testing::_, testing::_, testing::_)) - .WillOnce(testing::Invoke( + .WillOnce( [](ValkeyModuleCtx* ctx, SafeRDB* rdb, int when) -> absl::Status { EXPECT_EQ(when, VALKEYMODULE_AUX_BEFORE_RDB); VMSDK_EXPECT_OK(rdb->SaveStringBuffer("test-string")); return absl::OkStatus(); - })); + }); EXPECT_CALL(*test_cb.mock_callbacks, section_count(&fake_ctx_, VALKEYMODULE_AUX_BEFORE_RDB)) .WillOnce(testing::Return(1)); diff --git a/vmsdk/src/utils.cc b/vmsdk/src/utils.cc index 1d7b39cfb..9957fd8f9 100644 --- a/vmsdk/src/utils.cc +++ b/vmsdk/src/utils.cc @@ -308,4 +308,17 @@ std::optional JsonUnquote(absl::string_view sv) { return result; } +std::string StringToHex(std::string_view s) { + std::string result; + static char hex[] = "0123456789ABCDEF"; + for (auto &c : s) { + if (&c != s.begin()) { + result += ' '; + } + result += hex[(c >> 4) & 0xF]; + result += hex[c & 0xF]; + } + return result; +} + } // namespace vmsdk diff --git a/vmsdk/src/utils.h b/vmsdk/src/utils.h index b4ce783b6..dec51b185 100644 --- a/vmsdk/src/utils.h +++ b/vmsdk/src/utils.h @@ -106,6 +106,7 @@ inline int MakeValkeyVersion(int major, int minor, int patch) { CHECK(major < 256 && minor < 256 && patch < 256); return (major << 16) | (minor << 8) | patch; } +std::string StringToHex(std::string_view s); // Checks if a numeric value falls within an optional inclusive range [min, // max]. The range is inclusive: a value is considered valid if min <= value <= From d52dcaaef17828796daeff2feb8b33efe8046698 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Sat, 4 Oct 2025 18:02:24 +0000 Subject: [PATCH 05/33] add missing file Signed-off-by: Allen Samuels --- integration/test_saverestore.py | 211 ++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 integration/test_saverestore.py diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py new file mode 100644 index 000000000..a2b51286a --- /dev/null +++ b/integration/test_saverestore.py @@ -0,0 +1,211 @@ +import base64 +import os +import tempfile +import time +import subprocess +import shutil +import socket +from valkey import ResponseError, Valkey +from valkey_search_test_case import ValkeySearchTestCaseBase, ValkeySearchTestCaseDebugMode +from valkeytestframework.conftest import resource_port_tracker +from indexes import * +import pytest +import logging +from util import waiters +import threading +from ft_info_parser import FTInfoParser + +index = Index("index", [Vector("v", 3, type="HNSW", m=2, efc=1), Numeric("n"), Tag("t")]) +NUM_VECTORS = 10 + +# Keys that are in all results +full_key_names = [index.keyname(i).encode() for i in range(NUM_VECTORS)] + +def check_keys(received_keys, expected_keys): + received_set = set(received_keys) + expected_set = set(expected_keys) + print("Result.keys ", received_set) + print("exected.keys", expected_set) + assert received_set == expected_set + +def do_search(client: Valkey.client, query: str, extra: list[str] = []) -> dict[str, dict[str, str]]: + cmd = ["ft.search index", query, "limit", "0", "100"] + extra + print("Cmd: ", cmd) + res = client.execute_command(*cmd)[1:] + result = dict() + for i in range(0, len(res), 2): + row = res[i+1] + row_dict = dict() + for j in range(0, len(row), 2): + row_dict[row[j]] = row[j+1] + result[res[i]] = row_dict + print("Result is ", result) + return result + +def make_data(): + records = [] + for i in range(0, NUM_VECTORS): + records += [index.make_data(i)] + + data = index.make_data(len(records)) + data["v"] = "0" + records += [data] + + data = index.make_data(len(records)) + data["n"] = "fred" + records += [data] + + data = index.make_data(len(records)) + data["t"] = "" + records += [data] + return records + +def load_data(client: Valkey.client): + records = make_data() + for i in range(0, len(records)): + index.write_data(client, i, records[i]) + +def verify_data(client: Valkey.client): + ''' + Do query operations against each index to ensure that all keys are present + ''' + + res = do_search(client, "@n:[0 100]") + check_keys(res.keys(), full_key_names + [index.keyname(NUM_VECTORS+0).encode(), index.keyname(NUM_VECTORS+2).encode()]) + res = do_search(client, "@t:{Tag*}") + check_keys(res.keys(), full_key_names + [index.keyname(NUM_VECTORS+0).encode(), index.keyname(NUM_VECTORS+1).encode()]) + +def do_save_restore_test(test, write_v2: bool, read_v2: bool): + index.create(test.client, True) + load_data(test.client) + verify_data(test.client) + + test.client.execute_command("save") + os.environ["SKIPLOGCLEAN"] = "1" + test.server.restart(remove_rdb=False) + print(test.client.ping()) + verify_data(test.client) + test.client.execute_command("CONFIG SET search.info-developer-visible yes") + + i = test.client.info("search") + print("Info: ", i) + reads = [ + i["search_rdb_load_sections"], + i["search_rdb_load_skipped_sections"], + i["search_rdb_load_tags_tracked"], + i["search_rdb_load_tags_untracked"], + i["search_rdb_load_numbers_tracked"], + i["search_rdb_load_numbers_untracked"], + ] + if not write_v2: + assert reads == [4, 0, 0, 0, 0, 0] + elif read_v2: + assert reads == [7, 0, 12, 1, 12, 1] + else: + assert reads == [7, 3, 0, 0, 0, 0] + + +class TestSaveRestore_v1_v1(ValkeySearchTestCaseBase): + def append_startup_args(self, args): + args["search.rdb_write_v2"] = "no" + args["search.rdb_read_v2"] = "no" + return args + + def test_saverestore_v1_v1(self): + do_save_restore_test(self, False, False) + +class TestSaveRestore_v1_v2(ValkeySearchTestCaseBase): + def append_startup_args(self, args): + args["search.rdb_write_v2"] = "no" + args["search.rdb_read_v2"] = "yes" + return args + + def test_saverestore_v1_v2(self): + do_save_restore_test(self, False, True) + +class TestSaveRestore_v2_v1(ValkeySearchTestCaseBase): + def append_startup_args(self, args): + args["search.rdb_write_v2"] = "yes" + args["search.rdb_read_v2"] = "no" + return args + + def test_saverestore_v2_v1(self): + do_save_restore_test(self, True, False) + +class TestSaveRestore_v2_v2(ValkeySearchTestCaseBase): + def append_startup_args(self, args): + args["search.rdb_write_v2"] = "yes" + args["search.rdb_read_v2"] = "yes" + return args + + def test_saverestore_v2_v2(self): + do_save_restore_test(self, True, True) + +class TestMutationQueue(ValkeySearchTestCaseDebugMode): + def append_startup_args(self, args): + args["search.rdb_write_v2"] = "yes" + args["search.rdb_read_v2"] = "yes" + return args + + def mutation_queue_size(self): + info = FTInfoParser(self.client.execute_command("ft.info ", index.name)) + return info.mutation_queue_size + + def test_mutation_queue(self): + self.client.execute_command("ft._debug PAUSEPOINT SET block_mutation_queue") + index.create(self.client, True) + records = make_data() + # + # Now, load the data.... But since the mutation queue is blocked it will be stopped.... + # + client_threads = [] + for i in range(len(records)): + new_client = self.server.get_new_client() + t = threading.Thread(target = index.write_data, args=(new_client, i, records[i]) ) + t.start() + client_threads += [t] + + # + # Now, wait for the mutation queue to get fully loaded + # + waiters.wait_for_true(lambda: self.mutation_queue_size() == len(records)) + print("MUTATION QUEUE LOADED") + + self.client.execute_command("save") + + self.client.execute_command("ft._debug pausepoint reset block_mutation_queue") + + for t in client_threads: + t.join() + + verify_data(self.client) + os.environ["SKIPLOGCLEAN"] = "1" + self.server.restart(remove_rdb=False) + verify_data(self.client) + self.client.execute_command("CONFIG SET search.info-developer-visible yes") + i = self.client.info("search") + print("Info: ", i) + reads = [ + i["search_rdb_load_mutation_entries"], + ] + assert reads == [len(records)] + + def test_saverestore_backfill(self): + # + # Delay the backfill and ensure that with new format we will trigger the backfill.... + # + self.client.execute_command("FT._DEBUG CONTROLLED_VARIABLE SET StopBackfill yes") + load_data(self.client) + index.create(self.client, False) + self.client.execute_command("save") + + os.environ["SKIPLOGCLEAN"] = "1" + self.server.restart(remove_rdb=False) + verify_data(self.client) + self.client.execute_command("CONFIG SET search.info-developer-visible yes") + i = self.client.info("search") + print("Info: ", i) + reads = [ + i["search_backfill_hash_keys"], + ] + assert reads == [len(make_data())] From 6344430556910f8ff1a6cc211172c1f7e3564042 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Sat, 4 Oct 2025 18:15:05 +0000 Subject: [PATCH 06/33] Revert Signed-off-by: Allen Samuels --- src/index_schema.proto | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/index_schema.proto b/src/index_schema.proto index bb5be7deb..7b03db821 100644 --- a/src/index_schema.proto +++ b/src/index_schema.proto @@ -29,8 +29,6 @@ message IndexSchema { repeated Attribute attributes = 6; Stats stats = 7; optional uint32 db_num = 8; - // V2 save/restore will have this field populated - optional bool backfill_completed = 9; } enum AttributeIndexType { From 9c0d6f6633041756d10567b780f4807c69f211e6 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Sat, 4 Oct 2025 18:24:54 +0000 Subject: [PATCH 07/33] Cleanup Signed-off-by: Allen Samuels --- src/indexes/numeric.h | 1 - src/indexes/tag.cc | 2 +- src/indexes/tag.h | 1 - src/indexes/vector_base.cc | 2 -- 4 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/indexes/numeric.h b/src/indexes/numeric.h index 073a79d11..83bb11e07 100644 --- a/src/indexes/numeric.h +++ b/src/indexes/numeric.h @@ -95,7 +95,6 @@ class Numeric : public IndexBase { int RespondWithInfo(ValkeyModuleCtx* ctx) const override; bool IsTracked(const InternedStringPtr& key) const override; absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { - VMSDK_LOG(NOTICE, nullptr) << "Saving Numeric Empty"; return absl::OkStatus(); } absl::Status SaveIndexExtension( diff --git a/src/indexes/tag.cc b/src/indexes/tag.cc index 2e8563d6e..1bace3bd7 100644 --- a/src/indexes/tag.cc +++ b/src/indexes/tag.cc @@ -91,7 +91,7 @@ absl::StatusOr> Tag::ParseSearchTags( if (tag.back() == '*') { if (!IsValidPrefix(tag)) { return absl::InvalidArgumentError( - absl::StrCat("RDB: Tag string `", tag, "` ends with multiple *.")); + absl::StrCat("Tag string `", tag, "` ends with multiple *.")); } // Prefix tags that are shorter than min length are ignored. if (tag.length() <= kDefaultMinPrefixLength) { diff --git a/src/indexes/tag.h b/src/indexes/tag.h index 565102e5d..45a17d5fd 100644 --- a/src/indexes/tag.h +++ b/src/indexes/tag.h @@ -45,7 +45,6 @@ class Tag : public IndexBase { int RespondWithInfo(ValkeyModuleCtx* ctx) const override; bool IsTracked(const InternedStringPtr& key) const override; absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { - VMSDK_LOG(NOTICE, nullptr) << "Saving Tag empty"; return absl::OkStatus(); } absl::Status SaveIndexExtension( diff --git a/src/indexes/vector_base.cc b/src/indexes/vector_base.cc index e89a4c9b0..2714f9aa8 100644 --- a/src/indexes/vector_base.cc +++ b/src/indexes/vector_base.cc @@ -402,8 +402,6 @@ absl::Status VectorBase::LoadIndexExtension(RDBChunkInputStream chunked_in) { absl::Status VectorBase::SaveTrackedKeys( RDBChunkOutputStream chunked_out) const { - VMSDK_LOG(NOTICE, nullptr) - << "Save Vector, tracked_keys " << tracked_metadata_by_key_.size(); absl::ReaderMutexLock lock(&key_to_metadata_mutex_); for (const auto &[key, metadata] : tracked_metadata_by_key_) { data_model::TrackedKeyMetadata metadata_pb; From 667d77ed6111e16f4ca95e5440bb8f6255af3540 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Sat, 4 Oct 2025 18:27:59 +0000 Subject: [PATCH 08/33] fix spelling Signed-off-by: Allen Samuels --- integration/test_saverestore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py index a2b51286a..063231fd1 100644 --- a/integration/test_saverestore.py +++ b/integration/test_saverestore.py @@ -25,7 +25,7 @@ def check_keys(received_keys, expected_keys): received_set = set(received_keys) expected_set = set(expected_keys) print("Result.keys ", received_set) - print("exected.keys", expected_set) + print("expected.keys", expected_set) assert received_set == expected_set def do_search(client: Valkey.client, query: str, extra: list[str] = []) -> dict[str, dict[str, str]]: From c7bd27448793f66eb3a53b7cc4dfdf5ed2afc64d Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Sat, 4 Oct 2025 19:39:11 +0000 Subject: [PATCH 09/33] fix bad merge Signed-off-by: Allen Samuels --- integration/indexes.py | 2 +- integration/run.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integration/indexes.py b/integration/indexes.py index 8522745d2..1a0af06c4 100644 --- a/integration/indexes.py +++ b/integration/indexes.py @@ -171,7 +171,7 @@ def load_data(self, client: valkey.client, rows: int, start_index: int = 0): self.write_data(client, i, data) def write_data(self, client: valkey.client, i:int, data: dict[str, str | bytes]): - if self.type == "HASH": + if self.type == KeyDataType.HASH: client.hset(self.keyname(i), mapping=data) else: client.execute_command("JSON.SET", self.keyname(i), "$", json.dumps(data)) diff --git a/integration/run.sh b/integration/run.sh index a6dbee733..7bf1f09ae 100755 --- a/integration/run.sh +++ b/integration/run.sh @@ -144,7 +144,7 @@ mkdir -p ${LOGS_DIR} function run_pytest() { zap valkey-server LOG_INFO "Running: ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=sys --cache-clear -v ${ROOT_DIR}/integration/" - ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --full-trace --capture=sys --cache-clear -v ${ROOT_DIR}/integration/ + ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=sys --cache-clear -v ${ROOT_DIR}/integration/ RUN_SUCCESS=$? } From 233c675c78d8de774cf80c8698dcba2e6207a322 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Sat, 4 Oct 2025 20:25:07 +0000 Subject: [PATCH 10/33] bad merge Signed-off-by: Allen Samuels --- integration/indexes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/indexes.py b/integration/indexes.py index 1a0af06c4..1d7ef7d9f 100644 --- a/integration/indexes.py +++ b/integration/indexes.py @@ -180,7 +180,7 @@ def load_data_with_ttl(self, client: valkey.client, rows: int, ttl_ms: int, star for i in range(start_index, rows): data = self.make_data(i) self.write_data(client, i, data) - client.pexpire(key, ttl_ms) + client.pexpire(self.keyname(i), ttl_ms) def keyname(self, row: int) -> str: prefix = self.prefixes[row % len(self.prefixes)] if self.prefixes else "" From 047f0434117fa935649ef438f856e4dac8f32dff Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Sat, 4 Oct 2025 21:35:22 +0000 Subject: [PATCH 11/33] experiment Signed-off-by: Allen Samuels --- integration/test_saverestore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py index 063231fd1..f6d30e828 100644 --- a/integration/test_saverestore.py +++ b/integration/test_saverestore.py @@ -83,6 +83,7 @@ def do_save_restore_test(test, write_v2: bool, read_v2: bool): test.client.execute_command("save") os.environ["SKIPLOGCLEAN"] = "1" test.server.restart(remove_rdb=False) + time.sleep(5) print(test.client.ping()) verify_data(test.client) test.client.execute_command("CONFIG SET search.info-developer-visible yes") From 923c3d4b78fb6ac033a15e2f25976027bdc3dadd Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Thu, 16 Oct 2025 03:28:47 +0000 Subject: [PATCH 12/33] Cleanup Signed-off-by: Allen Samuels --- src/index_schema.cc | 5 +- src/indexes/numeric.cc | 89 +---- src/indexes/numeric.h | 6 - src/indexes/tag.cc | 84 ----- src/indexes/tag.h | 3 - src/indexes/vector_base.cc | 723 +++++++++++-------------------------- src/indexes/vector_base.h | 4 - src/rdb_serialization.h | 15 - 8 files changed, 225 insertions(+), 704 deletions(-) diff --git a/src/index_schema.cc b/src/index_schema.cc index 0d67db2b7..14e592294 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -100,6 +100,9 @@ IndexSchema::BackfillJob::BackfillJob(ValkeyModuleCtx *ctx, scan_ctx = vmsdk::MakeUniqueValkeyDetachedThreadSafeContext(ctx); ValkeyModule_SelectDb(scan_ctx.get(), db_num); db_size = ValkeyModule_DbSize(scan_ctx.get()); + VMSDK_LOG(NOTICE, ctx) << "Starting backfill for index schema in DB " + << db_num << ": " << name << " (size: " << db_size + << ")"; } absl::StatusOr> IndexFactory( @@ -1156,7 +1159,7 @@ void IndexSchema::OnSwapDB(ValkeyModuleSwapDbInfo *swap_db_info) { void IndexSchema::OnLoadingEnded(ValkeyModuleCtx *ctx) { if (loaded_v2_) { - VMSDK_LOG(NOTICE, ctx) << "New format RDB load completed, " + VMSDK_LOG(NOTICE, ctx) << "RDB load completed, " << " Mutation Queue contains " << tracked_mutated_records_.size() << " entries." << (backfill_job_.Get().has_value() diff --git a/src/indexes/numeric.cc b/src/indexes/numeric.cc index ba632941b..450a8105c 100644 --- a/src/indexes/numeric.cc +++ b/src/indexes/numeric.cc @@ -25,17 +25,8 @@ #include "src/indexes/index_base.h" #include "src/query/predicate.h" #include "src/utils/string_interning.h" -#include "vmsdk/src/info.h" #include "vmsdk/src/valkey_module_api/valkey_module.h" -static vmsdk::info_field::Integer rdb_load_numbers_tracked( - "rdb_stats", "rdb_load_numbers_tracked", - vmsdk::info_field::IntegerBuilder().Dev()); - -static vmsdk::info_field::Integer rdb_load_numbers_untracked( - "rdb_stats", "rdb_load_numbers_untracked", - vmsdk::info_field::IntegerBuilder().Dev()); - namespace valkey_search::indexes { namespace { std::optional ParseNumber(absl::string_view data) { @@ -54,11 +45,7 @@ Numeric::Numeric(const data_model::NumericIndex& numeric_index_proto) absl::StatusOr Numeric::AddRecord(const InternedStringPtr& key, absl::string_view data) { - return AddRecordInternal(key, ParseNumber(data)); -} - -absl::StatusOr Numeric::AddRecordInternal(const InternedStringPtr& key, - std::optional value) { + auto value = ParseNumber(data); absl::MutexLock lock(&index_mutex_); if (!value.has_value()) { untracked_keys_.insert(key); @@ -272,78 +259,4 @@ uint64_t Numeric::GetRecordCount() const { return tracked_keys_.size(); } -absl::Status Numeric::SaveIndexExtension(RDBChunkOutputStream output) const { - size_t untracked_key_count = untracked_keys_.size(); - size_t tracked_key_count = tracked_keys_.size(); - VMSDK_RETURN_IF_ERROR(output.SaveObject(untracked_key_count)); - VMSDK_RETURN_IF_ERROR(output.SaveObject(tracked_key_count)); - VMSDK_LOG(NOTICE, nullptr) << "Saving Numeric tracked: " << tracked_key_count - << " Untracked:" << untracked_key_count; - - for (const auto& key_ptr : untracked_keys_) { - VMSDK_RETURN_IF_ERROR(output.SaveString(key_ptr->Str())); - // - // Ensure no duplicate keys (which would cause a load error) - // - if (tracked_keys_.contains(key_ptr)) { - DCHECK(false); - return absl::InternalError("Numeric field save duplicate key"); - } - } - - for (const auto& [key_ptr, value] : tracked_keys_) { - VMSDK_RETURN_IF_ERROR(output.SaveString(key_ptr->Str())); - VMSDK_RETURN_IF_ERROR(output.SaveObject(value)); - } - - return absl::OkStatus(); -} - -absl::Status Numeric::LoadIndexExtension(RDBChunkInputStream input) { - VMSDK_ASSIGN_OR_RETURN(size_t untracked_key_count, input.LoadObject(), - _ << "RDB: Numeric load error on untracked_key_count"); - VMSDK_ASSIGN_OR_RETURN(size_t tracked_key_count, input.LoadObject(), - _ << "RDB: Numeric load error on tracked_key_count"); - - rdb_load_numbers_untracked.Increment(untracked_key_count); - rdb_load_numbers_tracked.Increment(tracked_key_count); - - for (size_t i = 0; i < untracked_key_count; ++i) { - VMSDK_ASSIGN_OR_RETURN( - auto key, input.LoadString(), - _ << "RDB: Numeric load key error on key number " << i); - auto key_ptr = StringInternStore::Instance().Intern(key); - VMSDK_ASSIGN_OR_RETURN( - auto tracked, AddRecordInternal(key_ptr, std::nullopt), - _ << "RDB: Numeric load error on insert for key number " << i); - if (tracked) { - return absl::InternalError(absl::StrCat( - "RDB: Numeric field reload error on untracked Key number: ", i)); - } - } - - for (size_t i = 0; i < tracked_key_count; ++i) { - VMSDK_ASSIGN_OR_RETURN( - auto key, input.LoadString(), - _ << "RDB: Numeric load error on tracked key number " << i); - auto key_ptr = StringInternStore::Instance().Intern(key); - VMSDK_ASSIGN_OR_RETURN( - auto value, input.LoadObject(), - _ << "RDB: Numeric load error on value for key number " << i); - VMSDK_ASSIGN_OR_RETURN( - auto tracked, AddRecordInternal(key_ptr, value), - _ << "RDB: Numeric load error on insert for key number " << i); - if (!tracked) { - return absl::InternalError(absl::StrCat( - "RDB: Numeric field reload error on tracked key number ", i)); - } - } - - if (!input.AtEnd()) { - return absl::InternalError("RDB: Numeric Extra data records found"); - } - - return absl::OkStatus(); -} - } // namespace valkey_search::indexes diff --git a/src/indexes/numeric.h b/src/indexes/numeric.h index 83bb11e07..b83984a11 100644 --- a/src/indexes/numeric.h +++ b/src/indexes/numeric.h @@ -97,9 +97,6 @@ class Numeric : public IndexBase { absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { return absl::OkStatus(); } - absl::Status SaveIndexExtension( - RDBChunkOutputStream chunked_out) const override; - absl::Status LoadIndexExtension(RDBChunkInputStream chunked_in) override; inline void ForEachTrackedKey( absl::AnyInvocable fn) const override { absl::MutexLock lock(&index_mutex_); @@ -168,9 +165,6 @@ class Numeric : public IndexBase { bool negate) const ABSL_NO_THREAD_SAFETY_ANALYSIS; private: - absl::StatusOr AddRecordInternal(const InternedStringPtr& key, - std::optional data); - mutable absl::Mutex index_mutex_; InternedStringMap tracked_keys_ ABSL_GUARDED_BY(index_mutex_); // untracked keys is needed to support negate filtering diff --git a/src/indexes/tag.cc b/src/indexes/tag.cc index 1bace3bd7..af0865237 100644 --- a/src/indexes/tag.cc +++ b/src/indexes/tag.cc @@ -26,17 +26,8 @@ #include "src/query/predicate.h" #include "src/utils/patricia_tree.h" #include "src/utils/string_interning.h" -#include "vmsdk/src/info.h" #include "vmsdk/src/valkey_module_api/valkey_module.h" -static vmsdk::info_field::Integer rdb_load_tags_tracked( - "rdb_stats", "rdb_load_tags_tracked", - vmsdk::info_field::IntegerBuilder().Dev()); - -static vmsdk::info_field::Integer rdb_load_tags_untracked( - "rdb_stats", "rdb_load_tags_untracked", - vmsdk::info_field::IntegerBuilder().Dev()); - namespace valkey_search::indexes { // For performance reasons, a minimum term length is enforced. The default is 2, @@ -354,79 +345,4 @@ uint64_t Tag::GetRecordCount() const { return tracked_tags_by_keys_.size(); } -absl::Status Tag::SaveIndexExtension(RDBChunkOutputStream output) const { - size_t untracked_key_count = untracked_keys_.size(); - size_t tracked_key_count = tracked_tags_by_keys_.size(); - VMSDK_RETURN_IF_ERROR(output.SaveObject(untracked_key_count)); - VMSDK_RETURN_IF_ERROR(output.SaveObject(tracked_key_count)); - VMSDK_LOG(NOTICE, nullptr) << "Saving Tag tracked: " << tracked_key_count - << " Untracked:" << untracked_key_count; - - for (const auto& key_ptr : untracked_keys_) { - VMSDK_RETURN_IF_ERROR(output.SaveString(key_ptr->Str())); - // - // Ensure no duplicate keys (which would cause a load error) - // - if (tracked_tags_by_keys_.contains(key_ptr)) { - DCHECK(false); - return absl::InternalError("RDB: Tag field save duplicate key"); - } - } - - for (const auto& [key_ptr, value] : tracked_tags_by_keys_) { - VMSDK_RETURN_IF_ERROR(output.SaveString(key_ptr->Str())); - VMSDK_RETURN_IF_ERROR(output.SaveString(value.raw_tag_string->Str())); - } - - return absl::OkStatus(); -} - -absl::Status Tag::LoadIndexExtension(RDBChunkInputStream input) { - VMSDK_ASSIGN_OR_RETURN(size_t untracked_key_count, input.LoadObject(), - _ << "RDB: Tag load failure on untracked_key_count"); - VMSDK_ASSIGN_OR_RETURN(size_t tracked_key_count, input.LoadObject(), - _ << "RDB: Tag load failure on tracked_key_count"); - - rdb_load_tags_untracked.Increment(untracked_key_count); - rdb_load_tags_tracked.Increment(tracked_key_count); - - for (size_t i = 0; i < untracked_key_count; ++i) { - VMSDK_ASSIGN_OR_RETURN( - auto key, input.LoadString(), - _ << "RDB: Tag Load failure on untracked key number " << i); - auto key_ptr = StringInternStore::Instance().Intern(key); - VMSDK_ASSIGN_OR_RETURN( - auto tracked, AddRecord(key_ptr, ""), - _ << "RDB: Tag load failure on untracked key number " << i); - if (tracked) { - return absl::InternalError(absl::StrCat( - "RDB: Tag field reload error on untracked Key number ", i)); - } - } - - for (size_t i = 0; i < tracked_key_count; ++i) { - VMSDK_ASSIGN_OR_RETURN( - auto key, input.LoadString(), - _ << "RDB: Tag load failure on tracked key number " << i); - VMSDK_ASSIGN_OR_RETURN( - auto value, input.LoadString(), - _ << "RDB: Tag load failure on tracked key/value number " << i); - auto key_ptr = StringInternStore::Instance().Intern(key); - VMSDK_ASSIGN_OR_RETURN( - auto tracked, AddRecord(key_ptr, value), - _ << "RDB: Tag load AddrRecord failure on tracked key/value number " - << i); - if (!tracked) { - return absl::InternalError(absl::StrCat( - "RDB: Tag field reload error on tracked Key number ", i)); - } - } - - if (!input.AtEnd()) { - return absl::InternalError("RDB: Tag extra data records"); - } - - return absl::OkStatus(); -} - } // namespace valkey_search::indexes diff --git a/src/indexes/tag.h b/src/indexes/tag.h index 45a17d5fd..74da9730b 100644 --- a/src/indexes/tag.h +++ b/src/indexes/tag.h @@ -47,9 +47,6 @@ class Tag : public IndexBase { absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { return absl::OkStatus(); } - absl::Status SaveIndexExtension( - RDBChunkOutputStream chunked_out) const override; - absl::Status LoadIndexExtension(RDBChunkInputStream chunked_in) override; inline void ForEachTrackedKey( absl::AnyInvocable fn) const override { diff --git a/src/indexes/vector_base.cc b/src/indexes/vector_base.cc index 2714f9aa8..c66b77cd9 100644 --- a/src/indexes/vector_base.cc +++ b/src/indexes/vector_base.cc @@ -5,545 +5,262 @@ * */ -#include "src/indexes/vector_base.h" +#ifndef VALKEYSEARCH_SRC_INDEXES_VECTOR_BASE_H_ +#define VALKEYSEARCH_SRC_INDEXES_VECTOR_BASE_H_ -#include - -#include -#include #include #include -#include #include #include #include #include #include #include -#include #include #include +#include "absl/base/no_destructor.h" +#include "absl/base/thread_annotations.h" #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" -#include "absl/log/check.h" +#include "absl/functional/any_invocable.h" #include "absl/status/status.h" #include "absl/status/statusor.h" -#include "absl/strings/numbers.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_split.h" #include "absl/strings/string_view.h" -#include "absl/strings/strip.h" #include "absl/synchronization/mutex.h" #include "src/attribute_data_type.h" #include "src/index_schema.pb.h" #include "src/indexes/index_base.h" -#include "src/indexes/numeric.h" -#include "src/indexes/tag.h" #include "src/query/predicate.h" #include "src/rdb_serialization.h" +#include "src/utils/allocator.h" #include "src/utils/string_interning.h" -#include "src/vector_externalizer.h" #include "third_party/hnswlib/hnswlib.h" -#include "third_party/hnswlib/space_ip.h" -#include "third_party/hnswlib/space_l2.h" -#include "vmsdk/src/log.h" +#include "third_party/hnswlib/iostream.h" #include "vmsdk/src/managed_pointers.h" -#include "vmsdk/src/status/status_macros.h" -#include "vmsdk/src/type_conversions.h" #include "vmsdk/src/valkey_module_api/valkey_module.h" -namespace valkey_search { -constexpr float kDefaultMagnitude = -1.0f; - -namespace { - -template -std::unique_ptr> CreateSpace( - int dimensions, valkey_search::data_model::DistanceMetric distance_metric) { - if constexpr (std::is_same_v) { - if (distance_metric == - valkey_search::data_model::DistanceMetric::DISTANCE_METRIC_COSINE || - distance_metric == - valkey_search::data_model::DistanceMetric::DISTANCE_METRIC_IP) { - return std::make_unique(dimensions); - } else { - return std::make_unique(dimensions); - } - } - DCHECK(false) << "no matching spacer"; - return std::make_unique(dimensions); -} - -} // namespace - -namespace indexes { -bool PrefilterEvaluator::Evaluate(const query::Predicate &predicate, - const InternedStringPtr &key) { - key_ = &key; - auto res = predicate.Evaluate(*this); - key_ = nullptr; - return res; -} - -bool PrefilterEvaluator::EvaluateTags(const query::TagPredicate &predicate) { - bool case_sensitive = true; - auto tags = predicate.GetIndex()->GetValue(*key_, case_sensitive); - return predicate.Evaluate(tags, case_sensitive); -} - -bool PrefilterEvaluator::EvaluateNumeric( - const query::NumericPredicate &predicate) { - CHECK(key_); - auto value = predicate.GetIndex()->GetValue(*key_); - return predicate.Evaluate(value); -} - -template -T CopyAndNormalizeEmbedding(T *dst, T *src, size_t size) { - T magnitude = 0.0f; - for (size_t i = 0; i < size; i++) { - magnitude += src[i] * src[i]; - } - magnitude = std::sqrt(magnitude); - T norm = (magnitude == 0.0f) ? 1.0f : (1.0f / magnitude); - for (size_t i = 0; i < size; i++) { - dst[i] = norm * src[i]; - } - return magnitude; -} +namespace valkey_search::indexes { std::vector NormalizeEmbedding(absl::string_view record, size_t type_size, - float *magnitude) { - std::vector ret(record.size()); - if (type_size == sizeof(float)) { - float result = CopyAndNormalizeEmbedding( - (float *)&ret[0], (float *)record.data(), ret.size() / sizeof(float)); - if (magnitude) { - *magnitude = result; - } - return ret; - } - CHECK(false) << "unsupported type size"; -} - -template -void VectorBase::Init(int dimensions, - valkey_search::data_model::DistanceMetric distance_metric, - std::unique_ptr> &space) { - space = CreateSpace(dimensions, distance_metric); - distance_metric_ = distance_metric; - if (distance_metric == - valkey_search::data_model::DistanceMetric::DISTANCE_METRIC_COSINE) { - normalize_ = true; - } -} - -std::shared_ptr VectorBase::InternVector( - absl::string_view record, std::optional &magnitude) { - if (!IsValidSizeVector(record)) { - return nullptr; - } - if (normalize_) { - magnitude = kDefaultMagnitude; - auto norm_record = - NormalizeEmbedding(record, GetDataTypeSize(), &magnitude.value()); - return StringInternStore::Intern( - absl::string_view((const char *)norm_record.data(), norm_record.size()), - vector_allocator_.get()); - } - return StringInternStore::Intern(record, vector_allocator_.get()); -} - -absl::StatusOr VectorBase::AddRecord(const InternedStringPtr &key, - absl::string_view record) { - std::optional magnitude; - auto interned_vector = InternVector(record, magnitude); - if (!interned_vector) { - return false; - } - VMSDK_ASSIGN_OR_RETURN( - auto internal_id, - TrackKey(key, magnitude.value_or(kDefaultMagnitude), interned_vector)); - absl::Status add_result = AddRecordImpl(internal_id, interned_vector->Str()); - if (!add_result.ok()) { - auto untrack_result = UnTrackKey(key); - if (!untrack_result.ok()) { - VMSDK_LOG_EVERY_N_SEC(WARNING, nullptr, 1) - << "While processing error for AddRecord, encountered error in " - "UntrackKey: " - << untrack_result.status().message(); + float* magnitude = nullptr); + +struct Neighbor { + InternedStringPtr external_id; + float distance; + std::optional attribute_contents; + Neighbor(const InternedStringPtr& external_id, float distance) + : external_id(external_id), distance(distance) {} + Neighbor(const InternedStringPtr& external_id, float distance, + std::optional&& attribute_contents) + : external_id(external_id), + distance(distance), + attribute_contents(std::move(attribute_contents)) {} + Neighbor(Neighbor&& other) noexcept + : external_id(std::move(other.external_id)), + distance(other.distance), + attribute_contents(std::move(other.attribute_contents)) {} + Neighbor& operator=(Neighbor&& other) noexcept { + if (this != &other) { + external_id = std::move(other.external_id); + distance = other.distance; + attribute_contents = std::move(other.attribute_contents); } - return add_result; - } - return true; -} - -absl::StatusOr VectorBase::GetInternalId( - const InternedStringPtr &key) const { - absl::ReaderMutexLock lock(&key_to_metadata_mutex_); - auto it = tracked_metadata_by_key_.find(key); - if (it == tracked_metadata_by_key_.end()) { - return absl::InvalidArgumentError("Record was not found"); + return *this; } - return it->second.internal_id; -} - -absl::StatusOr VectorBase::GetInternalIdDuringSearch( - const InternedStringPtr &key) const { - auto it = tracked_metadata_by_key_.find(key); - if (it == tracked_metadata_by_key_.end()) { - return absl::InvalidArgumentError("Record was not found"); - } - return it->second.internal_id; -} - -absl::StatusOr VectorBase::GetKeyDuringSearch( - uint64_t internal_id) const { - auto it = key_by_internal_id_.find(internal_id); - if (it == key_by_internal_id_.end()) { - return absl::InvalidArgumentError("Record was not found"); - } - return it->second; -} - -absl::StatusOr VectorBase::ModifyRecord(const InternedStringPtr &key, - absl::string_view record) { - // VectorExternalizer tracks added entries. We need to untrack mutations which - // are processed as modified records. - std::optional magnitude; - auto interned_vector = InternVector(record, magnitude); - if (!interned_vector) { - [[maybe_unused]] auto res = - RemoveRecord(key, indexes::DeletionType::kRecord); - return false; - } - VMSDK_ASSIGN_OR_RETURN(auto internal_id, GetInternalId(key)); - VMSDK_ASSIGN_OR_RETURN( - bool res, UpdateMetadata(key, magnitude.value_or(kDefaultMagnitude), - interned_vector)); - if (!res) { - return false; - } - - auto modify_result = ModifyRecordImpl(internal_id, interned_vector->Str()); - if (!modify_result.ok()) { - auto untrack_result = UnTrackKey(key); - if (!untrack_result.ok()) { - VMSDK_LOG_EVERY_N_SEC(WARNING, nullptr, 1) - << "While processing error for ModifyRecord, encountered error " - "in UntrackKey: " - << untrack_result.status().message(); - } - } - return true; -} - -template -absl::StatusOr> VectorBase::CreateReply( - std::priority_queue> &knn_res) { - std::deque ret; - while (!knn_res.empty()) { - auto &ele = knn_res.top(); - auto vector_key = GetKeyDuringSearch(ele.second); - if (!vector_key.ok()) { - knn_res.pop(); - continue; - } - // Sorting in asc order. - ret.emplace_front(Neighbor{vector_key.value(), ele.first}); - knn_res.pop(); - } - return ret; -} - -absl::StatusOr> VectorBase::GetValue( - const InternedStringPtr &key) const { - auto it = tracked_metadata_by_key_.find(key); - if (it == tracked_metadata_by_key_.end()) { - return absl::NotFoundError("Record was not found"); - } - std::vector result; - char *value = GetValueImpl(it->second.internal_id); - if (normalize_) { - if (it->second.magnitude < 0) { - return absl::InternalError("Magnitude is not initialized"); + friend std::ostream& operator<<(std::ostream& os, const Neighbor& n) { + os << "Key: " << n.external_id->Str() << " Dist: " << n.distance; + if (n.attribute_contents.has_value()) { + os << ' ' << *n.attribute_contents; + } else { + os << " [NoContents]"; } - result = DenormalizeVector(absl::string_view(value, GetVectorDataSize()), - GetDataTypeSize(), it->second.magnitude); + return os; + } +}; + +const absl::NoDestructor> + kVectorAlgoByStr({ + {"HNSW", data_model::VectorIndex::AlgorithmCase::kHnswAlgorithm}, + {"FLAT", data_model::VectorIndex::AlgorithmCase::kFlatAlgorithm}, + }); + +const absl::NoDestructor< + absl::flat_hash_map> + kDistanceMetricByStr( + {{"L2", data_model::DistanceMetric::DISTANCE_METRIC_L2}, + {"IP", data_model::DistanceMetric::DISTANCE_METRIC_IP}, + {"COSINE", data_model::DistanceMetric::DISTANCE_METRIC_COSINE}}); + +const absl::NoDestructor< + absl::flat_hash_map> + kVectorDataTypeByStr({{"FLOAT32", data_model::VECTOR_DATA_TYPE_FLOAT32}}); + +template +absl::string_view LookupKeyByValue( + const absl::flat_hash_map& map, const V& value) { + auto it = std::find_if(map.begin(), map.end(), [&value](const auto& pair) { + return pair.second == value; + }); + if (it != map.end()) { + return it->first; // Return the key } else { - result.assign(value, value + GetVectorDataSize()); - } - return result; -} - -bool VectorBase::IsTracked(const InternedStringPtr &key) const { - absl::ReaderMutexLock lock(&key_to_metadata_mutex_); - auto it = tracked_metadata_by_key_.find(key); - return (it != tracked_metadata_by_key_.end()); -} - -absl::StatusOr VectorBase::RemoveRecord( - const InternedStringPtr &key, - [[maybe_unused]] indexes::DeletionType deletion_type) { - VMSDK_ASSIGN_OR_RETURN(auto res, UnTrackKey(key)); - if (!res.has_value()) { - return false; - } - VMSDK_RETURN_IF_ERROR(RemoveRecordImpl(res.value())); - return true; -} - -absl::StatusOr> VectorBase::UnTrackKey( - const InternedStringPtr &key) { - if (key->Str().empty()) { - return std::nullopt; - } - absl::WriterMutexLock lock(&key_to_metadata_mutex_); - auto it = tracked_metadata_by_key_.find(key); - if (it == tracked_metadata_by_key_.end()) { - return std::nullopt; - } - auto id = it->second.internal_id; - UnTrackVector(id); - tracked_metadata_by_key_.erase(it); - auto key_by_internal_id_it = key_by_internal_id_.find(id); - if (key_by_internal_id_it == key_by_internal_id_.end()) { - return absl::InvalidArgumentError( - "Error while untracking key - key was not found in key_by_internal_id_ " - "but in internal_by_key_"); - } - key_by_internal_id_.erase(key_by_internal_id_it); - return id; -} - -char *VectorBase::TrackVector(uint64_t internal_id, char *vector, size_t len) { - auto interned_vector = StringInternStore::Intern( - absl::string_view(vector, len), vector_allocator_.get()); - TrackVector(internal_id, interned_vector); - return (char *)interned_vector->Str().data(); -} - -absl::StatusOr VectorBase::TrackKey(const InternedStringPtr &key, - float magnitude, - const InternedStringPtr &vector) { - if (key->Str().empty()) { - return absl::InvalidArgumentError("key can't be empty"); - } - absl::WriterMutexLock lock(&key_to_metadata_mutex_); - auto id = inc_id_++; - auto [_, succ] = tracked_metadata_by_key_.insert( - {key, {.internal_id = id, .magnitude = magnitude}}); - - if (!succ) { - return absl::InvalidArgumentError( - absl::StrCat("Embedding id already exists: ", key->Str())); - } - TrackVector(id, vector); - key_by_internal_id_.insert({id, key}); - return id; -} -// Return an error if the key is empty or not being tracked. -// Return false if the tracked vector matches the input vector. -// Otherwise, track the new vector and return true. -absl::StatusOr VectorBase::UpdateMetadata( - const InternedStringPtr &key, float magnitude, - const InternedStringPtr &vector) { - if (key->Str().empty()) { - return absl::InvalidArgumentError("key can't be empty"); - } - uint64_t internal_id; - { - absl::WriterMutexLock lock(&key_to_metadata_mutex_); - auto it = tracked_metadata_by_key_.find(key); - if (it == tracked_metadata_by_key_.end()) { - return absl::InvalidArgumentError( - absl::StrCat("Embedding id not found: ", key->Str())); - } - it->second.magnitude = magnitude; - internal_id = it->second.internal_id; - } - if (IsVectorMatch(internal_id, vector)) { - return false; - } - TrackVector(internal_id, vector); - return true; -} - -int VectorBase::RespondWithInfo(ValkeyModuleCtx *ctx) const { - ValkeyModule_ReplyWithSimpleString(ctx, "type"); - ValkeyModule_ReplyWithSimpleString(ctx, "VECTOR"); - int array_len = 2; - array_len += RespondWithInfoImpl(ctx); - ValkeyModule_ReplyWithSimpleString(ctx, "capacity"); - ValkeyModule_ReplyWithLongLong(ctx, GetCapacity()); - ValkeyModule_ReplyWithSimpleString(ctx, "size"); - { + return ""; + } +} + +class VectorBase : public IndexBase, public hnswlib::VectorTracker { + public: + absl::StatusOr AddRecord(const InternedStringPtr& key, + absl::string_view record) override; + absl::StatusOr RemoveRecord(const InternedStringPtr& key, + indexes::DeletionType deletion_type = + indexes::DeletionType::kNone) override; + absl::StatusOr ModifyRecord(const InternedStringPtr& key, + absl::string_view record) override; + bool IsTracked(const InternedStringPtr& key) const override + ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); + virtual size_t GetCapacity() const = 0; + bool GetNormalize() const { return normalize_; } + std::unique_ptr ToProto() const override; + absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override; + absl::Status SaveTrackedKeys(RDBChunkOutputStream chunked_out) const + ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); + absl::Status LoadTrackedKeys(ValkeyModuleCtx* ctx, + const AttributeDataType* attribute_data_type, + SupplementalContentChunkIter&& iter); + void ForEachTrackedKey( + absl::AnyInvocable fn) const override { absl::MutexLock lock(&key_to_metadata_mutex_); - ValkeyModule_ReplyWithCString( - ctx, std::to_string(key_by_internal_id_.size()).c_str()); - } - return array_len + 4; -} - -absl::Status VectorBase::SaveIndex(RDBChunkOutputStream chunked_out) const { - VMSDK_RETURN_IF_ERROR(SaveIndexImpl(std::move(chunked_out))); - return absl::OkStatus(); -} - -absl::Status VectorBase::SaveIndexExtension( - RDBChunkOutputStream chunked_out) const { - CHECK(false); -} - -absl::Status VectorBase::LoadIndexExtension(RDBChunkInputStream chunked_in) { - CHECK(false); -} - -absl::Status VectorBase::SaveTrackedKeys( - RDBChunkOutputStream chunked_out) const { - absl::ReaderMutexLock lock(&key_to_metadata_mutex_); - for (const auto &[key, metadata] : tracked_metadata_by_key_) { - data_model::TrackedKeyMetadata metadata_pb; - metadata_pb.set_key(key->Str()); - metadata_pb.set_internal_id(metadata.internal_id); - metadata_pb.set_magnitude(metadata.magnitude); - auto metadata_pb_str = metadata_pb.SerializeAsString(); - VMSDK_RETURN_IF_ERROR( - chunked_out.SaveChunk(metadata_pb_str.data(), metadata_pb_str.size())) - << "Error saving key_by_internal_id_ entry"; - } - return absl::OkStatus(); -} - -void VectorBase::ExternalizeVector(ValkeyModuleCtx *ctx, - const AttributeDataType *attribute_data_type, - absl::string_view key_cstr, - absl::string_view attribute_identifier) { - auto key_obj = vmsdk::MakeUniqueValkeyOpenKey( - ctx, vmsdk::MakeUniqueValkeyString(key_cstr).get(), - VALKEYMODULE_OPEN_KEY_NOEFFECTS | VALKEYMODULE_READ); - if (!key_obj || !attribute_data_type->IsProperType(key_obj.get())) { - return; - } - bool is_module_owned; - vmsdk::UniqueValkeyString record = VectorExternalizer::Instance().GetRecord( - ctx, attribute_data_type, key_obj.get(), key_cstr, attribute_identifier, - is_module_owned); - CHECK(!is_module_owned); - std::optional magnitude; - auto interned_key = StringInternStore::Intern(key_cstr); - auto interned_vector = - InternVector(vmsdk::ToStringView(record.get()), magnitude); - if (interned_vector) { - VectorExternalizer::Instance().Externalize( - interned_key, attribute_identifier, attribute_data_type->ToProto(), - interned_vector, magnitude); - } -} - -absl::Status VectorBase::LoadTrackedKeys( - ValkeyModuleCtx *ctx, const AttributeDataType *attribute_data_type, - SupplementalContentChunkIter &&iter) { - absl::WriterMutexLock lock(&key_to_metadata_mutex_); - while (iter.HasNext()) { - VMSDK_ASSIGN_OR_RETURN(auto metadata_str, iter.Next(), - _ << "Error loading metadata"); - data_model::TrackedKeyMetadata tracked_key_metadata; - if (!tracked_key_metadata.ParseFromString(metadata_str->binary_content())) { - return absl::InvalidArgumentError("Error parsing metadata from proto"); + for (const auto& [key, _] : tracked_metadata_by_key_) { + fn(key); } - auto interned_key = StringInternStore::Intern(tracked_key_metadata.key()); - tracked_metadata_by_key_.insert( - {interned_key, - {.internal_id = tracked_key_metadata.internal_id(), - .magnitude = tracked_key_metadata.magnitude()}}); - key_by_internal_id_.insert( - {tracked_key_metadata.internal_id(), interned_key}); - inc_id_ = std::max( - inc_id_, static_cast(tracked_key_metadata.internal_id())); - ExternalizeVector(ctx, attribute_data_type, tracked_key_metadata.key(), - attribute_identifier_); } - ++inc_id_; - return absl::OkStatus(); -} - -std::unique_ptr VectorBase::ToProto() const { - absl::ReaderMutexLock lock(&key_to_metadata_mutex_); - auto index_proto = std::make_unique(); - auto vector_index = std::make_unique(); - vector_index->set_normalize(normalize_); - vector_index->set_distance_metric(distance_metric_); - vector_index->set_dimension_count(dimensions_); - vector_index->set_initial_cap(GetCapacity()); - ToProtoImpl(vector_index.get()); - index_proto->set_allocated_vector_index(vector_index.release()); - return index_proto; -} - -absl::StatusOr> -VectorBase::ComputeDistanceFromRecord(const InternedStringPtr &key, - absl::string_view query) const { - VMSDK_ASSIGN_OR_RETURN(auto internal_id, GetInternalIdDuringSearch(key)); - return ComputeDistanceFromRecordImpl(internal_id, query); -} - -bool VectorBase::AddPrefilteredKey( - absl::string_view query, uint64_t count, const InternedStringPtr &key, - std::priority_queue> &results, - absl::flat_hash_set &top_keys) const { - auto result = ComputeDistanceFromRecord(key, query); - if (!result.ok()) { - return false; - } - if (results.size() < count) { - results.emplace(result.value()); - return true; - } - if (result.value().first < results.top().first) { - auto top = results.top(); - auto vector_key = GetKeyDuringSearch(top.second); - top_keys.erase(vector_key.value()->Str().data()); - results.pop(); - results.emplace(result.value()); - return true; - } - return false; -} - -vmsdk::UniqueValkeyString VectorBase::NormalizeStringRecord( - vmsdk::UniqueValkeyString record) const { - CHECK_EQ(GetDataTypeSize(), sizeof(float)); - auto record_str = vmsdk::ToStringView(record.get()); - if (absl::ConsumePrefix(&record_str, "[")) { - absl::ConsumeSuffix(&record_str, "]"); - } - std::vector float_strings = - absl::StrSplit(record_str, ',', absl::SkipWhitespace()); - std::string binary_string; - binary_string.reserve(float_strings.size() * sizeof(float)); - for (const auto &float_str : float_strings) { - float value; - if (!absl::SimpleAtof(float_str, &value)) { - return nullptr; - } - binary_string += std::string((char *)&value, sizeof(float)); + absl::StatusOr GetKeyDuringSearch( + uint64_t internal_id) const ABSL_NO_THREAD_SAFETY_ANALYSIS; + bool AddPrefilteredKey( + absl::string_view query, uint64_t count, const InternedStringPtr& key, + std::priority_queue>& results, + absl::flat_hash_set& top_keys) const; + vmsdk::UniqueValkeyString NormalizeStringRecord( + vmsdk::UniqueValkeyString record) const override; + uint64_t GetRecordCount() const override; + template + absl::StatusOr> CreateReply( + std::priority_queue>& knn_res); + absl::StatusOr> GetValue(const InternedStringPtr& key) const + ABSL_NO_THREAD_SAFETY_ANALYSIS; + int GetVectorDataSize() const { return GetDataTypeSize() * dimensions_; } + char* TrackVector(uint64_t internal_id, char* vector, size_t len) override; + std::shared_ptr InternVector(absl::string_view record, + std::optional& magnitude); + + protected: + VectorBase(IndexerType indexer_type, int dimensions, + data_model::AttributeDataType attribute_data_type, + absl::string_view attribute_identifier) + : IndexBase(indexer_type), + dimensions_(dimensions), + attribute_identifier_(attribute_identifier), + attribute_data_type_(attribute_data_type) +#ifndef SAN_BUILD + , + vector_allocator_(CREATE_UNIQUE_PTR( + FixedSizeAllocator, dimensions * sizeof(float) + 1, true)) +#endif // !SAN_BUILD + { } - return vmsdk::MakeUniqueValkeyString(binary_string); -} - -uint64_t VectorBase::GetRecordCount() const { - absl::ReaderMutexLock lock(&key_to_metadata_mutex_); - return key_by_internal_id_.size(); -} - -template void VectorBase::Init( - int dimensions, data_model::DistanceMetric distance_metric, - std::unique_ptr> &space); - -template absl::StatusOr> VectorBase::CreateReply( - std::priority_queue> &knn_res); -} // namespace indexes -} // namespace valkey_search + bool IsValidSizeVector(absl::string_view record) { + const auto data_type_size = GetDataTypeSize(); + int32_t dim = record.size() / GetDataTypeSize(); + return dim == dimensions_ && (record.size() % data_type_size == 0); + } + int RespondWithInfo(ValkeyModuleCtx* ctx) const override; + template + void Init(int dimensions, data_model::DistanceMetric distance_metric, + std::unique_ptr>& space); + virtual absl::Status AddRecordImpl(uint64_t internal_id, + absl::string_view record) = 0; + + virtual absl::Status RemoveRecordImpl(uint64_t internal_id) = 0; + virtual absl::Status ModifyRecordImpl(uint64_t internal_id, + absl::string_view record) = 0; + virtual int RespondWithInfoImpl(ValkeyModuleCtx* ctx) const = 0; + + virtual size_t GetDataTypeSize() const = 0; + virtual void ToProtoImpl( + data_model::VectorIndex* vector_index_proto) const = 0; + virtual absl::Status SaveIndexImpl( + RDBChunkOutputStream chunked_out) const = 0; + void ExternalizeVector(ValkeyModuleCtx* ctx, + const AttributeDataType* attribute_data_type, + absl::string_view key_cstr, + absl::string_view attribute_identifier); + virtual char* GetValueImpl(uint64_t internal_id) const = 0; + + int dimensions_; + std::string attribute_identifier_; + bool normalize_{false}; + data_model::AttributeDataType attribute_data_type_; + data_model::DistanceMetric distance_metric_; + virtual absl::StatusOr> + ComputeDistanceFromRecordImpl(uint64_t internal_id, + absl::string_view query) const = 0; + virtual void TrackVector(uint64_t internal_id, + const InternedStringPtr& vector) = 0; + virtual bool IsVectorMatch(uint64_t internal_id, + const InternedStringPtr& vector) = 0; + virtual void UnTrackVector(uint64_t internal_id) = 0; + + private: + absl::StatusOr TrackKey(const InternedStringPtr& key, + float magnitude, + const InternedStringPtr& vector) + ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); + absl::StatusOr> UnTrackKey( + const InternedStringPtr& key) ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); + absl::StatusOr UpdateMetadata(const InternedStringPtr& key, + float magnitude, + const InternedStringPtr& vector) + ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); + absl::StatusOr GetInternalId(const InternedStringPtr& key) const + ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); + absl::StatusOr GetInternalIdDuringSearch( + const InternedStringPtr& key) const ABSL_NO_THREAD_SAFETY_ANALYSIS; + absl::flat_hash_map key_by_internal_id_ + ABSL_GUARDED_BY(key_to_metadata_mutex_); + struct TrackedKeyMetadata { + uint64_t internal_id; + // If normalize_ is false, this will be -1.0f. Otherwise, it will be the + // magnitude of the vector. If the magnitude is not initialized, it will be + // -inf (this is an intermediate state during backfill when transitioning + // from the old RDB format that didn't include magnitudes). + float magnitude; + }; + + InternedStringMap tracked_metadata_by_key_ + ABSL_GUARDED_BY(key_to_metadata_mutex_); + uint64_t inc_id_ ABSL_GUARDED_BY(key_to_metadata_mutex_){0}; + mutable absl::Mutex key_to_metadata_mutex_; + absl::StatusOr> + ComputeDistanceFromRecord(const InternedStringPtr& key, + absl::string_view query) const; + UniqueFixedSizeAllocatorPtr vector_allocator_{nullptr, nullptr}; +}; + +class PrefilterEvaluator : public query::Evaluator { + public: + bool Evaluate(const query::Predicate& predicate, + const InternedStringPtr& key); + + private: + bool EvaluateTags(const query::TagPredicate& predicate) override; + bool EvaluateNumeric(const query::NumericPredicate& predicate) override; + const InternedStringPtr* key_{nullptr}; +}; + +} // namespace valkey_search::indexes + +#endif // VALKEYSEARCH_SRC_INDEXES_VECTOR_BASE_H_ diff --git a/src/indexes/vector_base.h b/src/indexes/vector_base.h index fc2b0086c..c66b77cd9 100644 --- a/src/indexes/vector_base.h +++ b/src/indexes/vector_base.h @@ -125,10 +125,6 @@ class VectorBase : public IndexBase, public hnswlib::VectorTracker { bool GetNormalize() const { return normalize_; } std::unique_ptr ToProto() const override; absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override; - absl::Status SaveIndexExtension( - RDBChunkOutputStream chunked_out) const override; - absl::Status LoadIndexExtension(RDBChunkInputStream chunked_in) override; - absl::Status SaveTrackedKeys(RDBChunkOutputStream chunked_out) const ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); absl::Status LoadTrackedKeys(ValkeyModuleCtx* ctx, diff --git a/src/rdb_serialization.h b/src/rdb_serialization.h index 073fb2b78..a19c0b251 100644 --- a/src/rdb_serialization.h +++ b/src/rdb_serialization.h @@ -19,7 +19,6 @@ #include "absl/strings/string_view.h" #include "src/rdb_section.pb.h" #include "third_party/hnswlib/iostream.h" -#include "type_conversions.h" #include "vmsdk/src/log.h" #include "vmsdk/src/managed_pointers.h" #include "vmsdk/src/status/status_macros.h" @@ -72,10 +71,6 @@ inline std::string HumanReadableSemanticVersion(uint64_t semantic_version) { semantic_version & 0xFF); } -//#define DBG(name, x) \ -// VMSDK_LOG(WARNING, nullptr) << "SafeRDB " << name << " : " << x -#define DBG(name, x) - /* SafeRDB wraps a ValkeyModuleIO object and performs IO error checking, * returning absl::StatusOr to force error handling on the caller side. */ class SafeRDB { @@ -84,7 +79,6 @@ class SafeRDB { virtual absl::StatusOr LoadSizeT() { auto val = ValkeyModule_LoadUnsigned(rdb_); - DBG("LoadSizeT", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_LoadUnsigned failed"); } @@ -93,7 +87,6 @@ class SafeRDB { virtual absl::StatusOr LoadUnsigned() { auto val = ValkeyModule_LoadUnsigned(rdb_); - DBG("LoadUnSigned", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_LoadUnsigned failed"); } @@ -102,7 +95,6 @@ class SafeRDB { virtual absl::StatusOr LoadSigned() { auto val = ValkeyModule_LoadSigned(rdb_); - DBG("LoadSigned", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_LoadSigned failed"); } @@ -111,7 +103,6 @@ class SafeRDB { virtual absl::StatusOr LoadDouble() { auto val = ValkeyModule_LoadDouble(rdb_); - DBG("LoadDouble", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_LoadDouble failed"); } @@ -120,7 +111,6 @@ class SafeRDB { virtual absl::StatusOr LoadString() { auto str = vmsdk::UniqueValkeyString(ValkeyModule_LoadString(rdb_)); - DBG("Load string", vmsdk::StringToHex(vmsdk::ToStringView(str.get()))); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_LoadString failed"); } @@ -130,7 +120,6 @@ class SafeRDB { virtual absl::Status SaveSizeT(size_t val) { ValkeyModule_SaveUnsigned(rdb_, val); - DBG("SaveSizeT", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_SaveUnsigned failed"); } @@ -139,7 +128,6 @@ class SafeRDB { virtual absl::Status SaveUnsigned(unsigned int val) { ValkeyModule_SaveUnsigned(rdb_, val); - DBG("SaveUnsigned", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_SaveUnsigned failed"); } @@ -148,7 +136,6 @@ class SafeRDB { virtual absl::Status SaveSigned(int val) { ValkeyModule_SaveSigned(rdb_, val); - DBG("SaveSigned", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_SaveSigned failed"); } @@ -157,7 +144,6 @@ class SafeRDB { virtual absl::Status SaveDouble(double val) { ValkeyModule_SaveDouble(rdb_, val); - DBG("SaveDouble", val); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_SaveDouble failed"); } @@ -166,7 +152,6 @@ class SafeRDB { virtual absl::Status SaveStringBuffer(absl::string_view buf) { ValkeyModule_SaveStringBuffer(rdb_, buf.data(), buf.size()); - DBG("SaveString", vmsdk::StringToHex(buf)); if (ValkeyModule_IsIOError(rdb_)) { return absl::InternalError("ValkeyModule_SaveStringBuffer failed"); } From e4ac641f37558c9cf147bde9be83bda2a1525daa Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Thu, 16 Oct 2025 06:09:14 +0000 Subject: [PATCH 13/33] Revised Signed-off-by: Allen Samuels --- integration/test_saverestore.py | 43 +- src/index_schema.cc | 230 ++++++---- src/index_schema.h | 7 +- src/indexes/index_base.h | 17 +- src/indexes/numeric.cc | 40 +- src/indexes/numeric.h | 21 +- src/indexes/tag.cc | 40 +- src/indexes/tag.h | 19 +- src/indexes/text.h | 7 +- src/indexes/text/text.h | 7 +- src/indexes/vector_base.cc | 734 ++++++++++++++++++++++---------- src/indexes/vector_base.h | 23 +- src/rdb_section.proto | 15 +- testing/common.h | 22 +- testing/index_schema_test.cc | 35 +- testing/numeric_index_test.cc | 2 +- testing/tag_index_test.cc | 2 +- vmsdk/src/info.h | 7 + 18 files changed, 849 insertions(+), 422 deletions(-) diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py index f6d30e828..310feab23 100644 --- a/integration/test_saverestore.py +++ b/integration/test_saverestore.py @@ -6,7 +6,7 @@ import shutil import socket from valkey import ResponseError, Valkey -from valkey_search_test_case import ValkeySearchTestCaseBase, ValkeySearchTestCaseDebugMode +from valkey_search_test_case import ValkeySearchTestCaseDebugMode, ValkeySearchTestCaseDebugMode from valkeytestframework.conftest import resource_port_tracker from indexes import * import pytest @@ -64,6 +64,7 @@ def load_data(client: Valkey.client): records = make_data() for i in range(0, len(records)): index.write_data(client, i, records[i]) + return len(records) def verify_data(client: Valkey.client): ''' @@ -77,11 +78,23 @@ def verify_data(client: Valkey.client): def do_save_restore_test(test, write_v2: bool, read_v2: bool): index.create(test.client, True) - load_data(test.client) + key_count = load_data(test.client) verify_data(test.client) - + test.client.config_set("search.rdb-validate-on-write", "yes") test.client.execute_command("save") os.environ["SKIPLOGCLEAN"] = "1" + test.client.execute_command("CONFIG SET search.info-developer-visible yes") + i = test.client.info("search") + print("Info after save: ", i) + writes = [ + i["search_rdb_save_sections"], + i["search_rdb_save_keys"], + i["search_rdb_save_mutation_entries"], + ] + if write_v2: + assert writes == [5, key_count, 0] + else: + assert writes == [4, 0, 0] test.server.restart(remove_rdb=False) time.sleep(5) print(test.client.ping()) @@ -89,24 +102,22 @@ def do_save_restore_test(test, write_v2: bool, read_v2: bool): test.client.execute_command("CONFIG SET search.info-developer-visible yes") i = test.client.info("search") - print("Info: ", i) + print("Info after load: ", i) reads = [ i["search_rdb_load_sections"], - i["search_rdb_load_skipped_sections"], - i["search_rdb_load_tags_tracked"], - i["search_rdb_load_tags_untracked"], - i["search_rdb_load_numbers_tracked"], - i["search_rdb_load_numbers_untracked"], + i["search_rdb_load_sections_skipped"], + i["search_rdb_load_keys"], + i["search_rdb_load_mutation_entries"], ] if not write_v2: - assert reads == [4, 0, 0, 0, 0, 0] + assert reads == [4, 0, 0, 0] elif read_v2: - assert reads == [7, 0, 12, 1, 12, 1] + assert reads == [5, 0, key_count, 0] else: - assert reads == [7, 3, 0, 0, 0, 0] + assert reads == [5, 1, 0, 0] -class TestSaveRestore_v1_v1(ValkeySearchTestCaseBase): +class TestSaveRestore_v1_v1(ValkeySearchTestCaseDebugMode): def append_startup_args(self, args): args["search.rdb_write_v2"] = "no" args["search.rdb_read_v2"] = "no" @@ -115,7 +126,7 @@ def append_startup_args(self, args): def test_saverestore_v1_v1(self): do_save_restore_test(self, False, False) -class TestSaveRestore_v1_v2(ValkeySearchTestCaseBase): +class TestSaveRestore_v1_v2(ValkeySearchTestCaseDebugMode): def append_startup_args(self, args): args["search.rdb_write_v2"] = "no" args["search.rdb_read_v2"] = "yes" @@ -124,7 +135,7 @@ def append_startup_args(self, args): def test_saverestore_v1_v2(self): do_save_restore_test(self, False, True) -class TestSaveRestore_v2_v1(ValkeySearchTestCaseBase): +class TestSaveRestore_v2_v1(ValkeySearchTestCaseDebugMode): def append_startup_args(self, args): args["search.rdb_write_v2"] = "yes" args["search.rdb_read_v2"] = "no" @@ -133,7 +144,7 @@ def append_startup_args(self, args): def test_saverestore_v2_v1(self): do_save_restore_test(self, True, False) -class TestSaveRestore_v2_v2(ValkeySearchTestCaseBase): +class TestSaveRestore_v2_v2(ValkeySearchTestCaseDebugMode): def append_startup_args(self, args): args["search.rdb_write_v2"] = "yes" args["search.rdb_read_v2"] = "yes" diff --git a/src/index_schema.cc b/src/index_schema.cc index 14e592294..087970c44 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -68,6 +68,8 @@ static auto config_rdb_write_v2 = vmsdk::config::BooleanBuilder("rdb-write-v2", false).Dev().Build(); static auto config_rdb_read_v2 = vmsdk::config::BooleanBuilder("rdb-read-v2", false).Dev().Build(); +static auto config_rdb_validate_on_write = + vmsdk::config::BooleanBuilder("rdb-validate-on-write", false).Dev().Build(); static bool RDBReadV2() { return dynamic_cast(*config_rdb_read_v2).GetValue(); @@ -78,21 +80,18 @@ static bool RDBWriteV2() { .GetValue(); } -static vmsdk::info_field::Integer rdb_load_sections( - "rdb_stats", "rdb_load_sections", - vmsdk::info_field::IntegerBuilder().Dev()); - -static vmsdk::info_field::Integer rdb_load_skipped_sections( - "rdb_stats", "rdb_load_skipped_sections", - vmsdk::info_field::IntegerBuilder().Dev()); - -static vmsdk::info_field::Integer rdb_load_mutation_entries( - "rdb_stats", "rdb_load_mutation_entries", - vmsdk::info_field::IntegerBuilder().Dev()); +static bool RDBValidateOnWrite() { + return dynamic_cast(*config_rdb_validate_on_write) + .GetValue(); +} -static vmsdk::info_field::Integer rdb_save_sections( - "rdb_stats", "rdb_save_sections", - vmsdk::info_field::IntegerBuilder().Dev()); +DEV_INTEGER_COUNTER(rdb_stats, rdb_save_keys); +DEV_INTEGER_COUNTER(rdb_stats, rdb_load_keys); +DEV_INTEGER_COUNTER(rdb_stats, rdb_save_sections); +DEV_INTEGER_COUNTER(rdb_stats, rdb_load_sections); +DEV_INTEGER_COUNTER(rdb_stats, rdb_load_sections_skipped); +DEV_INTEGER_COUNTER(rdb_stats, rdb_save_mutation_entries); +DEV_INTEGER_COUNTER(rdb_stats, rdb_load_mutation_entries); IndexSchema::BackfillJob::BackfillJob(ValkeyModuleCtx *ctx, absl::string_view name, int db_num) @@ -779,7 +778,7 @@ absl::string_view IndexSchema::GetStateForInfo() const { uint64_t IndexSchema::CountRecords() const { uint64_t record_cnt = 0; for (const auto &attribute : attributes_) { - record_cnt += attribute.second.GetIndex()->GetRecordCount(); + record_cnt += attribute.second.GetIndex()->GetTrackedKeyCount(); } return record_cnt; } @@ -887,42 +886,15 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { rdb_section->set_allocated_index_schema_contents( index_schema_proto.release()); - /* - The legacy V1 format. - - Each non-vector attribute has one SUPPLEMENTAL_CONTENT_INDEX_CONTENT - segment. That segment consists of a header and no additional content. - - Vector attributes have two segments. - One SUPPLEMENTAL_CONTENT_INDEX_CONTENT which has a header and the saved - vector index data. One SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP segment - - The new V2 format is a superset of the V1 format. - - Each non-vector attribute has two segments. - one SUPPLEMENTAL_CONTENT_INDEX_CONTENT segment which is a header only. - (Same as V1) one SUPPLEMENTAL_CONTENT_INDEX_EXTENSION segment which has - saved index data - - Vector attributes are same as V1. - - V2 also has a segment with SUPPLEMENTAL_CONTENT_MUTATION_QUEUE which is the - header and the contents of the mutation queue. - - */ size_t supplemental_count = - RDBWriteV2() - // For V2, we write two sections for each attribute plus the mutation - // queue - ? 2 * GetAttributeCount() + 1 - // For V1, we write one section for each attribute plus an additional - // section for each vector attribute - : GetAttributeCount() + - std::count_if( - attributes_.begin(), attributes_.end(), + GetAttributeCount() + + std::count_if(attributes_.begin(), attributes_.end(), [](const auto &attribute) { return IsVectorIndex(attribute.second.GetIndex()); }); + if (RDBWriteV2()) { + supplemental_count += 1; // For Index Extension + } rdb_section->set_supplemental_count(supplemental_count); auto rdb_section_string = rdb_section->SerializeAsString(); @@ -958,47 +930,139 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { std::bind_front(&indexes::VectorBase::SaveTrackedKeys, dynamic_cast( attribute.second.GetIndex().get())))); - } else if (RDBWriteV2()) { - VMSDK_RETURN_IF_ERROR(SaveSupplementalSection( - rdb, data_model::SUPPLEMENTAL_CONTENT_INDEX_EXTENSION, - [&](auto &header) { - header.mutable_index_content_header()->set_allocated_attribute( - attribute.second.ToProto().release()); - }, - std::bind_front(&indexes::IndexBase::SaveIndexExtension, - attribute.second.GetIndex()))); } } if (RDBWriteV2()) { VMSDK_RETURN_IF_ERROR(SaveSupplementalSection( - rdb, data_model::SUPPLEMENTAL_CONTENT_MUTATION_QUEUE, + rdb, data_model::SUPPLEMENTAL_CONTENT_INDEX_EXTENSION, [&](auto &header) { header.mutable_mutation_queue_header()->set_backfilling( IsBackfillInProgress()); - VMSDK_LOG(WARNING, nullptr) - << "RDB: Saving Mutation Queue Backfill = " + VMSDK_LOG(NOTICE, nullptr) + << "RDB: Saving Index Extension Backfill = " << header.mutation_queue_header().backfilling(); }, - std::bind_front(&IndexSchema::SaveMutationQueue, this))); + std::bind_front(&IndexSchema::SaveIndexExtension, this))); } return absl::OkStatus(); } -absl::Status IndexSchema::SaveMutationQueue(RDBChunkOutputStream out) const { - VMSDK_LOG(NOTICE, nullptr) << "Writing Mutation Queue, records = " - << tracked_mutated_records_.size(); +absl::Status IndexSchema::ValidateIndex() const { + absl::Status status = absl::OkStatus(); + // + // Again, find a non-vector index as the oracle + // + auto oracle_index = attributes_.begin()->second.GetIndex(); + auto oracle_name = attributes_.begin()->first; + for (const auto &attribute : attributes_) { + if (!IsVectorIndex(attribute.second.GetIndex())) { + oracle_index = attribute.second.GetIndex(); + oracle_name = attribute.first; + break; + } + } + size_t oracle_key_count = + oracle_index->GetTrackedKeyCount() + oracle_index->GetUnTrackedKeyCount(); + // + // Now, make sure all the other indexes have the same key count, except for + // vector indexes which may have less keys + // + for (const auto &[name, attr] : attributes_) { + auto idx = attr.GetIndex(); + size_t cnt = idx->GetTrackedKeyCount() + idx->GetUnTrackedKeyCount(); + if (cnt != oracle_key_count) { + if (IsVectorIndex(idx) && cnt < oracle_key_count) { + continue; + } + VMSDK_LOG(WARNING, nullptr) + << "Index validation failed for index " << name + << " expected key count " << oracle_key_count << " got " << cnt; + // + // Ok, do a detailed comparison + // + auto larger_index = (cnt > oracle_key_count) ? idx : oracle_index; + auto larger_name = (cnt > oracle_key_count) ? name : oracle_name; + auto smaller_index = (cnt > oracle_key_count) ? oracle_index : idx; + auto smaller_name = (cnt > oracle_key_count) ? oracle_name : name; + auto key_check = [&](const InternedStringPtr &key) { + if (!smaller_index->IsTracked(key) && + !smaller_index->IsUnTracked(key)) { + VMSDK_LOG(WARNING, nullptr) + << "Key found in " << larger_name << " not found in " + << smaller_name << ": " << key->Str(); + status = absl::InternalError( + absl::StrCat("Key found in ", larger_name, " not found in ", + smaller_name, ": ", key->Str())); + return absl::OkStatus(); + } + }; + auto status1 = larger_index->ForEachTrackedKey(key_check); + if (!status1.ok()) { + status = status1; + } + auto status2 = larger_index->ForEachUnTrackedKey(key_check); + if (!status2.ok()) { + status = status1; + } + } + } + return status; +} + +absl::Status IndexSchema::SaveIndexExtension(RDBChunkOutputStream out) const { + if (RDBValidateOnWrite()) { + VMSDK_RETURN_IF_ERROR(ValidateIndex()); + } + // + // Need to find an attribute index that has the right tracked/untracked + // keys. Any non-vector index will do. But it there are only vector + // indexes we will use that. + // + auto index = attributes_.begin()->second.GetIndex(); + for (const auto &attribute : attributes_) { + if (!IsVectorIndex(attribute.second.GetIndex())) { + index = attribute.second.GetIndex(); + break; + } + } + size_t key_count = + index->GetTrackedKeyCount() + index->GetUnTrackedKeyCount(); + VMSDK_RETURN_IF_ERROR(out.SaveObject(key_count)); + rdb_save_keys.Increment(key_count); + VMSDK_LOG(DEBUG, nullptr) << "Writing Index Extension, keys = " << key_count; + + auto write_a_key = [&](const InternedStringPtr &key) { + key_count--; + return out.SaveString(key->Str()); + }; + VMSDK_RETURN_IF_ERROR(index->ForEachTrackedKey(write_a_key)); + VMSDK_RETURN_IF_ERROR(index->ForEachUnTrackedKey(write_a_key)); + CHECK(key_count == 0) << "Key count mismatch for index " << GetName(); + + VMSDK_LOG(DEBUG, nullptr) << "Writing Mutation Queue, records = " + << tracked_mutated_records_.size(); VMSDK_RETURN_IF_ERROR(out.SaveObject(tracked_mutated_records_.size())); + rdb_save_mutation_entries.Increment(tracked_mutated_records_.size()); for (const auto &[key, value] : tracked_mutated_records_) { VMSDK_RETURN_IF_ERROR(out.SaveString(key->Str())); } return absl::OkStatus(); } -absl::Status IndexSchema::LoadMutationQueue(ValkeyModuleCtx *ctx, - RDBChunkInputStream input) { +absl::Status IndexSchema::LoadIndexExtension(ValkeyModuleCtx *ctx, + RDBChunkInputStream input) { + VMSDK_ASSIGN_OR_RETURN(size_t key_count, input.LoadObject()); + rdb_load_keys.Increment(key_count); + VMSDK_LOG(DEBUG, ctx) << "Loading Index Extension, keys = " << key_count; + for (size_t i = 0; i < key_count; ++i) { + VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); + auto keyname = vmsdk::MakeUniqueValkeyString(keyname_str); + ProcessKeyspaceNotification(ctx, keyname.get(), false); + } VMSDK_ASSIGN_OR_RETURN(size_t count, input.LoadObject()); + VMSDK_LOG(DEBUG, ctx) << "Loading Mutation Entries, entries = " << count; rdb_load_mutation_entries.Increment(count); for (size_t i = 0; i < count; ++i) { VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); @@ -1012,7 +1076,7 @@ absl::Status IndexSchema::LoadMutationQueue(ValkeyModuleCtx *ctx, // We need to iterate over the chunks to consume them static absl::Status SkipSupplementalContent( SupplementalContentIter &supplemental_iter, std::string_view reason) { - rdb_load_skipped_sections.Increment(); + rdb_load_sections_skipped.Increment(); VMSDK_LOG(NOTICE, nullptr) << "Skipping supplemental content section (" << reason << ")"; auto chunk_it = supplemental_iter.IterateChunks(); @@ -1040,8 +1104,8 @@ absl::StatusOr> IndexSchema::LoadFromRDB( // Supplemental content will include indices and any content for them while (supplemental_iter.HasNext()) { - VMSDK_ASSIGN_OR_RETURN(auto supplemental_content, supplemental_iter.Next()); rdb_load_sections.Increment(); + VMSDK_ASSIGN_OR_RETURN(auto supplemental_content, supplemental_iter.Next()); if (skip_loading_index_data) { VMSDK_RETURN_IF_ERROR( SkipSupplementalContent(supplemental_iter, "due to configuration")); @@ -1083,32 +1147,13 @@ absl::StatusOr> IndexSchema::LoadFromRDB( } case data_model::SupplementalContentType:: SUPPLEMENTAL_CONTENT_INDEX_EXTENSION: { - auto &attribute = - supplemental_content->index_content_header().attribute(); - VMSDK_LOG(NOTICE, nullptr) - << "Loading Index Content Extension for attribute: " - << attribute.alias(); - if (!RDBReadV2()) { - VMSDK_RETURN_IF_ERROR( - SkipSupplementalContent(supplemental_iter, attribute.alias())); - } else { - VMSDK_ASSIGN_OR_RETURN( - auto index, index_schema->GetIndex(attribute.alias()), - _ << "Index extension found before index definition."); - VMSDK_RETURN_IF_ERROR(index->LoadIndexExtension( - RDBChunkInputStream(supplemental_iter.IterateChunks()))); - } - break; - } - case data_model::SupplementalContentType:: - SUPPLEMENTAL_CONTENT_MUTATION_QUEUE: { VMSDK_LOG(NOTICE, nullptr) << "Loading Mutation Queue"; if (!RDBReadV2()) { VMSDK_RETURN_IF_ERROR( SkipSupplementalContent(supplemental_iter, "mutation queue")); } else { if (index_schema) { - VMSDK_RETURN_IF_ERROR(index_schema->LoadMutationQueue( + VMSDK_RETURN_IF_ERROR(index_schema->LoadIndexExtension( ctx, RDBChunkInputStream(supplemental_iter.IterateChunks()))); bool backfilling = supplemental_content->mutation_queue_header().backfilling(); @@ -1167,8 +1212,8 @@ void IndexSchema::OnLoadingEnded(ValkeyModuleCtx *ctx) { : " Backfill not needed."); return; } - // Clean up any potentially stale index entries that can arise from pending - // record deletions being lost during RDB save. + // Clean up any potentially stale index entries that can arise from + // pending record deletions being lost during RDB save. vmsdk::StopWatch stop_watch; ValkeyModule_SelectDb(ctx, db_num_); // Make sure we are in the right DB. absl::flat_hash_map deletion_attributes; @@ -1177,8 +1222,10 @@ void IndexSchema::OnLoadingEnded(ValkeyModuleCtx *ctx) { std::vector to_delete; uint64_t key_size = 0; uint64_t stale_entries = 0; - index->ForEachTrackedKey([ctx, &deletion_attributes, &key_size, &attribute, - &stale_entries](const InternedStringPtr &key) { + auto status = index->ForEachTrackedKey([ctx, &deletion_attributes, + &key_size, &attribute, + &stale_entries]( + const InternedStringPtr &key) { auto r_str = vmsdk::MakeUniqueValkeyString(*key); if (!ValkeyModule_KeyExists(ctx, r_str.get())) { deletion_attributes[std::string(*key)][attribute.second.GetAlias()] = { @@ -1186,6 +1233,7 @@ void IndexSchema::OnLoadingEnded(ValkeyModuleCtx *ctx) { stale_entries++; } key_size++; + return absl::OkStatus(); }); VMSDK_LOG(NOTICE, ctx) << "Deleting " << stale_entries << " stale entries of " << key_size diff --git a/src/index_schema.h b/src/index_schema.h index d0a2e1330..42544383f 100644 --- a/src/index_schema.h +++ b/src/index_schema.h @@ -134,9 +134,10 @@ class IndexSchema : public KeyspaceEventSubscription, int GetAttributeCount() const { return attributes_.size(); } virtual absl::Status RDBSave(SafeRDB *rdb) const; - absl::Status SaveMutationQueue(RDBChunkOutputStream output) const; - absl::Status LoadMutationQueue(ValkeyModuleCtx *ctx, - RDBChunkInputStream input); + absl::Status SaveIndexExtension(RDBChunkOutputStream output) const; + absl::Status LoadIndexExtension(ValkeyModuleCtx *ctx, + RDBChunkInputStream input); + absl::Status ValidateIndex() const; static absl::StatusOr> LoadFromRDB( ValkeyModuleCtx *ctx, vmsdk::ThreadPool *mutations_thread_pool, diff --git a/src/indexes/index_base.h b/src/indexes/index_base.h index 9368eb3be..5044e9175 100644 --- a/src/indexes/index_base.h +++ b/src/indexes/index_base.h @@ -53,21 +53,24 @@ class IndexBase { virtual absl::StatusOr ModifyRecord(const InternedStringPtr& key, absl::string_view data) = 0; virtual int RespondWithInfo(ValkeyModuleCtx* ctx) const = 0; - virtual bool IsTracked(const InternedStringPtr& key) const = 0; IndexerType GetIndexerType() const { return indexer_type_; } virtual absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const = 0; - virtual absl::Status SaveIndexExtension( - RDBChunkOutputStream chunked_out) const = 0; - virtual absl::Status LoadIndexExtension(RDBChunkInputStream chunked_in) = 0; + virtual std::unique_ptr ToProto() const = 0; - virtual void ForEachTrackedKey( - absl::AnyInvocable fn) const {} + + virtual size_t GetTrackedKeyCount() const = 0; + virtual size_t GetUnTrackedKeyCount() const = 0; + virtual bool IsTracked(const InternedStringPtr& key) const = 0; + virtual bool IsUnTracked(const InternedStringPtr& key) const = 0; + virtual absl::Status ForEachTrackedKey( + absl::AnyInvocable fn) const = 0; + virtual absl::Status ForEachUnTrackedKey( + absl::AnyInvocable fn) const = 0; virtual vmsdk::UniqueValkeyString NormalizeStringRecord( vmsdk::UniqueValkeyString input) const { return input; } - virtual uint64_t GetRecordCount() const = 0; private: IndexerType indexer_type_{IndexerType::kNone}; diff --git a/src/indexes/numeric.cc b/src/indexes/numeric.cc index 450a8105c..198014ed4 100644 --- a/src/indexes/numeric.cc +++ b/src/indexes/numeric.cc @@ -111,11 +111,6 @@ int Numeric::RespondWithInfo(ValkeyModuleCtx* ctx) const { return 4; } -bool Numeric::IsTracked(const InternedStringPtr& key) const { - absl::MutexLock lock(&index_mutex_); - return tracked_keys_.contains(key); -} - std::unique_ptr Numeric::ToProto() const { auto index_proto = std::make_unique(); auto numeric_index = std::make_unique(); @@ -254,9 +249,42 @@ std::unique_ptr Numeric::EntriesFetcher::Begin() { return itr; } -uint64_t Numeric::GetRecordCount() const { +size_t Numeric::GetTrackedKeyCount() const { absl::MutexLock lock(&index_mutex_); return tracked_keys_.size(); } +size_t Numeric::GetUnTrackedKeyCount() const { + absl::MutexLock lock(&index_mutex_); + return untracked_keys_.size(); +} + +bool Numeric::IsTracked(const InternedStringPtr& key) const { + absl::MutexLock lock(&index_mutex_); + return tracked_keys_.contains(key); +} + +bool Numeric::IsUnTracked(const InternedStringPtr& key) const { + absl::MutexLock lock(&index_mutex_); + return untracked_keys_.contains(key); +} + +absl::Status Numeric::ForEachTrackedKey( + absl::AnyInvocable fn) const { + absl::MutexLock lock(&index_mutex_); + for (const auto& [key, _] : tracked_keys_) { + VMSDK_RETURN_IF_ERROR(fn(key)); + } + return absl::OkStatus(); +} + +absl::Status Numeric::ForEachUnTrackedKey( + absl::AnyInvocable fn) const { + absl::MutexLock lock(&index_mutex_); + for (const auto& key : untracked_keys_) { + VMSDK_RETURN_IF_ERROR(fn(key)); + } + return absl::OkStatus(); +} + } // namespace valkey_search::indexes diff --git a/src/indexes/numeric.h b/src/indexes/numeric.h index b83984a11..9459df3c5 100644 --- a/src/indexes/numeric.h +++ b/src/indexes/numeric.h @@ -93,18 +93,21 @@ class Numeric : public IndexBase { absl::string_view data) override ABSL_LOCKS_EXCLUDED(index_mutex_); int RespondWithInfo(ValkeyModuleCtx* ctx) const override; - bool IsTracked(const InternedStringPtr& key) const override; absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { return absl::OkStatus(); } - inline void ForEachTrackedKey( - absl::AnyInvocable fn) const override { - absl::MutexLock lock(&index_mutex_); - for (const auto& [key, _] : tracked_keys_) { - fn(key); - } - } - uint64_t GetRecordCount() const override; + + size_t GetTrackedKeyCount() const override; + size_t GetUnTrackedKeyCount() const override; + bool IsTracked(const InternedStringPtr& key) const override; + bool IsUnTracked(const InternedStringPtr& key) const override; + absl::Status ForEachTrackedKey( + absl::AnyInvocable fn) + const override; + absl::Status ForEachUnTrackedKey( + absl::AnyInvocable fn) + const override; + std::unique_ptr ToProto() const override; const double* GetValue(const InternedStringPtr& key) const diff --git a/src/indexes/tag.cc b/src/indexes/tag.cc index af0865237..956a561f7 100644 --- a/src/indexes/tag.cc +++ b/src/indexes/tag.cc @@ -182,11 +182,6 @@ int Tag::RespondWithInfo(ValkeyModuleCtx* ctx) const { return num_replies; } -bool Tag::IsTracked(const InternedStringPtr& key) const { - absl::MutexLock lock(&index_mutex_); - return tracked_tags_by_keys_.contains(key); -} - std::unique_ptr Tag::ToProto() const { auto index_proto = std::make_unique(); auto tag_index = std::make_unique(); @@ -340,9 +335,42 @@ std::unique_ptr Tag::EntriesFetcher::Begin() { size_t Tag::EntriesFetcher::Size() const { return size_; } -uint64_t Tag::GetRecordCount() const { +size_t Tag::GetTrackedKeyCount() const { absl::MutexLock lock(&index_mutex_); return tracked_tags_by_keys_.size(); } +size_t Tag::GetUnTrackedKeyCount() const { + absl::MutexLock lock(&index_mutex_); + return untracked_keys_.size(); +} + +bool Tag::IsTracked(const InternedStringPtr& key) const { + absl::MutexLock lock(&index_mutex_); + return tracked_tags_by_keys_.contains(key); +} + +bool Tag::IsUnTracked(const InternedStringPtr& key) const { + absl::MutexLock lock(&index_mutex_); + return untracked_keys_.contains(key); +} + +absl::Status Tag::ForEachTrackedKey( + absl::AnyInvocable fn) const { + absl::MutexLock lock(&index_mutex_); + for (const auto& [key, _] : tracked_tags_by_keys_) { + VMSDK_RETURN_IF_ERROR(fn(key)); + } + return absl::OkStatus(); +} + +absl::Status Tag::ForEachUnTrackedKey( + absl::AnyInvocable fn) const { + absl::MutexLock lock(&index_mutex_); + for (const auto& key : untracked_keys_) { + VMSDK_RETURN_IF_ERROR(fn(key)); + } + return absl::OkStatus(); +} + } // namespace valkey_search::indexes diff --git a/src/indexes/tag.h b/src/indexes/tag.h index 74da9730b..f8339d019 100644 --- a/src/indexes/tag.h +++ b/src/indexes/tag.h @@ -43,19 +43,20 @@ class Tag : public IndexBase { absl::string_view data) override ABSL_LOCKS_EXCLUDED(index_mutex_); int RespondWithInfo(ValkeyModuleCtx* ctx) const override; - bool IsTracked(const InternedStringPtr& key) const override; absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { return absl::OkStatus(); } - inline void ForEachTrackedKey( - absl::AnyInvocable fn) const override { - absl::MutexLock lock(&index_mutex_); - for (const auto& [key, _] : tracked_tags_by_keys_) { - fn(key); - } - } - uint64_t GetRecordCount() const override; + size_t GetTrackedKeyCount() const override; + size_t GetUnTrackedKeyCount() const override; + bool IsTracked(const InternedStringPtr& key) const override; + bool IsUnTracked(const InternedStringPtr& key) const override; + absl::Status ForEachTrackedKey( + absl::AnyInvocable fn) + const override; + absl::Status ForEachUnTrackedKey( + absl::AnyInvocable fn) + const override; std::unique_ptr ToProto() const override; InternedStringPtr GetRawValue(const InternedStringPtr& key) const diff --git a/src/indexes/text.h b/src/indexes/text.h index d4f89b7ad..8d7cbf247 100644 --- a/src/indexes/text.h +++ b/src/indexes/text.h @@ -53,14 +53,15 @@ class Text : public IndexBase { return absl::OkStatus(); } - inline void ForEachTrackedKey( - absl::AnyInvocable fn) const override { + inline absl::Status ForEachTrackedKey( + absl::AnyInvocable fn) + const override { absl::MutexLock lock(&index_mutex_); for (const auto& [key, _] : tracked_tags_by_keys_) { fn(key); } } - uint64_t GetRecordCount() const override; + uint64_t GetTrackedKeyCount() const override; std::unique_ptr ToProto() const override; InternedStringPtr GetRawValue(const InternedStringPtr& key) const diff --git a/src/indexes/text/text.h b/src/indexes/text/text.h index 9261588b2..645bd6bd4 100644 --- a/src/indexes/text/text.h +++ b/src/indexes/text/text.h @@ -36,10 +36,11 @@ struct TextFieldIndex : public indexes::IndexBase { virtual absl::Status SaveIndex(RDBOutputStream& rdb_stream) const override; virtual std::unique_ptr ToProto() const override; - virtual void ForEachTrackedKey( - absl::AnyInvocable fn) const override; + virtual absl::Status ForEachTrackedKey( + absl::AnyInvocable fn) + const override; - virtual uint64_t GetRecordCount() const override; + virtual uint64_t GetTrackedKeyCount() const override; private: // Each text field is assigned a unique number within the containing index, diff --git a/src/indexes/vector_base.cc b/src/indexes/vector_base.cc index c66b77cd9..b93c08358 100644 --- a/src/indexes/vector_base.cc +++ b/src/indexes/vector_base.cc @@ -5,262 +5,556 @@ * */ -#ifndef VALKEYSEARCH_SRC_INDEXES_VECTOR_BASE_H_ -#define VALKEYSEARCH_SRC_INDEXES_VECTOR_BASE_H_ +#include "src/indexes/vector_base.h" +#include + +#include +#include #include #include +#include #include #include #include #include #include #include +#include #include #include -#include "absl/base/no_destructor.h" -#include "absl/base/thread_annotations.h" #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" -#include "absl/functional/any_invocable.h" +#include "absl/log/check.h" #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" #include "absl/strings/string_view.h" +#include "absl/strings/strip.h" #include "absl/synchronization/mutex.h" #include "src/attribute_data_type.h" #include "src/index_schema.pb.h" #include "src/indexes/index_base.h" +#include "src/indexes/numeric.h" +#include "src/indexes/tag.h" #include "src/query/predicate.h" #include "src/rdb_serialization.h" -#include "src/utils/allocator.h" #include "src/utils/string_interning.h" +#include "src/vector_externalizer.h" #include "third_party/hnswlib/hnswlib.h" -#include "third_party/hnswlib/iostream.h" +#include "third_party/hnswlib/space_ip.h" +#include "third_party/hnswlib/space_l2.h" +#include "vmsdk/src/log.h" #include "vmsdk/src/managed_pointers.h" +#include "vmsdk/src/status/status_macros.h" +#include "vmsdk/src/type_conversions.h" #include "vmsdk/src/valkey_module_api/valkey_module.h" -namespace valkey_search::indexes { +namespace valkey_search { +constexpr float kDefaultMagnitude = -1.0f; + +namespace { + +template +std::unique_ptr> CreateSpace( + int dimensions, valkey_search::data_model::DistanceMetric distance_metric) { + if constexpr (std::is_same_v) { + if (distance_metric == + valkey_search::data_model::DistanceMetric::DISTANCE_METRIC_COSINE || + distance_metric == + valkey_search::data_model::DistanceMetric::DISTANCE_METRIC_IP) { + return std::make_unique(dimensions); + } else { + return std::make_unique(dimensions); + } + } + DCHECK(false) << "no matching spacer"; + return std::make_unique(dimensions); +} + +} // namespace + +namespace indexes { +bool PrefilterEvaluator::Evaluate(const query::Predicate &predicate, + const InternedStringPtr &key) { + key_ = &key; + auto res = predicate.Evaluate(*this); + key_ = nullptr; + return res; +} + +bool PrefilterEvaluator::EvaluateTags(const query::TagPredicate &predicate) { + bool case_sensitive = true; + auto tags = predicate.GetIndex()->GetValue(*key_, case_sensitive); + return predicate.Evaluate(tags, case_sensitive); +} + +bool PrefilterEvaluator::EvaluateNumeric( + const query::NumericPredicate &predicate) { + CHECK(key_); + auto value = predicate.GetIndex()->GetValue(*key_); + return predicate.Evaluate(value); +} + +template +T CopyAndNormalizeEmbedding(T *dst, T *src, size_t size) { + T magnitude = 0.0f; + for (size_t i = 0; i < size; i++) { + magnitude += src[i] * src[i]; + } + magnitude = std::sqrt(magnitude); + T norm = (magnitude == 0.0f) ? 1.0f : (1.0f / magnitude); + for (size_t i = 0; i < size; i++) { + dst[i] = norm * src[i]; + } + return magnitude; +} std::vector NormalizeEmbedding(absl::string_view record, size_t type_size, - float* magnitude = nullptr); - -struct Neighbor { - InternedStringPtr external_id; - float distance; - std::optional attribute_contents; - Neighbor(const InternedStringPtr& external_id, float distance) - : external_id(external_id), distance(distance) {} - Neighbor(const InternedStringPtr& external_id, float distance, - std::optional&& attribute_contents) - : external_id(external_id), - distance(distance), - attribute_contents(std::move(attribute_contents)) {} - Neighbor(Neighbor&& other) noexcept - : external_id(std::move(other.external_id)), - distance(other.distance), - attribute_contents(std::move(other.attribute_contents)) {} - Neighbor& operator=(Neighbor&& other) noexcept { - if (this != &other) { - external_id = std::move(other.external_id); - distance = other.distance; - attribute_contents = std::move(other.attribute_contents); + float *magnitude) { + std::vector ret(record.size()); + if (type_size == sizeof(float)) { + float result = CopyAndNormalizeEmbedding( + (float *)&ret[0], (float *)record.data(), ret.size() / sizeof(float)); + if (magnitude) { + *magnitude = result; } - return *this; + return ret; } - friend std::ostream& operator<<(std::ostream& os, const Neighbor& n) { - os << "Key: " << n.external_id->Str() << " Dist: " << n.distance; - if (n.attribute_contents.has_value()) { - os << ' ' << *n.attribute_contents; - } else { - os << " [NoContents]"; + CHECK(false) << "unsupported type size"; +} + +template +void VectorBase::Init(int dimensions, + valkey_search::data_model::DistanceMetric distance_metric, + std::unique_ptr> &space) { + space = CreateSpace(dimensions, distance_metric); + distance_metric_ = distance_metric; + if (distance_metric == + valkey_search::data_model::DistanceMetric::DISTANCE_METRIC_COSINE) { + normalize_ = true; + } +} + +std::shared_ptr VectorBase::InternVector( + absl::string_view record, std::optional &magnitude) { + if (!IsValidSizeVector(record)) { + return nullptr; + } + if (normalize_) { + magnitude = kDefaultMagnitude; + auto norm_record = + NormalizeEmbedding(record, GetDataTypeSize(), &magnitude.value()); + return StringInternStore::Intern( + absl::string_view((const char *)norm_record.data(), norm_record.size()), + vector_allocator_.get()); + } + return StringInternStore::Intern(record, vector_allocator_.get()); +} + +absl::StatusOr VectorBase::AddRecord(const InternedStringPtr &key, + absl::string_view record) { + std::optional magnitude; + auto interned_vector = InternVector(record, magnitude); + if (!interned_vector) { + return false; + } + VMSDK_ASSIGN_OR_RETURN( + auto internal_id, + TrackKey(key, magnitude.value_or(kDefaultMagnitude), interned_vector)); + absl::Status add_result = AddRecordImpl(internal_id, interned_vector->Str()); + if (!add_result.ok()) { + auto untrack_result = UnTrackKey(key); + if (!untrack_result.ok()) { + VMSDK_LOG_EVERY_N_SEC(WARNING, nullptr, 1) + << "While processing error for AddRecord, encountered error in " + "UntrackKey: " + << untrack_result.status().message(); + } + return add_result; + } + return true; +} + +absl::StatusOr VectorBase::GetInternalId( + const InternedStringPtr &key) const { + absl::ReaderMutexLock lock(&key_to_metadata_mutex_); + auto it = tracked_metadata_by_key_.find(key); + if (it == tracked_metadata_by_key_.end()) { + return absl::InvalidArgumentError("Record was not found"); + } + return it->second.internal_id; +} + +absl::StatusOr VectorBase::GetInternalIdDuringSearch( + const InternedStringPtr &key) const { + auto it = tracked_metadata_by_key_.find(key); + if (it == tracked_metadata_by_key_.end()) { + return absl::InvalidArgumentError("Record was not found"); + } + return it->second.internal_id; +} + +absl::StatusOr VectorBase::GetKeyDuringSearch( + uint64_t internal_id) const { + auto it = key_by_internal_id_.find(internal_id); + if (it == key_by_internal_id_.end()) { + return absl::InvalidArgumentError("Record was not found"); + } + return it->second; +} + +absl::StatusOr VectorBase::ModifyRecord(const InternedStringPtr &key, + absl::string_view record) { + // VectorExternalizer tracks added entries. We need to untrack mutations which + // are processed as modified records. + std::optional magnitude; + auto interned_vector = InternVector(record, magnitude); + if (!interned_vector) { + [[maybe_unused]] auto res = + RemoveRecord(key, indexes::DeletionType::kRecord); + return false; + } + VMSDK_ASSIGN_OR_RETURN(auto internal_id, GetInternalId(key)); + VMSDK_ASSIGN_OR_RETURN( + bool res, UpdateMetadata(key, magnitude.value_or(kDefaultMagnitude), + interned_vector)); + if (!res) { + return false; + } + + auto modify_result = ModifyRecordImpl(internal_id, interned_vector->Str()); + if (!modify_result.ok()) { + auto untrack_result = UnTrackKey(key); + if (!untrack_result.ok()) { + VMSDK_LOG_EVERY_N_SEC(WARNING, nullptr, 1) + << "While processing error for ModifyRecord, encountered error " + "in UntrackKey: " + << untrack_result.status().message(); } - return os; - } -}; - -const absl::NoDestructor> - kVectorAlgoByStr({ - {"HNSW", data_model::VectorIndex::AlgorithmCase::kHnswAlgorithm}, - {"FLAT", data_model::VectorIndex::AlgorithmCase::kFlatAlgorithm}, - }); - -const absl::NoDestructor< - absl::flat_hash_map> - kDistanceMetricByStr( - {{"L2", data_model::DistanceMetric::DISTANCE_METRIC_L2}, - {"IP", data_model::DistanceMetric::DISTANCE_METRIC_IP}, - {"COSINE", data_model::DistanceMetric::DISTANCE_METRIC_COSINE}}); - -const absl::NoDestructor< - absl::flat_hash_map> - kVectorDataTypeByStr({{"FLOAT32", data_model::VECTOR_DATA_TYPE_FLOAT32}}); - -template -absl::string_view LookupKeyByValue( - const absl::flat_hash_map& map, const V& value) { - auto it = std::find_if(map.begin(), map.end(), [&value](const auto& pair) { - return pair.second == value; - }); - if (it != map.end()) { - return it->first; // Return the key + } + return true; +} + +template +absl::StatusOr> VectorBase::CreateReply( + std::priority_queue> &knn_res) { + std::deque ret; + while (!knn_res.empty()) { + auto &ele = knn_res.top(); + auto vector_key = GetKeyDuringSearch(ele.second); + if (!vector_key.ok()) { + knn_res.pop(); + continue; + } + // Sorting in asc order. + ret.emplace_front(Neighbor{vector_key.value(), ele.first}); + knn_res.pop(); + } + return ret; +} + +absl::StatusOr> VectorBase::GetValue( + const InternedStringPtr &key) const { + auto it = tracked_metadata_by_key_.find(key); + if (it == tracked_metadata_by_key_.end()) { + return absl::NotFoundError("Record was not found"); + } + std::vector result; + char *value = GetValueImpl(it->second.internal_id); + if (normalize_) { + if (it->second.magnitude < 0) { + return absl::InternalError("Magnitude is not initialized"); + } + result = DenormalizeVector(absl::string_view(value, GetVectorDataSize()), + GetDataTypeSize(), it->second.magnitude); } else { - return ""; - } -} - -class VectorBase : public IndexBase, public hnswlib::VectorTracker { - public: - absl::StatusOr AddRecord(const InternedStringPtr& key, - absl::string_view record) override; - absl::StatusOr RemoveRecord(const InternedStringPtr& key, - indexes::DeletionType deletion_type = - indexes::DeletionType::kNone) override; - absl::StatusOr ModifyRecord(const InternedStringPtr& key, - absl::string_view record) override; - bool IsTracked(const InternedStringPtr& key) const override - ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); - virtual size_t GetCapacity() const = 0; - bool GetNormalize() const { return normalize_; } - std::unique_ptr ToProto() const override; - absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override; - absl::Status SaveTrackedKeys(RDBChunkOutputStream chunked_out) const - ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); - absl::Status LoadTrackedKeys(ValkeyModuleCtx* ctx, - const AttributeDataType* attribute_data_type, - SupplementalContentChunkIter&& iter); - void ForEachTrackedKey( - absl::AnyInvocable fn) const override { - absl::MutexLock lock(&key_to_metadata_mutex_); - for (const auto& [key, _] : tracked_metadata_by_key_) { - fn(key); + result.assign(value, value + GetVectorDataSize()); + } + return result; +} + +absl::StatusOr VectorBase::RemoveRecord( + const InternedStringPtr &key, + [[maybe_unused]] indexes::DeletionType deletion_type) { + VMSDK_ASSIGN_OR_RETURN(auto res, UnTrackKey(key)); + if (!res.has_value()) { + return false; + } + VMSDK_RETURN_IF_ERROR(RemoveRecordImpl(res.value())); + return true; +} + +absl::StatusOr> VectorBase::UnTrackKey( + const InternedStringPtr &key) { + if (key->Str().empty()) { + return std::nullopt; + } + absl::WriterMutexLock lock(&key_to_metadata_mutex_); + auto it = tracked_metadata_by_key_.find(key); + if (it == tracked_metadata_by_key_.end()) { + return std::nullopt; + } + auto id = it->second.internal_id; + UnTrackVector(id); + tracked_metadata_by_key_.erase(it); + auto key_by_internal_id_it = key_by_internal_id_.find(id); + if (key_by_internal_id_it == key_by_internal_id_.end()) { + return absl::InvalidArgumentError( + "Error while untracking key - key was not found in key_by_internal_id_ " + "but in internal_by_key_"); + } + key_by_internal_id_.erase(key_by_internal_id_it); + return id; +} + +char *VectorBase::TrackVector(uint64_t internal_id, char *vector, size_t len) { + auto interned_vector = StringInternStore::Intern( + absl::string_view(vector, len), vector_allocator_.get()); + TrackVector(internal_id, interned_vector); + return (char *)interned_vector->Str().data(); +} + +absl::StatusOr VectorBase::TrackKey(const InternedStringPtr &key, + float magnitude, + const InternedStringPtr &vector) { + if (key->Str().empty()) { + return absl::InvalidArgumentError("key can't be empty"); + } + absl::WriterMutexLock lock(&key_to_metadata_mutex_); + auto id = inc_id_++; + auto [_, succ] = tracked_metadata_by_key_.insert( + {key, {.internal_id = id, .magnitude = magnitude}}); + + if (!succ) { + return absl::InvalidArgumentError( + absl::StrCat("Embedding id already exists: ", key->Str())); + } + TrackVector(id, vector); + key_by_internal_id_.insert({id, key}); + return id; +} +// Return an error if the key is empty or not being tracked. +// Return false if the tracked vector matches the input vector. +// Otherwise, track the new vector and return true. +absl::StatusOr VectorBase::UpdateMetadata( + const InternedStringPtr &key, float magnitude, + const InternedStringPtr &vector) { + if (key->Str().empty()) { + return absl::InvalidArgumentError("key can't be empty"); + } + uint64_t internal_id; + { + absl::WriterMutexLock lock(&key_to_metadata_mutex_); + auto it = tracked_metadata_by_key_.find(key); + if (it == tracked_metadata_by_key_.end()) { + return absl::InvalidArgumentError( + absl::StrCat("Embedding id not found: ", key->Str())); } + it->second.magnitude = magnitude; + internal_id = it->second.internal_id; + } + if (IsVectorMatch(internal_id, vector)) { + return false; } - absl::StatusOr GetKeyDuringSearch( - uint64_t internal_id) const ABSL_NO_THREAD_SAFETY_ANALYSIS; - bool AddPrefilteredKey( - absl::string_view query, uint64_t count, const InternedStringPtr& key, - std::priority_queue>& results, - absl::flat_hash_set& top_keys) const; - vmsdk::UniqueValkeyString NormalizeStringRecord( - vmsdk::UniqueValkeyString record) const override; - uint64_t GetRecordCount() const override; - template - absl::StatusOr> CreateReply( - std::priority_queue>& knn_res); - absl::StatusOr> GetValue(const InternedStringPtr& key) const - ABSL_NO_THREAD_SAFETY_ANALYSIS; - int GetVectorDataSize() const { return GetDataTypeSize() * dimensions_; } - char* TrackVector(uint64_t internal_id, char* vector, size_t len) override; - std::shared_ptr InternVector(absl::string_view record, - std::optional& magnitude); - - protected: - VectorBase(IndexerType indexer_type, int dimensions, - data_model::AttributeDataType attribute_data_type, - absl::string_view attribute_identifier) - : IndexBase(indexer_type), - dimensions_(dimensions), - attribute_identifier_(attribute_identifier), - attribute_data_type_(attribute_data_type) -#ifndef SAN_BUILD - , - vector_allocator_(CREATE_UNIQUE_PTR( - FixedSizeAllocator, dimensions * sizeof(float) + 1, true)) -#endif // !SAN_BUILD + TrackVector(internal_id, vector); + return true; +} + +int VectorBase::RespondWithInfo(ValkeyModuleCtx *ctx) const { + ValkeyModule_ReplyWithSimpleString(ctx, "type"); + ValkeyModule_ReplyWithSimpleString(ctx, "VECTOR"); + int array_len = 2; + array_len += RespondWithInfoImpl(ctx); + ValkeyModule_ReplyWithSimpleString(ctx, "capacity"); + ValkeyModule_ReplyWithLongLong(ctx, GetCapacity()); + ValkeyModule_ReplyWithSimpleString(ctx, "size"); { + absl::MutexLock lock(&key_to_metadata_mutex_); + ValkeyModule_ReplyWithCString( + ctx, std::to_string(key_by_internal_id_.size()).c_str()); + } + return array_len + 4; +} + +absl::Status VectorBase::SaveIndex(RDBChunkOutputStream chunked_out) const { + VMSDK_RETURN_IF_ERROR(SaveIndexImpl(std::move(chunked_out))); + return absl::OkStatus(); +} + +absl::Status VectorBase::SaveTrackedKeys( + RDBChunkOutputStream chunked_out) const { + absl::ReaderMutexLock lock(&key_to_metadata_mutex_); + for (const auto &[key, metadata] : tracked_metadata_by_key_) { + data_model::TrackedKeyMetadata metadata_pb; + metadata_pb.set_key(key->Str()); + metadata_pb.set_internal_id(metadata.internal_id); + metadata_pb.set_magnitude(metadata.magnitude); + auto metadata_pb_str = metadata_pb.SerializeAsString(); + VMSDK_RETURN_IF_ERROR( + chunked_out.SaveChunk(metadata_pb_str.data(), metadata_pb_str.size())) + << "Error saving key_by_internal_id_ entry"; + } + return absl::OkStatus(); +} + +void VectorBase::ExternalizeVector(ValkeyModuleCtx *ctx, + const AttributeDataType *attribute_data_type, + absl::string_view key_cstr, + absl::string_view attribute_identifier) { + auto key_obj = vmsdk::MakeUniqueValkeyOpenKey( + ctx, vmsdk::MakeUniqueValkeyString(key_cstr).get(), + VALKEYMODULE_OPEN_KEY_NOEFFECTS | VALKEYMODULE_READ); + if (!key_obj || !attribute_data_type->IsProperType(key_obj.get())) { + return; + } + bool is_module_owned; + vmsdk::UniqueValkeyString record = VectorExternalizer::Instance().GetRecord( + ctx, attribute_data_type, key_obj.get(), key_cstr, attribute_identifier, + is_module_owned); + CHECK(!is_module_owned); + std::optional magnitude; + auto interned_key = StringInternStore::Intern(key_cstr); + auto interned_vector = + InternVector(vmsdk::ToStringView(record.get()), magnitude); + if (interned_vector) { + VectorExternalizer::Instance().Externalize( + interned_key, attribute_identifier, attribute_data_type->ToProto(), + interned_vector, magnitude); + } +} + +absl::Status VectorBase::LoadTrackedKeys( + ValkeyModuleCtx *ctx, const AttributeDataType *attribute_data_type, + SupplementalContentChunkIter &&iter) { + absl::WriterMutexLock lock(&key_to_metadata_mutex_); + while (iter.HasNext()) { + VMSDK_ASSIGN_OR_RETURN(auto metadata_str, iter.Next(), + _ << "Error loading metadata"); + data_model::TrackedKeyMetadata tracked_key_metadata; + if (!tracked_key_metadata.ParseFromString(metadata_str->binary_content())) { + return absl::InvalidArgumentError("Error parsing metadata from proto"); + } + auto interned_key = StringInternStore::Intern(tracked_key_metadata.key()); + tracked_metadata_by_key_.insert( + {interned_key, + {.internal_id = tracked_key_metadata.internal_id(), + .magnitude = tracked_key_metadata.magnitude()}}); + key_by_internal_id_.insert( + {tracked_key_metadata.internal_id(), interned_key}); + inc_id_ = std::max( + inc_id_, static_cast(tracked_key_metadata.internal_id())); + ExternalizeVector(ctx, attribute_data_type, tracked_key_metadata.key(), + attribute_identifier_); + } + ++inc_id_; + return absl::OkStatus(); +} + +std::unique_ptr VectorBase::ToProto() const { + absl::ReaderMutexLock lock(&key_to_metadata_mutex_); + auto index_proto = std::make_unique(); + auto vector_index = std::make_unique(); + vector_index->set_normalize(normalize_); + vector_index->set_distance_metric(distance_metric_); + vector_index->set_dimension_count(dimensions_); + vector_index->set_initial_cap(GetCapacity()); + ToProtoImpl(vector_index.get()); + index_proto->set_allocated_vector_index(vector_index.release()); + return index_proto; +} + +absl::StatusOr> +VectorBase::ComputeDistanceFromRecord(const InternedStringPtr &key, + absl::string_view query) const { + VMSDK_ASSIGN_OR_RETURN(auto internal_id, GetInternalIdDuringSearch(key)); + return ComputeDistanceFromRecordImpl(internal_id, query); +} + +bool VectorBase::AddPrefilteredKey( + absl::string_view query, uint64_t count, const InternedStringPtr &key, + std::priority_queue> &results, + absl::flat_hash_set &top_keys) const { + auto result = ComputeDistanceFromRecord(key, query); + if (!result.ok()) { + return false; + } + if (results.size() < count) { + results.emplace(result.value()); + return true; + } + if (result.value().first < results.top().first) { + auto top = results.top(); + auto vector_key = GetKeyDuringSearch(top.second); + top_keys.erase(vector_key.value()->Str().data()); + results.pop(); + results.emplace(result.value()); + return true; + } + return false; +} + +vmsdk::UniqueValkeyString VectorBase::NormalizeStringRecord( + vmsdk::UniqueValkeyString record) const { + CHECK_EQ(GetDataTypeSize(), sizeof(float)); + auto record_str = vmsdk::ToStringView(record.get()); + if (absl::ConsumePrefix(&record_str, "[")) { + absl::ConsumeSuffix(&record_str, "]"); + } + std::vector float_strings = + absl::StrSplit(record_str, ',', absl::SkipWhitespace()); + std::string binary_string; + binary_string.reserve(float_strings.size() * sizeof(float)); + for (const auto &float_str : float_strings) { + float value; + if (!absl::SimpleAtof(float_str, &value)) { + return nullptr; + } + binary_string += std::string((char *)&value, sizeof(float)); } + return vmsdk::MakeUniqueValkeyString(binary_string); +} + +size_t VectorBase::GetTrackedKeyCount() const { + absl::ReaderMutexLock lock(&key_to_metadata_mutex_); + return key_by_internal_id_.size(); +} + +size_t VectorBase::GetUnTrackedKeyCount() const { return 0; } + +bool VectorBase::IsTracked(const InternedStringPtr &key) const { + absl::ReaderMutexLock lock(&key_to_metadata_mutex_); + auto it = tracked_metadata_by_key_.find(key); + return (it != tracked_metadata_by_key_.end()); +} + +bool VectorBase::IsUnTracked(const InternedStringPtr &key) const { + return false; +} + +absl::Status VectorBase::ForEachTrackedKey( + absl::AnyInvocable fn) const { + absl::MutexLock lock(&key_to_metadata_mutex_); + for (const auto &[key, _] : tracked_metadata_by_key_) { + VMSDK_RETURN_IF_ERROR(fn(key)); + } + return absl::OkStatus(); +} + +absl::Status VectorBase::ForEachUnTrackedKey( + absl::AnyInvocable fn) const { + return absl::OkStatus(); +} + +template void VectorBase::Init( + int dimensions, data_model::DistanceMetric distance_metric, + std::unique_ptr> &space); + +template absl::StatusOr> VectorBase::CreateReply( + std::priority_queue> &knn_res); +} // namespace indexes - bool IsValidSizeVector(absl::string_view record) { - const auto data_type_size = GetDataTypeSize(); - int32_t dim = record.size() / GetDataTypeSize(); - return dim == dimensions_ && (record.size() % data_type_size == 0); - } - int RespondWithInfo(ValkeyModuleCtx* ctx) const override; - template - void Init(int dimensions, data_model::DistanceMetric distance_metric, - std::unique_ptr>& space); - virtual absl::Status AddRecordImpl(uint64_t internal_id, - absl::string_view record) = 0; - - virtual absl::Status RemoveRecordImpl(uint64_t internal_id) = 0; - virtual absl::Status ModifyRecordImpl(uint64_t internal_id, - absl::string_view record) = 0; - virtual int RespondWithInfoImpl(ValkeyModuleCtx* ctx) const = 0; - - virtual size_t GetDataTypeSize() const = 0; - virtual void ToProtoImpl( - data_model::VectorIndex* vector_index_proto) const = 0; - virtual absl::Status SaveIndexImpl( - RDBChunkOutputStream chunked_out) const = 0; - void ExternalizeVector(ValkeyModuleCtx* ctx, - const AttributeDataType* attribute_data_type, - absl::string_view key_cstr, - absl::string_view attribute_identifier); - virtual char* GetValueImpl(uint64_t internal_id) const = 0; - - int dimensions_; - std::string attribute_identifier_; - bool normalize_{false}; - data_model::AttributeDataType attribute_data_type_; - data_model::DistanceMetric distance_metric_; - virtual absl::StatusOr> - ComputeDistanceFromRecordImpl(uint64_t internal_id, - absl::string_view query) const = 0; - virtual void TrackVector(uint64_t internal_id, - const InternedStringPtr& vector) = 0; - virtual bool IsVectorMatch(uint64_t internal_id, - const InternedStringPtr& vector) = 0; - virtual void UnTrackVector(uint64_t internal_id) = 0; - - private: - absl::StatusOr TrackKey(const InternedStringPtr& key, - float magnitude, - const InternedStringPtr& vector) - ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); - absl::StatusOr> UnTrackKey( - const InternedStringPtr& key) ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); - absl::StatusOr UpdateMetadata(const InternedStringPtr& key, - float magnitude, - const InternedStringPtr& vector) - ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); - absl::StatusOr GetInternalId(const InternedStringPtr& key) const - ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); - absl::StatusOr GetInternalIdDuringSearch( - const InternedStringPtr& key) const ABSL_NO_THREAD_SAFETY_ANALYSIS; - absl::flat_hash_map key_by_internal_id_ - ABSL_GUARDED_BY(key_to_metadata_mutex_); - struct TrackedKeyMetadata { - uint64_t internal_id; - // If normalize_ is false, this will be -1.0f. Otherwise, it will be the - // magnitude of the vector. If the magnitude is not initialized, it will be - // -inf (this is an intermediate state during backfill when transitioning - // from the old RDB format that didn't include magnitudes). - float magnitude; - }; - - InternedStringMap tracked_metadata_by_key_ - ABSL_GUARDED_BY(key_to_metadata_mutex_); - uint64_t inc_id_ ABSL_GUARDED_BY(key_to_metadata_mutex_){0}; - mutable absl::Mutex key_to_metadata_mutex_; - absl::StatusOr> - ComputeDistanceFromRecord(const InternedStringPtr& key, - absl::string_view query) const; - UniqueFixedSizeAllocatorPtr vector_allocator_{nullptr, nullptr}; -}; - -class PrefilterEvaluator : public query::Evaluator { - public: - bool Evaluate(const query::Predicate& predicate, - const InternedStringPtr& key); - - private: - bool EvaluateTags(const query::TagPredicate& predicate) override; - bool EvaluateNumeric(const query::NumericPredicate& predicate) override; - const InternedStringPtr* key_{nullptr}; -}; - -} // namespace valkey_search::indexes - -#endif // VALKEYSEARCH_SRC_INDEXES_VECTOR_BASE_H_ +} // namespace valkey_search diff --git a/src/indexes/vector_base.h b/src/indexes/vector_base.h index c66b77cd9..aa74c7daa 100644 --- a/src/indexes/vector_base.h +++ b/src/indexes/vector_base.h @@ -119,8 +119,6 @@ class VectorBase : public IndexBase, public hnswlib::VectorTracker { indexes::DeletionType::kNone) override; absl::StatusOr ModifyRecord(const InternedStringPtr& key, absl::string_view record) override; - bool IsTracked(const InternedStringPtr& key) const override - ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); virtual size_t GetCapacity() const = 0; bool GetNormalize() const { return normalize_; } std::unique_ptr ToProto() const override; @@ -130,13 +128,19 @@ class VectorBase : public IndexBase, public hnswlib::VectorTracker { absl::Status LoadTrackedKeys(ValkeyModuleCtx* ctx, const AttributeDataType* attribute_data_type, SupplementalContentChunkIter&& iter); - void ForEachTrackedKey( - absl::AnyInvocable fn) const override { - absl::MutexLock lock(&key_to_metadata_mutex_); - for (const auto& [key, _] : tracked_metadata_by_key_) { - fn(key); - } - } + + size_t GetTrackedKeyCount() const override; + size_t GetUnTrackedKeyCount() const override; + bool IsTracked(const InternedStringPtr& key) const override + ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); + bool IsUnTracked(const InternedStringPtr& key) const override; + absl::Status ForEachTrackedKey( + absl::AnyInvocable fn) + const override; + absl::Status ForEachUnTrackedKey( + absl::AnyInvocable fn) + const override; + absl::StatusOr GetKeyDuringSearch( uint64_t internal_id) const ABSL_NO_THREAD_SAFETY_ANALYSIS; bool AddPrefilteredKey( @@ -145,7 +149,6 @@ class VectorBase : public IndexBase, public hnswlib::VectorTracker { absl::flat_hash_set& top_keys) const; vmsdk::UniqueValkeyString NormalizeStringRecord( vmsdk::UniqueValkeyString record) const override; - uint64_t GetRecordCount() const override; template absl::StatusOr> CreateReply( std::priority_queue>& knn_res); diff --git a/src/rdb_section.proto b/src/rdb_section.proto index 51ea732fd..5b37ded14 100644 --- a/src/rdb_section.proto +++ b/src/rdb_section.proto @@ -38,18 +38,9 @@ message RDBSection { enum SupplementalContentType { SUPPLEMENTAL_CONTENT_UNSPECIFIED = 0; - - // For Vector Indexes, this is the contents of the index. For Non-vector indexes this is empty (just a header) - SUPPLEMENTAL_CONTENT_INDEX_CONTENT = 1; // Header: IndexContentHeader - - // For HNSW Indexes, this contains HNSWLib internal ID to key mapping - SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP = 2; // Header: KeyToIDMappingHeader - - // V2. Only for non-vector indexes. Contents of the index - SUPPLEMENTAL_CONTENT_INDEX_EXTENSION = 3; // V2 extension for index contents, Header: IndexContentHeader - - // V2. Saved Mutation Queue - SUPPLEMENTAL_CONTENT_MUTATION_QUEUE = 4; // V2 saved mutation queue, Header: MutationQueueHeader + SUPPLEMENTAL_CONTENT_INDEX_CONTENT = 1; + SUPPLEMENTAL_CONTENT_KEY_TO_ID_MAP = 2; + SUPPLEMENTAL_CONTENT_INDEX_EXTENSION = 3; } message IndexContentHeader { diff --git a/testing/common.h b/testing/common.h index bad578c4b..dadcd1f85 100644 --- a/testing/common.h +++ b/testing/common.h @@ -92,21 +92,25 @@ class MockIndex : public indexes::IndexBase { MOCK_METHOD(absl::StatusOr, ModifyRecord, (const InternedStringPtr& key, absl::string_view data), (override)); - MOCK_METHOD(bool, IsTracked, (const InternedStringPtr& key), - (const, override)); MOCK_METHOD(std::unique_ptr, ToProto, (), (const, override)); MOCK_METHOD(int, RespondWithInfo, (ValkeyModuleCtx * ctx), (const, override)); MOCK_METHOD(absl::Status, SaveIndex, (RDBChunkOutputStream chunked_out), (const, override)); - MOCK_METHOD(absl::Status, SaveIndexExtension, - (RDBChunkOutputStream chunked_out), (const, override)); - MOCK_METHOD(absl::Status, LoadIndexExtension, - (RDBChunkInputStream chunked_in), (override)); - MOCK_METHOD((void), ForEachTrackedKey, - (absl::AnyInvocable fn), + MOCK_METHOD((size_t), GetTrackedKeyCount, (), (const, override)); + MOCK_METHOD((size_t), GetUnTrackedKeyCount, (), (const, override)); + MOCK_METHOD(bool, IsTracked, (const InternedStringPtr& key), + (const, override)); + MOCK_METHOD(bool, IsUnTracked, (const InternedStringPtr& key), (const, override)); - MOCK_METHOD((uint64_t), GetRecordCount, (), (const, override)); + MOCK_METHOD( + (absl::Status), ForEachTrackedKey, + (absl::AnyInvocable fn), + (const, override)); + MOCK_METHOD( + (absl::Status), ForEachUnTrackedKey, + (absl::AnyInvocable fn), + (const, override)); }; class MockKeyspaceEventSubscription : public KeyspaceEventSubscription { diff --git a/testing/index_schema_test.cc b/testing/index_schema_test.cc index 00123d80d..ad80a97bb 100644 --- a/testing/index_schema_test.cc +++ b/testing/index_schema_test.cc @@ -1281,13 +1281,16 @@ TEST_F(IndexSchemaRDBTest, LoadEndedDeletesOrphanedKeys) { absl::flat_hash_map keys_in_index = { {"key1", 1}, {"key2", 2}, {"key3", 3}}; EXPECT_CALL(*mock_index, ForEachTrackedKey(testing::_)) - .WillOnce([&keys_in_index]( - absl::AnyInvocable fn) { - for (const auto &[key, internal_id] : keys_in_index) { - InternedStringPtr interned_key = StringInternStore::Intern(key); - fn(interned_key); - } - }); + .WillOnce( + [&keys_in_index]( + absl::AnyInvocable fn) + -> absl::Status { + for (const auto &[key, internal_id] : keys_in_index) { + InternedStringPtr interned_key = StringInternStore::Intern(key); + VMSDK_RETURN_IF_ERROR(fn(interned_key)); + } + return absl::OkStatus(); + }); std::vector key_prefixes = {"prefix1", "prefix2"}; std::string index_schema_name_str("index_schema_name"); @@ -1674,7 +1677,7 @@ TEST_F(IndexSchemaRDBTest, ComprehensiveSkipLoadTest) { indexes::DeletionType::kNone); } - EXPECT_EQ(hnsw_index->GetRecordCount(), num_vectors); + EXPECT_EQ(hnsw_index->GetTrackedKeyCount(), num_vectors); VMSDK_EXPECT_OK(index_schema->RDBSave(&rdb_stream_step1)); LOG(INFO) << "✓ Step 1 completed - saved " << num_vectors << " vectors to RDB"; @@ -1709,7 +1712,7 @@ TEST_F(IndexSchemaRDBTest, ComprehensiveSkipLoadTest) { EXPECT_EQ(normal_schema->GetStats().document_cnt, num_vectors); auto vec_index = normal_schema->GetIndex("embedding"); VMSDK_EXPECT_OK_STATUSOR(vec_index); - EXPECT_EQ(vec_index.value()->GetRecordCount(), num_vectors); + EXPECT_EQ(vec_index.value()->GetTrackedKeyCount(), num_vectors); LOG(INFO) << "✓ Normal load verified - " << num_vectors << " vectors loaded"; } @@ -1773,7 +1776,7 @@ TEST_F(IndexSchemaRDBTest, ComprehensiveSkipLoadTest) { EXPECT_EQ(skip_schema->GetStats().document_cnt, num_vectors); auto vec_index = skip_schema->GetIndex("embedding"); VMSDK_EXPECT_OK_STATUSOR(vec_index); - EXPECT_EQ(vec_index.value()->GetRecordCount(), 0); + EXPECT_EQ(vec_index.value()->GetTrackedKeyCount(), 0); EXPECT_TRUE(skip_schema->IsBackfillInProgress()); LOG(INFO) << "✓ Skip load verified - index empty, backfill ready"; } @@ -1856,7 +1859,7 @@ TEST_F(IndexSchemaRDBTest, ComprehensiveSkipLoadTest) { indexes::DeletionType::kNone); } - EXPECT_EQ(hnsw_index->GetRecordCount(), num_vectors); + EXPECT_EQ(hnsw_index->GetTrackedKeyCount(), num_vectors); VMSDK_EXPECT_OK(index_schema->RDBSave(&rdb_stream_step4)); LOG(INFO) << "✓ Step 4 completed - saved mixed index with " << num_vectors << " records"; @@ -1898,7 +1901,7 @@ TEST_F(IndexSchemaRDBTest, ComprehensiveSkipLoadTest) { VMSDK_EXPECT_OK_STATUSOR(num_index); VMSDK_EXPECT_OK_STATUSOR(tag_index); - EXPECT_EQ(vec_index.value()->GetRecordCount(), num_vectors); + EXPECT_EQ(vec_index.value()->GetTrackedKeyCount(), num_vectors); LOG(INFO) << "✓ Mixed index normal load verified"; } @@ -1969,7 +1972,7 @@ TEST_F(IndexSchemaRDBTest, ComprehensiveSkipLoadTest) { VMSDK_EXPECT_OK_STATUSOR(num_index); VMSDK_EXPECT_OK_STATUSOR(tag_index); - EXPECT_EQ(vec_index.value()->GetRecordCount(), 0); + EXPECT_EQ(vec_index.value()->GetTrackedKeyCount(), 0); EXPECT_TRUE(mixed_skip_schema->IsBackfillInProgress()); LOG(INFO) << "✓ Mixed index skip load verified"; } @@ -2065,9 +2068,9 @@ TEST_F(IndexSchemaRDBTest, ComprehensiveSkipLoadTest) { indexes::DeletionType::kNone); } - EXPECT_EQ(hnsw_index1->GetRecordCount(), additional_index_vectors); - EXPECT_EQ(hnsw_index2->GetRecordCount(), additional_index_vectors); - EXPECT_EQ(flat_index->GetRecordCount(), additional_index_vectors); + EXPECT_EQ(hnsw_index1->GetTrackedKeyCount(), additional_index_vectors); + EXPECT_EQ(hnsw_index2->GetTrackedKeyCount(), additional_index_vectors); + EXPECT_EQ(flat_index->GetTrackedKeyCount(), additional_index_vectors); VMSDK_EXPECT_OK(index_schema->RDBSave(&rdb_stream_multi)); LOG(INFO) << "✓ Steps 6-7 completed - saved 3 indexes with " diff --git a/testing/numeric_index_test.cc b/testing/numeric_index_test.cc index cfaf29c33..2ffeb6960 100644 --- a/testing/numeric_index_test.cc +++ b/testing/numeric_index_test.cc @@ -116,7 +116,7 @@ TEST_F(NumericIndexTest, ModifyWithNonNumericString) { fetcher = index.Search(predicate, false); EXPECT_EQ(Fetch(*fetcher).size(), 0); - EXPECT_EQ(index.GetRecordCount(), 0); + EXPECT_EQ(index.GetTrackedKeyCount(), 0); } TEST_F(NumericIndexTest, RangeSearchInclusiveExclusive) { diff --git a/testing/tag_index_test.cc b/testing/tag_index_test.cc index 111cc4b0e..e7958a9ab 100644 --- a/testing/tag_index_test.cc +++ b/testing/tag_index_test.cc @@ -114,7 +114,7 @@ TEST_F(TagIndexTest, ModifyRecordWithEmptyString) { auto entries_fetcher = index->Search(predicate, false); EXPECT_EQ(entries_fetcher->Size(), 0); - EXPECT_EQ(index->GetRecordCount(), 0); + EXPECT_EQ(index->GetTrackedKeyCount(), 0); } TEST_F(TagIndexTest, KeyTrackingTest) { diff --git a/vmsdk/src/info.h b/vmsdk/src/info.h index 5e1b79fe5..469265df3 100644 --- a/vmsdk/src/info.h +++ b/vmsdk/src/info.h @@ -355,6 +355,13 @@ void DoSections(ValkeyModuleInfoCtx* ctx, int for_crash_report); absl::Status ShowInfo(ValkeyModuleCtx* ctx, vmsdk::ArgsIterator& itr, const vmsdk::module::Options& options); +// +// Some often used declarations +// +#define DEV_INTEGER_COUNTER(section, name) \ + static vmsdk::info_field::Integer name( \ + #section, #name, vmsdk::info_field::IntegerBuilder().Dev()) + } // namespace info_field } // namespace vmsdk From d535c91825338cc7c3cf73e49fd00a076399f1c4 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Thu, 16 Oct 2025 18:07:19 +0000 Subject: [PATCH 14/33] hopefully fix valkey server version Signed-off-by: Allen Samuels --- scripts/common.rc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/common.rc b/scripts/common.rc index a89003f41..2f50f5d19 100644 --- a/scripts/common.rc +++ b/scripts/common.rc @@ -166,7 +166,7 @@ function setup_valkey_server() { fi # Clone and build it - VALKEY_VERSION="${VALKEY_VERSION:=8.1.1}" + VALKEY_VERSION="${VALKEY_VERSION:=9.0.0-rc3}" export VALKEY_SERVER_HOME_DIR=$(get_third_party_build_dir)/valkey-server export VALKEY_SERVER_BUILD_DIR=${VALKEY_SERVER_HOME_DIR}/.build-release if [ ! -d ${VALKEY_SERVER_HOME_DIR} ]; then From df9cd259114dd7adbd540635e2bf6eec64def0b5 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 29 Oct 2025 05:00:27 +0000 Subject: [PATCH 15/33] Revise per review. Add save/restore of multi/exec Signed-off-by: Allen Samuels --- integration/test_saverestore.py | 32 +++++++ src/index_schema.cc | 151 +++++++++++++++++++++----------- src/index_schema.h | 2 +- src/indexes/numeric.h | 18 ++-- src/indexes/tag.h | 18 ++-- src/indexes/vector_base.h | 19 ++-- 6 files changed, 165 insertions(+), 75 deletions(-) diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py index 310feab23..21043d0d8 100644 --- a/integration/test_saverestore.py +++ b/integration/test_saverestore.py @@ -202,6 +202,38 @@ def test_mutation_queue(self): ] assert reads == [len(records)] + def test_multi_exec_queue(self): + self.client.execute_command("ft._debug PAUSEPOINT SET block_mutation_queue") + self.client.execute_command("CONFIG SET search.info-developer-visible yes") + index.create(self.client, True) + records = make_data() + # + # Now, load the data as a multi/exec... But this won't block us. + # + self.client.execute_command("MULTI") + for i in range(len(records)): + index.write_data(self.client, i, records[i]) + self.client.execute_command("EXEC") + + self.client.execute_command("save") + + i = self.client.info("search") + assert i["search_rdb_save_multi_exec_entries"] == len(records) + + self.client.execute_command("ft._debug pausepoint reset block_mutation_queue") + + verify_data(self.client) + os.environ["SKIPLOGCLEAN"] = "1" + self.server.restart(remove_rdb=False) + verify_data(self.client) + self.client.execute_command("CONFIG SET search.info-developer-visible yes") + i = self.client.info("search") + print("Info: ", i) + reads = [ + i["search_rdb_load_multi_exec_entries"], + ] + assert reads == [len(records)] + def test_saverestore_backfill(self): # # Delay the backfill and ensure that with new format we will trigger the backfill.... diff --git a/src/index_schema.cc b/src/index_schema.cc index 087970c44..f2286bfac 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -90,6 +90,8 @@ DEV_INTEGER_COUNTER(rdb_stats, rdb_load_keys); DEV_INTEGER_COUNTER(rdb_stats, rdb_save_sections); DEV_INTEGER_COUNTER(rdb_stats, rdb_load_sections); DEV_INTEGER_COUNTER(rdb_stats, rdb_load_sections_skipped); +DEV_INTEGER_COUNTER(rdb_stats, rdb_save_multi_exec_entries); +DEV_INTEGER_COUNTER(rdb_stats, rdb_load_multi_exec_entries); DEV_INTEGER_COUNTER(rdb_stats, rdb_save_mutation_entries); DEV_INTEGER_COUNTER(rdb_stats, rdb_load_mutation_entries); @@ -563,7 +565,7 @@ void IndexSchema::ProcessMultiQueue() { vmsdk::WriterMutexLock lock(&time_sliced_mutex_); while (!multi_mutations.keys.empty()) { auto key = multi_mutations.keys.front(); - multi_mutations.keys.pop(); + multi_mutations.keys.pop_front(); ScheduleMutation(false, key, vmsdk::ThreadPool::Priority::kMax, multi_mutations.blocking_counter.get()); } @@ -573,7 +575,7 @@ void IndexSchema::ProcessMultiQueue() { void IndexSchema::EnqueueMultiMutation(const InternedStringPtr &key) { auto &multi_mutations = multi_mutations_.Get(); - multi_mutations.keys.push(key); + multi_mutations.keys.push_back(key); if (multi_mutations.keys.size() >= mutations_thread_pool_->Size() && !schedule_multi_exec_processing_.Get()) { schedule_multi_exec_processing_.Get() = true; @@ -972,40 +974,38 @@ absl::Status IndexSchema::ValidateIndex() const { for (const auto &[name, attr] : attributes_) { auto idx = attr.GetIndex(); size_t cnt = idx->GetTrackedKeyCount() + idx->GetUnTrackedKeyCount(); - if (cnt != oracle_key_count) { - if (IsVectorIndex(idx) && cnt < oracle_key_count) { - continue; - } - VMSDK_LOG(WARNING, nullptr) - << "Index validation failed for index " << name - << " expected key count " << oracle_key_count << " got " << cnt; - // - // Ok, do a detailed comparison - // - auto larger_index = (cnt > oracle_key_count) ? idx : oracle_index; - auto larger_name = (cnt > oracle_key_count) ? name : oracle_name; - auto smaller_index = (cnt > oracle_key_count) ? oracle_index : idx; - auto smaller_name = (cnt > oracle_key_count) ? oracle_name : name; - auto key_check = [&](const InternedStringPtr &key) { - if (!smaller_index->IsTracked(key) && - !smaller_index->IsUnTracked(key)) { - VMSDK_LOG(WARNING, nullptr) - << "Key found in " << larger_name << " not found in " - << smaller_name << ": " << key->Str(); - status = absl::InternalError( - absl::StrCat("Key found in ", larger_name, " not found in ", - smaller_name, ": ", key->Str())); - return absl::OkStatus(); - } - }; - auto status1 = larger_index->ForEachTrackedKey(key_check); - if (!status1.ok()) { - status = status1; - } - auto status2 = larger_index->ForEachUnTrackedKey(key_check); - if (!status2.ok()) { - status = status1; + if (IsVectorIndex(idx) ? cnt <= oracle_key_count + : cnt == oracle_key_count) { + continue; + } + VMSDK_LOG(WARNING, nullptr) + << "Index validation failed for index " << name + << " expected key count " << oracle_key_count << " got " << cnt; + // + // Ok, do a detailed comparison + // + auto larger_index = (cnt > oracle_key_count) ? idx : oracle_index; + auto larger_name = (cnt > oracle_key_count) ? name : oracle_name; + auto smaller_index = (cnt > oracle_key_count) ? oracle_index : idx; + auto smaller_name = (cnt > oracle_key_count) ? oracle_name : name; + auto key_check = [&](const InternedStringPtr &key) { + if (!smaller_index->IsTracked(key) && !smaller_index->IsUnTracked(key)) { + VMSDK_LOG(WARNING, nullptr) + << "Key found in " << larger_name << " not found in " + << smaller_name << ": " << key->Str(); + status = absl::InternalError( + absl::StrCat("Key found in ", larger_name, " not found in ", + smaller_name, ": ", key->Str())); } + return absl::OkStatus(); + }; + auto status1 = larger_index->ForEachTrackedKey(key_check); + if (!status1.ok()) { + status = status1; + } + auto status2 = larger_index->ForEachUnTrackedKey(key_check); + if (!status2.ok()) { + status = status2; } } return status; @@ -1016,31 +1016,47 @@ absl::Status IndexSchema::SaveIndexExtension(RDBChunkOutputStream out) const { VMSDK_RETURN_IF_ERROR(ValidateIndex()); } // - // Need to find an attribute index that has the right tracked/untracked - // keys. Any non-vector index will do. But it there are only vector - // indexes we will use that. + // To reconstruct an index-schema, we want to ingest all of the keys that are + // currently within the index. If there is a non-vector index, we can use the + // tracked and untracked key lists from that index. If there is ONLY vector + // indexes, then this key list is not needed as there aren't any non-vector + // indexes to ingest. + // + // The V1 format doesn't have this list and substitutes a backfill to rebuild. + // In the absence of support for SKIPINITIALSCAN the backfill is sufficient to + // determine which keys are in the index. However, once we support this option + // it's no longer possible to determine which keys are in the index without + // storing them explicitly. Thus the V2 format includes this key list + // explicitly which will trivially enable the SKIPINITIALSCAN option. // - auto index = attributes_.begin()->second.GetIndex(); + std::shared_ptr index; for (const auto &attribute : attributes_) { if (!IsVectorIndex(attribute.second.GetIndex())) { index = attribute.second.GetIndex(); break; } } - size_t key_count = - index->GetTrackedKeyCount() + index->GetUnTrackedKeyCount(); - VMSDK_RETURN_IF_ERROR(out.SaveObject(key_count)); - rdb_save_keys.Increment(key_count); - VMSDK_LOG(DEBUG, nullptr) << "Writing Index Extension, keys = " << key_count; - - auto write_a_key = [&](const InternedStringPtr &key) { - key_count--; - return out.SaveString(key->Str()); - }; - VMSDK_RETURN_IF_ERROR(index->ForEachTrackedKey(write_a_key)); - VMSDK_RETURN_IF_ERROR(index->ForEachUnTrackedKey(write_a_key)); - CHECK(key_count == 0) << "Key count mismatch for index " << GetName(); - + if (!index) { + VMSDK_RETURN_IF_ERROR(out.SaveObject(0)); // zero keys + } else { + size_t key_count = + index->GetTrackedKeyCount() + index->GetUnTrackedKeyCount(); + VMSDK_RETURN_IF_ERROR(out.SaveObject(key_count)); + rdb_save_keys.Increment(key_count); + VMSDK_LOG(DEBUG, nullptr) + << "Writing Index Extension, keys = " << key_count; + + auto write_a_key = [&](const InternedStringPtr &key) { + key_count--; + return out.SaveString(key->Str()); + }; + VMSDK_RETURN_IF_ERROR(index->ForEachTrackedKey(write_a_key)); + VMSDK_RETURN_IF_ERROR(index->ForEachUnTrackedKey(write_a_key)); + CHECK(key_count == 0) << "Key count mismatch for index " << GetName(); + } + // + // Write out the mutation queue entries + // VMSDK_LOG(DEBUG, nullptr) << "Writing Mutation Queue, records = " << tracked_mutated_records_.size(); VMSDK_RETURN_IF_ERROR(out.SaveObject(tracked_mutated_records_.size())); @@ -1048,6 +1064,18 @@ absl::Status IndexSchema::SaveIndexExtension(RDBChunkOutputStream out) const { for (const auto &[key, value] : tracked_mutated_records_) { VMSDK_RETURN_IF_ERROR(out.SaveString(key->Str())); } + // + // Write out the multi/exec queued keys + // + VMSDK_RETURN_IF_ERROR( + out.SaveObject(multi_mutations_.Get().keys.size())); + rdb_save_multi_exec_entries.Increment(multi_mutations_.Get().keys.size()); + VMSDK_LOG(DEBUG, nullptr) << "Writing Multi/Exec Queue, records = " + << multi_mutations_.Get().keys.size(); + for (const auto &key : multi_mutations_.Get().keys) { + CHECK(tracked_mutated_records_.find(key) != tracked_mutated_records_.end()); + VMSDK_RETURN_IF_ERROR(out.SaveString(key->Str())); + } return absl::OkStatus(); } @@ -1069,6 +1097,23 @@ absl::Status IndexSchema::LoadIndexExtension(ValkeyModuleCtx *ctx, auto keyname = vmsdk::MakeUniqueValkeyString(keyname_str); ProcessKeyspaceNotification(ctx, keyname.get(), false); } + VMSDK_ASSIGN_OR_RETURN(size_t multi_count, input.LoadObject()); + rdb_load_multi_exec_entries.Increment(multi_count); + VMSDK_LOG(DEBUG, ctx) << "Loading Multi/Exec Entries, entries = " + << multi_count; + for (size_t i = 0; i < count; ++i) { + VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); + if (tracked_mutated_records_.find(StringInternStore::Intern(keyname_str)) == + tracked_mutated_records_.end()) { + VMSDK_LOG(WARNING, ctx) + << "Multi/Exec key not found in mutation tracked records: " + << keyname_str; + return absl::InternalError( + "Multi/Exec key not found in mutation tracked records"); + } + auto keyname = StringInternStore::Intern(keyname_str); + EnqueueMultiMutation(keyname); + } loaded_v2_ = true; return absl::OkStatus(); } diff --git a/src/index_schema.h b/src/index_schema.h index 42544383f..69c3676ce 100644 --- a/src/index_schema.h +++ b/src/index_schema.h @@ -270,7 +270,7 @@ class IndexSchema : public KeyspaceEventSubscription, mutable vmsdk::TimeSlicedMRMWMutex time_sliced_mutex_; struct MultiMutations { std::unique_ptr blocking_counter; - std::queue keys; + std::deque keys; }; vmsdk::MainThreadAccessGuard multi_mutations_; vmsdk::MainThreadAccessGuard schedule_multi_exec_processing_{false}; diff --git a/src/indexes/numeric.h b/src/indexes/numeric.h index 9459df3c5..cda44ebc3 100644 --- a/src/indexes/numeric.h +++ b/src/indexes/numeric.h @@ -92,21 +92,25 @@ class Numeric : public IndexBase { absl::StatusOr ModifyRecord(const InternedStringPtr& key, absl::string_view data) override ABSL_LOCKS_EXCLUDED(index_mutex_); - int RespondWithInfo(ValkeyModuleCtx* ctx) const override; + int RespondWithInfo(ValkeyModuleCtx* ctx) const override + ABSL_LOCKS_EXCLUDED(index_mutex_); absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { return absl::OkStatus(); } - size_t GetTrackedKeyCount() const override; - size_t GetUnTrackedKeyCount() const override; - bool IsTracked(const InternedStringPtr& key) const override; - bool IsUnTracked(const InternedStringPtr& key) const override; + size_t GetTrackedKeyCount() const override ABSL_LOCKS_EXCLUDED(index_mutex_); + size_t GetUnTrackedKeyCount() const override + ABSL_LOCKS_EXCLUDED(index_mutex_); + bool IsTracked(const InternedStringPtr& key) const override + ABSL_LOCKS_EXCLUDED(index_mutex_); + bool IsUnTracked(const InternedStringPtr& key) const override + ABSL_LOCKS_EXCLUDED(index_mutex_); absl::Status ForEachTrackedKey( absl::AnyInvocable fn) - const override; + const override ABSL_LOCKS_EXCLUDED(index_mutex_); absl::Status ForEachUnTrackedKey( absl::AnyInvocable fn) - const override; + const override ABSL_LOCKS_EXCLUDED(index_mutex_); std::unique_ptr ToProto() const override; diff --git a/src/indexes/tag.h b/src/indexes/tag.h index f8339d019..f433e88db 100644 --- a/src/indexes/tag.h +++ b/src/indexes/tag.h @@ -42,21 +42,25 @@ class Tag : public IndexBase { absl::StatusOr ModifyRecord(const InternedStringPtr& key, absl::string_view data) override ABSL_LOCKS_EXCLUDED(index_mutex_); - int RespondWithInfo(ValkeyModuleCtx* ctx) const override; + int RespondWithInfo(ValkeyModuleCtx* ctx) const override + ABSL_LOCKS_EXCLUDED(index_mutex_); absl::Status SaveIndex(RDBChunkOutputStream chunked_out) const override { return absl::OkStatus(); } - size_t GetTrackedKeyCount() const override; - size_t GetUnTrackedKeyCount() const override; - bool IsTracked(const InternedStringPtr& key) const override; - bool IsUnTracked(const InternedStringPtr& key) const override; + size_t GetTrackedKeyCount() const override ABSL_LOCKS_EXCLUDED(index_mutex_); + size_t GetUnTrackedKeyCount() const override + ABSL_LOCKS_EXCLUDED(index_mutex_); + bool IsTracked(const InternedStringPtr& key) const override + ABSL_LOCKS_EXCLUDED(index_mutex_); + bool IsUnTracked(const InternedStringPtr& key) const override + ABSL_LOCKS_EXCLUDED(index_mutex_); absl::Status ForEachTrackedKey( absl::AnyInvocable fn) - const override; + const override ABSL_LOCKS_EXCLUDED(index_mutex_); absl::Status ForEachUnTrackedKey( absl::AnyInvocable fn) - const override; + const override ABSL_LOCKS_EXCLUDED(index_mutex_); std::unique_ptr ToProto() const override; InternedStringPtr GetRawValue(const InternedStringPtr& key) const diff --git a/src/indexes/vector_base.h b/src/indexes/vector_base.h index aa74c7daa..f23894f5b 100644 --- a/src/indexes/vector_base.h +++ b/src/indexes/vector_base.h @@ -113,12 +113,15 @@ absl::string_view LookupKeyByValue( class VectorBase : public IndexBase, public hnswlib::VectorTracker { public: absl::StatusOr AddRecord(const InternedStringPtr& key, - absl::string_view record) override; + absl::string_view record) override + ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); absl::StatusOr RemoveRecord(const InternedStringPtr& key, indexes::DeletionType deletion_type = - indexes::DeletionType::kNone) override; + indexes::DeletionType::kNone) override + ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); absl::StatusOr ModifyRecord(const InternedStringPtr& key, - absl::string_view record) override; + absl::string_view record) override + ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); virtual size_t GetCapacity() const = 0; bool GetNormalize() const { return normalize_; } std::unique_ptr ToProto() const override; @@ -129,17 +132,19 @@ class VectorBase : public IndexBase, public hnswlib::VectorTracker { const AttributeDataType* attribute_data_type, SupplementalContentChunkIter&& iter); - size_t GetTrackedKeyCount() const override; - size_t GetUnTrackedKeyCount() const override; + size_t GetTrackedKeyCount() const override + ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); + size_t GetUnTrackedKeyCount() const override + ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); bool IsTracked(const InternedStringPtr& key) const override ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); bool IsUnTracked(const InternedStringPtr& key) const override; absl::Status ForEachTrackedKey( absl::AnyInvocable fn) - const override; + const override ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); absl::Status ForEachUnTrackedKey( absl::AnyInvocable fn) - const override; + const override ABSL_LOCKS_EXCLUDED(key_to_metadata_mutex_); absl::StatusOr GetKeyDuringSearch( uint64_t internal_id) const ABSL_NO_THREAD_SAFETY_ANALYSIS; From c0f27dd07b8b623a01f1d0977883cfcdae747302 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Thu, 30 Oct 2025 00:47:37 +0000 Subject: [PATCH 16/33] Fix validation of vector-only indexes. Add test cases for same. Signed-off-by: Allen Samuels --- integration/indexes.py | 3 ++ integration/test_saverestore.py | 82 ++++++++++++++++++++++++--------- src/index_schema.cc | 15 ++++-- 3 files changed, 73 insertions(+), 27 deletions(-) diff --git a/integration/indexes.py b/integration/indexes.py index 1d7ef7d9f..28867b06e 100644 --- a/integration/indexes.py +++ b/integration/indexes.py @@ -200,3 +200,6 @@ def info(self, client: valkey.client) -> FTInfoParser: def backfill_complete(self, client: valkey.client) -> bool: res = self.info(client) return res.backfill_in_progress == 0 + + def has_field(self, name: str) -> bool: + return any(f.name == name for f in self.fields) diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py index 21043d0d8..4603e3daa 100644 --- a/integration/test_saverestore.py +++ b/integration/test_saverestore.py @@ -1,4 +1,5 @@ import base64 +from math import exp import os import tempfile import time @@ -14,8 +15,12 @@ from util import waiters import threading from ft_info_parser import FTInfoParser +from typing import Any, List index = Index("index", [Vector("v", 3, type="HNSW", m=2, efc=1), Numeric("n"), Tag("t")]) +vector_only_index = Index("vector_only_index", [Vector("v", 3, type="HNSW", m=2, efc=1)]) +non_vector_index = Index("non_vector_index", [Numeric("n"), Tag("t")]) + NUM_VECTORS = 10 # Keys that are in all results @@ -24,12 +29,15 @@ def check_keys(received_keys, expected_keys): received_set = set(received_keys) expected_set = set(expected_keys) - print("Result.keys ", received_set) - print("expected.keys", expected_set) + if received_set != expected_set: + print("Result.keys ", received_set) + print("expected.keys", expected_set) + print("Difference(extra received): ", received_set - expected_set) + print("Difference(extra expected): ", expected_set - received_set) assert received_set == expected_set -def do_search(client: Valkey.client, query: str, extra: list[str] = []) -> dict[str, dict[str, str]]: - cmd = ["ft.search index", query, "limit", "0", "100"] + extra +def do_search(client: Valkey.client, index: Index, query: str, extra: list[str] = []) -> dict[str, dict[str, str]]: + cmd = ["ft.search", index.name, query, "limit", "0", "100"] + extra print("Cmd: ", cmd) res = client.execute_command(*cmd)[1:] result = dict() @@ -60,26 +68,29 @@ def make_data(): records += [data] return records +KEY_COUNT = len(make_data()) + def load_data(client: Valkey.client): records = make_data() for i in range(0, len(records)): index.write_data(client, i, records[i]) return len(records) -def verify_data(client: Valkey.client): +def verify_data(client: Valkey.client, this_index: Index): ''' Do query operations against each index to ensure that all keys are present ''' - - res = do_search(client, "@n:[0 100]") - check_keys(res.keys(), full_key_names + [index.keyname(NUM_VECTORS+0).encode(), index.keyname(NUM_VECTORS+2).encode()]) - res = do_search(client, "@t:{Tag*}") - check_keys(res.keys(), full_key_names + [index.keyname(NUM_VECTORS+0).encode(), index.keyname(NUM_VECTORS+1).encode()]) - -def do_save_restore_test(test, write_v2: bool, read_v2: bool): + if this_index.has_field("n"): + res = do_search(client, this_index, "@n:[0 100]") + check_keys(res.keys(), full_key_names + [index.keyname(NUM_VECTORS+0).encode(), index.keyname(NUM_VECTORS+2).encode()]) + if this_index.has_field("t"): + res = do_search(client, this_index, "@t:{Tag*}") + check_keys(res.keys(), full_key_names + [index.keyname(NUM_VECTORS+0).encode(), index.keyname(NUM_VECTORS+1).encode()]) + +def do_save_restore_test(test, index: Index, expected_writes: List[int], expected_reads: List[int]): index.create(test.client, True) key_count = load_data(test.client) - verify_data(test.client) + verify_data(test.client, index) test.client.config_set("search.rdb-validate-on-write", "yes") test.client.execute_command("save") os.environ["SKIPLOGCLEAN"] = "1" @@ -91,14 +102,17 @@ def do_save_restore_test(test, write_v2: bool, read_v2: bool): i["search_rdb_save_keys"], i["search_rdb_save_mutation_entries"], ] + assert writes == expected_writes + ''' if write_v2: assert writes == [5, key_count, 0] else: assert writes == [4, 0, 0] + ''' test.server.restart(remove_rdb=False) time.sleep(5) print(test.client.ping()) - verify_data(test.client) + verify_data(test.client, index) test.client.execute_command("CONFIG SET search.info-developer-visible yes") i = test.client.info("search") @@ -109,6 +123,8 @@ def do_save_restore_test(test, write_v2: bool, read_v2: bool): i["search_rdb_load_keys"], i["search_rdb_load_mutation_entries"], ] + assert reads == expected_reads + ''' if not write_v2: assert reads == [4, 0, 0, 0] elif read_v2: @@ -116,15 +132,20 @@ def do_save_restore_test(test, write_v2: bool, read_v2: bool): else: assert reads == [5, 1, 0, 0] + ''' class TestSaveRestore_v1_v1(ValkeySearchTestCaseDebugMode): def append_startup_args(self, args): args["search.rdb_write_v2"] = "no" args["search.rdb_read_v2"] = "no" return args - - def test_saverestore_v1_v1(self): - do_save_restore_test(self, False, False) + @pytest.mark.parametrize("parameters", [ + [index, [4, 0, 0], [4, 0, 0, 0]], + [vector_only_index, [2, 0, 0], [2, 0, 0, 0]], + [non_vector_index, [2, 0, 0], [2, 0, 0, 0]], + ]) + def test_saverestore_v1_v1(self, parameters): + do_save_restore_test(self, parameters[0], parameters[1], parameters[2]) class TestSaveRestore_v1_v2(ValkeySearchTestCaseDebugMode): def append_startup_args(self, args): @@ -132,8 +153,13 @@ def append_startup_args(self, args): args["search.rdb_read_v2"] = "yes" return args - def test_saverestore_v1_v2(self): - do_save_restore_test(self, False, True) + @pytest.mark.parametrize("parameters", [ + [index, [4, 0, 0], [4, 0, 0, 0]], + [vector_only_index, [2, 0, 0], [2, 0, 0, 0]], + [non_vector_index, [2, 0, 0], [2, 0, 0, 0]], + ]) + def test_saverestore_v1_v2(self, parameters): + do_save_restore_test(self, parameters[0], parameters[1], parameters[2]) class TestSaveRestore_v2_v1(ValkeySearchTestCaseDebugMode): def append_startup_args(self, args): @@ -141,8 +167,13 @@ def append_startup_args(self, args): args["search.rdb_read_v2"] = "no" return args - def test_saverestore_v2_v1(self): - do_save_restore_test(self, True, False) + @pytest.mark.parametrize("parameters", [ + [index, [5, KEY_COUNT, 0], [5, 1, 0, 0]], + [vector_only_index, [3, 0, 0], [3, 1, 0, 0]], + [non_vector_index, [3, KEY_COUNT, 0], [3, 1, 0, 0]], + ]) + def test_saverestore_v2_v1(self, parameters): + do_save_restore_test(self, parameters[0], parameters[1], parameters[2]) class TestSaveRestore_v2_v2(ValkeySearchTestCaseDebugMode): def append_startup_args(self, args): @@ -150,8 +181,13 @@ def append_startup_args(self, args): args["search.rdb_read_v2"] = "yes" return args - def test_saverestore_v2_v2(self): - do_save_restore_test(self, True, True) + @pytest.mark.parametrize("parameters", [ + [index, [5, KEY_COUNT, 0], [5, 0, KEY_COUNT, 0]], + [vector_only_index, [3, 0, 0], [3, 0, 0, 0]], + [non_vector_index, [3, KEY_COUNT, 0], [3, 0, KEY_COUNT, 0]], + ]) + def test_saverestore_v2_v2(self, parameters): + do_save_restore_test(self, parameters[0], parameters[1], parameters[2]) class TestMutationQueue(ValkeySearchTestCaseDebugMode): def append_startup_args(self, args): diff --git a/src/index_schema.cc b/src/index_schema.cc index f2286bfac..1df7bcb07 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -953,11 +953,13 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { absl::Status IndexSchema::ValidateIndex() const { absl::Status status = absl::OkStatus(); + // + // Find a non-vector index as the oracle + // If all indexes are vector indexes, no validation is needed // - // Again, find a non-vector index as the oracle - // - auto oracle_index = attributes_.begin()->second.GetIndex(); - auto oracle_name = attributes_.begin()->first; + std::shared_ptr oracle_index = nullptr; + std::string oracle_name; + for (const auto &attribute : attributes_) { if (!IsVectorIndex(attribute.second.GetIndex())) { oracle_index = attribute.second.GetIndex(); @@ -965,6 +967,11 @@ absl::Status IndexSchema::ValidateIndex() const { break; } } + + // If no non-vector index found, all indexes are vectors - no validation needed + if (oracle_index == nullptr) { + return absl::OkStatus(); + } size_t oracle_key_count = oracle_index->GetTrackedKeyCount() + oracle_index->GetUnTrackedKeyCount(); // From 058e7d8abe32a92c4000322ceb9e70eeba69287f Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Thu, 30 Oct 2025 02:45:44 +0000 Subject: [PATCH 17/33] review comments update Signed-off-by: Allen Samuels --- src/index_schema.cc | 14 +++++++------- src/rdb_serialization.h | 2 ++ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/index_schema.cc b/src/index_schema.cc index 1df7bcb07..e24c54cf7 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -953,13 +953,13 @@ absl::Status IndexSchema::RDBSave(SafeRDB *rdb) const { absl::Status IndexSchema::ValidateIndex() const { absl::Status status = absl::OkStatus(); - // + // // Find a non-vector index as the oracle // If all indexes are vector indexes, no validation is needed // std::shared_ptr oracle_index = nullptr; std::string oracle_name; - + for (const auto &attribute : attributes_) { if (!IsVectorIndex(attribute.second.GetIndex())) { oracle_index = attribute.second.GetIndex(); @@ -967,8 +967,9 @@ absl::Status IndexSchema::ValidateIndex() const { break; } } - - // If no non-vector index found, all indexes are vectors - no validation needed + + // If no non-vector index found, all indexes are vectors - no validation + // needed if (oracle_index == nullptr) { return absl::OkStatus(); } @@ -1207,9 +1208,8 @@ absl::StatusOr> IndexSchema::LoadFromRDB( if (index_schema) { VMSDK_RETURN_IF_ERROR(index_schema->LoadIndexExtension( ctx, RDBChunkInputStream(supplemental_iter.IterateChunks()))); - bool backfilling = - supplemental_content->mutation_queue_header().backfilling(); - if (!backfilling) { + if (!supplemental_content->mutation_queue_header() + .backfilling()) { VMSDK_LOG(DEBUG, ctx) << "Backfill suppressed."; index_schema->backfill_job_.Get() = std::nullopt; } diff --git a/src/rdb_serialization.h b/src/rdb_serialization.h index a19c0b251..a619dcac5 100644 --- a/src/rdb_serialization.h +++ b/src/rdb_serialization.h @@ -319,6 +319,8 @@ class RDBChunkInputStream : public hnswlib::InputStream { absl::StatusOr LoadObject() { VMSDK_ASSIGN_OR_RETURN(auto buffer, LoadChunk()); if (buffer->size() != sizeof(T)) { + DCHECK(false) << "Mismatched size protocol error: expected " << sizeof(T) + << " got " << buffer->size(); return absl::InternalError("Mismatched size protocol error"); } return *reinterpret_cast(buffer->data()); From 38189b6b9b5a8244a8cd71c125efbb6c11b54589 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 5 Nov 2025 04:07:46 +0000 Subject: [PATCH 18/33] Fix review issue Signed-off-by: Allen Samuels --- src/index_schema.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/index_schema.cc b/src/index_schema.cc index e24c54cf7..2f8e871aa 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -1071,6 +1071,8 @@ absl::Status IndexSchema::SaveIndexExtension(RDBChunkOutputStream out) const { rdb_save_mutation_entries.Increment(tracked_mutated_records_.size()); for (const auto &[key, value] : tracked_mutated_records_) { VMSDK_RETURN_IF_ERROR(out.SaveString(key->Str())); + VMSDK_RETURN_IF_ERROR(out.SaveObject(value.from_backfill)); + VMSDK_RETURN_IF_ERROR(out.SaveObject(value.from_multi)); } // // Write out the multi/exec queued keys @@ -1102,8 +1104,11 @@ absl::Status IndexSchema::LoadIndexExtension(ValkeyModuleCtx *ctx, rdb_load_mutation_entries.Increment(count); for (size_t i = 0; i < count; ++i) { VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); + VMSDK_ASSIGN_OR_RETURN(auto from_backfill, input.LoadObject()); + VMSDK_ASSIGN_OR_RETURN(auto from_multi, input.LoadObject()); + auto keyname = vmsdk::MakeUniqueValkeyString(keyname_str); - ProcessKeyspaceNotification(ctx, keyname.get(), false); + ProcessKeyspaceNotification(ctx, keyname.get(), from_backfill); } VMSDK_ASSIGN_OR_RETURN(size_t multi_count, input.LoadObject()); rdb_load_multi_exec_entries.Increment(multi_count); From 8374c6e2c9fe11f1f18ad924d98e42e4e6ef1a20 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Sun, 9 Nov 2025 23:12:28 +0000 Subject: [PATCH 19/33] Fixup merge errors Signed-off-by: Allen Samuels --- integration/test_saverestore.py | 10 +++++----- src/index_schema.cc | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py index 4603e3daa..4ba2089b7 100644 --- a/integration/test_saverestore.py +++ b/integration/test_saverestore.py @@ -226,10 +226,10 @@ def test_mutation_queue(self): for t in client_threads: t.join() - verify_data(self.client) + verify_data(self.client, index) os.environ["SKIPLOGCLEAN"] = "1" self.server.restart(remove_rdb=False) - verify_data(self.client) + verify_data(self.client, index) self.client.execute_command("CONFIG SET search.info-developer-visible yes") i = self.client.info("search") print("Info: ", i) @@ -258,10 +258,10 @@ def test_multi_exec_queue(self): self.client.execute_command("ft._debug pausepoint reset block_mutation_queue") - verify_data(self.client) + verify_data(self.client, index) os.environ["SKIPLOGCLEAN"] = "1" self.server.restart(remove_rdb=False) - verify_data(self.client) + verify_data(self.client, index) self.client.execute_command("CONFIG SET search.info-developer-visible yes") i = self.client.info("search") print("Info: ", i) @@ -281,7 +281,7 @@ def test_saverestore_backfill(self): os.environ["SKIPLOGCLEAN"] = "1" self.server.restart(remove_rdb=False) - verify_data(self.client) + verify_data(self.client, index) self.client.execute_command("CONFIG SET search.info-developer-visible yes") i = self.client.info("search") print("Info: ", i) diff --git a/src/index_schema.cc b/src/index_schema.cc index 2f8e871aa..116d1a77c 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -1114,7 +1114,7 @@ absl::Status IndexSchema::LoadIndexExtension(ValkeyModuleCtx *ctx, rdb_load_multi_exec_entries.Increment(multi_count); VMSDK_LOG(DEBUG, ctx) << "Loading Multi/Exec Entries, entries = " << multi_count; - for (size_t i = 0; i < count; ++i) { + for (size_t i = 0; i < multi_count; ++i) { VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); if (tracked_mutated_records_.find(StringInternStore::Intern(keyname_str)) == tracked_mutated_records_.end()) { From 2857cacbf9758c858bbafc33acc2b4827b4207df Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Mon, 10 Nov 2025 05:35:08 +0000 Subject: [PATCH 20/33] Stop writer threads during restore Signed-off-by: Allen Samuels --- src/index_schema.cc | 73 +++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/src/index_schema.cc b/src/index_schema.cc index 116d1a77c..bfb9b72cd 100644 --- a/src/index_schema.cc +++ b/src/index_schema.cc @@ -43,6 +43,7 @@ #include "src/metrics.h" #include "src/rdb_serialization.h" #include "src/utils/string_interning.h" +#include "src/valkey_search.h" #include "src/valkey_search_options.h" #include "src/vector_externalizer.h" #include "vmsdk/src/blocked_client.h" @@ -1051,7 +1052,7 @@ absl::Status IndexSchema::SaveIndexExtension(RDBChunkOutputStream out) const { index->GetTrackedKeyCount() + index->GetUnTrackedKeyCount(); VMSDK_RETURN_IF_ERROR(out.SaveObject(key_count)); rdb_save_keys.Increment(key_count); - VMSDK_LOG(DEBUG, nullptr) + VMSDK_LOG(NOTICE, nullptr) << "Writing Index Extension, keys = " << key_count; auto write_a_key = [&](const InternedStringPtr &key) { @@ -1065,8 +1066,8 @@ absl::Status IndexSchema::SaveIndexExtension(RDBChunkOutputStream out) const { // // Write out the mutation queue entries // - VMSDK_LOG(DEBUG, nullptr) << "Writing Mutation Queue, records = " - << tracked_mutated_records_.size(); + VMSDK_LOG(NOTICE, nullptr) << "Writing Mutation Queue, records = " + << tracked_mutated_records_.size(); VMSDK_RETURN_IF_ERROR(out.SaveObject(tracked_mutated_records_.size())); rdb_save_mutation_entries.Increment(tracked_mutated_records_.size()); for (const auto &[key, value] : tracked_mutated_records_) { @@ -1080,8 +1081,8 @@ absl::Status IndexSchema::SaveIndexExtension(RDBChunkOutputStream out) const { VMSDK_RETURN_IF_ERROR( out.SaveObject(multi_mutations_.Get().keys.size())); rdb_save_multi_exec_entries.Increment(multi_mutations_.Get().keys.size()); - VMSDK_LOG(DEBUG, nullptr) << "Writing Multi/Exec Queue, records = " - << multi_mutations_.Get().keys.size(); + VMSDK_LOG(NOTICE, nullptr) << "Writing Multi/Exec Queue, records = " + << multi_mutations_.Get().keys.size(); for (const auto &key : multi_mutations_.Get().keys) { CHECK(tracked_mutated_records_.find(key) != tracked_mutated_records_.end()); VMSDK_RETURN_IF_ERROR(out.SaveString(key->Str())); @@ -1093,42 +1094,44 @@ absl::Status IndexSchema::LoadIndexExtension(ValkeyModuleCtx *ctx, RDBChunkInputStream input) { VMSDK_ASSIGN_OR_RETURN(size_t key_count, input.LoadObject()); rdb_load_keys.Increment(key_count); - VMSDK_LOG(DEBUG, ctx) << "Loading Index Extension, keys = " << key_count; + VMSDK_LOG(NOTICE, ctx) << "Loading Index Extension, keys = " << key_count; for (size_t i = 0; i < key_count; ++i) { VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); auto keyname = vmsdk::MakeUniqueValkeyString(keyname_str); ProcessKeyspaceNotification(ctx, keyname.get(), false); } - VMSDK_ASSIGN_OR_RETURN(size_t count, input.LoadObject()); - VMSDK_LOG(DEBUG, ctx) << "Loading Mutation Entries, entries = " << count; - rdb_load_mutation_entries.Increment(count); - for (size_t i = 0; i < count; ++i) { - VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); - VMSDK_ASSIGN_OR_RETURN(auto from_backfill, input.LoadObject()); - VMSDK_ASSIGN_OR_RETURN(auto from_multi, input.LoadObject()); - - auto keyname = vmsdk::MakeUniqueValkeyString(keyname_str); - ProcessKeyspaceNotification(ctx, keyname.get(), from_backfill); - } - VMSDK_ASSIGN_OR_RETURN(size_t multi_count, input.LoadObject()); - rdb_load_multi_exec_entries.Increment(multi_count); - VMSDK_LOG(DEBUG, ctx) << "Loading Multi/Exec Entries, entries = " - << multi_count; - for (size_t i = 0; i < multi_count; ++i) { - VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); - if (tracked_mutated_records_.find(StringInternStore::Intern(keyname_str)) == - tracked_mutated_records_.end()) { - VMSDK_LOG(WARNING, ctx) - << "Multi/Exec key not found in mutation tracked records: " - << keyname_str; - return absl::InternalError( - "Multi/Exec key not found in mutation tracked records"); + // Need to suspend workers so that MultiMutation and Regular Mutation queues + // are synced + VMSDK_RETURN_IF_ERROR( + ValkeySearch::Instance().GetWriterThreadPool()->SuspendWorkers()); + auto reload_queues = [&]() -> absl::Status { + VMSDK_ASSIGN_OR_RETURN(size_t count, input.LoadObject()); + VMSDK_LOG(NOTICE, ctx) << "Loading Mutation Entries, entries = " << count; + rdb_load_mutation_entries.Increment(count); + for (size_t i = 0; i < count; ++i) { + VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); + VMSDK_ASSIGN_OR_RETURN(auto from_backfill, input.LoadObject()); + VMSDK_ASSIGN_OR_RETURN(auto from_multi, input.LoadObject()); + + auto keyname = vmsdk::MakeUniqueValkeyString(keyname_str); + ProcessKeyspaceNotification(ctx, keyname.get(), from_backfill); } - auto keyname = StringInternStore::Intern(keyname_str); - EnqueueMultiMutation(keyname); - } - loaded_v2_ = true; - return absl::OkStatus(); + VMSDK_ASSIGN_OR_RETURN(size_t multi_count, input.LoadObject()); + rdb_load_multi_exec_entries.Increment(multi_count); + VMSDK_LOG(NOTICE, ctx) << "Loading Multi/Exec Entries, entries = " + << multi_count; + for (size_t i = 0; i < multi_count; ++i) { + VMSDK_ASSIGN_OR_RETURN(auto keyname_str, input.LoadString()); + auto keyname = StringInternStore::Intern(keyname_str); + EnqueueMultiMutation(keyname); + } + loaded_v2_ = true; + return absl::OkStatus(); + }; + auto status = reload_queues(); + VMSDK_RETURN_IF_ERROR( + ValkeySearch::Instance().GetWriterThreadPool()->ResumeWorkers()); + return status; } // We need to iterate over the chunks to consume them From 0113af0d2a22d6f4f3d5efe7bf5ca523c90f3be7 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Tue, 11 Nov 2025 05:02:16 +0000 Subject: [PATCH 21/33] prevent run-away test runtimes Signed-off-by: Allen Samuels --- .github/workflows/integration_tests-asan.yml | 1 + .github/workflows/integration_tests.yml | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests-asan.yml b/.github/workflows/integration_tests-asan.yml index 0e9fca140..648de44e9 100644 --- a/.github/workflows/integration_tests-asan.yml +++ b/.github/workflows/integration_tests-asan.yml @@ -32,6 +32,7 @@ jobs: - name: Run Integration Tests id: test-run + timeout-minutes: 40 continue-on-error: true run: | mkdir test-results diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 7c9362ca0..f01fca8d3 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -30,8 +30,9 @@ jobs: - name: Build Docker Image run: docker build -t presubmit-image -f .devcontainer/Dockerfile . - - name: Run Integration Tests + - name: Run Integration Tests id: test-run + timeout-minutes: 30 continue-on-error: true run: | mkdir test-results From dab6546e24771019ea3db7382d60651bf9e93c3d Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 12 Nov 2025 04:27:27 +0000 Subject: [PATCH 22/33] hack the timeouts Signed-off-by: Allen Samuels --- .github/workflows/integration_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index f01fca8d3..18111421f 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -32,7 +32,7 @@ jobs: - name: Run Integration Tests id: test-run - timeout-minutes: 30 + timeout-minutes: 40 continue-on-error: true run: | mkdir test-results @@ -41,7 +41,7 @@ jobs: -v "$(pwd):/workspace" \ --user "ubuntu:ubuntu" \ presubmit-image \ - sudo ci/build_ubuntu.sh --run-integration-tests --integration-output=/workspace/test-results + sudo timeout 30m ci/build_ubuntu.sh --run-integration-tests --integration-output=/workspace/test-results - name: Update Test Results Permissions if: always() From 4173b85cd30e84d17f035297b1c16b93000d9369 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 12 Nov 2025 04:58:27 +0000 Subject: [PATCH 23/33] Revert "hack the timeouts" This reverts commit dab6546e24771019ea3db7382d60651bf9e93c3d. Signed-off-by: Allen Samuels --- .github/workflows/integration_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 18111421f..f01fca8d3 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -32,7 +32,7 @@ jobs: - name: Run Integration Tests id: test-run - timeout-minutes: 40 + timeout-minutes: 30 continue-on-error: true run: | mkdir test-results @@ -41,7 +41,7 @@ jobs: -v "$(pwd):/workspace" \ --user "ubuntu:ubuntu" \ presubmit-image \ - sudo timeout 30m ci/build_ubuntu.sh --run-integration-tests --integration-output=/workspace/test-results + sudo ci/build_ubuntu.sh --run-integration-tests --integration-output=/workspace/test-results - name: Update Test Results Permissions if: always() From a7a659ca5bcb9c56684ca189d7fb2e15cf03b731 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 12 Nov 2025 05:00:28 +0000 Subject: [PATCH 24/33] limit pytest to 30 minutes Signed-off-by: Allen Samuels --- integration/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/run.sh b/integration/run.sh index 7bf1f09ae..82ced82e1 100755 --- a/integration/run.sh +++ b/integration/run.sh @@ -144,7 +144,7 @@ mkdir -p ${LOGS_DIR} function run_pytest() { zap valkey-server LOG_INFO "Running: ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=sys --cache-clear -v ${ROOT_DIR}/integration/" - ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=sys --cache-clear -v ${ROOT_DIR}/integration/ + timeout 30m ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=sys --cache-clear -v ${ROOT_DIR}/integration/ RUN_SUCCESS=$? } From 7d985ae30df5564b1c6f210f66f21763e838855a Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 12 Nov 2025 05:00:28 +0000 Subject: [PATCH 25/33] limit pytest to 30 minutes Signed-off-by: Allen Samuels --- .github/workflows/integration_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index f01fca8d3..35a765487 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -32,7 +32,7 @@ jobs: - name: Run Integration Tests id: test-run - timeout-minutes: 30 + timeout-minutes: 40 continue-on-error: true run: | mkdir test-results From 5d60325b965acbe5ca09565419a65cd8edfea845 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 12 Nov 2025 05:02:41 +0000 Subject: [PATCH 26/33] formatting Signed-off-by: Allen Samuels --- .github/workflows/integration_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 35a765487..9b3ada665 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -30,7 +30,7 @@ jobs: - name: Build Docker Image run: docker build -t presubmit-image -f .devcontainer/Dockerfile . - - name: Run Integration Tests + - name: Run Integration Tests id: test-run timeout-minutes: 40 continue-on-error: true From 1e92d437d61c4ae9370052e879fcb2cfcb4fd09d Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 12 Nov 2025 05:56:23 +0000 Subject: [PATCH 27/33] rejigger timeouts Signed-off-by: Allen Samuels --- .github/workflows/integration_tests-asan.yml | 2 +- .github/workflows/integration_tests.yml | 2 +- integration/run.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_tests-asan.yml b/.github/workflows/integration_tests-asan.yml index 648de44e9..09fa3b66e 100644 --- a/.github/workflows/integration_tests-asan.yml +++ b/.github/workflows/integration_tests-asan.yml @@ -32,7 +32,7 @@ jobs: - name: Run Integration Tests id: test-run - timeout-minutes: 40 + timeout-minutes: 140 continue-on-error: true run: | mkdir test-results diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 9b3ada665..619ff2cba 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -32,7 +32,7 @@ jobs: - name: Run Integration Tests id: test-run - timeout-minutes: 40 + timeout-minutes: 140 continue-on-error: true run: | mkdir test-results diff --git a/integration/run.sh b/integration/run.sh index 82ced82e1..2eb6e63fe 100755 --- a/integration/run.sh +++ b/integration/run.sh @@ -144,7 +144,7 @@ mkdir -p ${LOGS_DIR} function run_pytest() { zap valkey-server LOG_INFO "Running: ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=sys --cache-clear -v ${ROOT_DIR}/integration/" - timeout 30m ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=sys --cache-clear -v ${ROOT_DIR}/integration/ + timeout 10m ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=sys --cache-clear -v ${ROOT_DIR}/integration/ RUN_SUCCESS=$? } From c88792750dbe4bda67f90ee81eb9354c16578f6a Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 12 Nov 2025 06:54:02 +0000 Subject: [PATCH 28/33] more debugging Signed-off-by: Allen Samuels --- integration/test_saverestore.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py index 4ba2089b7..e236d439e 100644 --- a/integration/test_saverestore.py +++ b/integration/test_saverestore.py @@ -238,37 +238,51 @@ def test_mutation_queue(self): ] assert reads == [len(records)] + def block(self, label): + x = self.client.execute_command("ft._debug pausepoint test block_mutation_queue") + self.client.execute_command("debug log",f"pausepoint block:{x} @ {label}") + def test_multi_exec_queue(self): self.client.execute_command("ft._debug PAUSEPOINT SET block_mutation_queue") self.client.execute_command("CONFIG SET search.info-developer-visible yes") index.create(self.client, True) records = make_data() + num_records = 2 # len(records) # # Now, load the data as a multi/exec... But this won't block us. # + self.block("Before multi") self.client.execute_command("MULTI") - for i in range(len(records)): + self.block("after multi") + for i in range(2): + self.client.execute_command("DEBUG LOG", f"Writing record: {i}") index.write_data(self.client, i, records[i]) + self.block("After write") + self.client.execute_command("DEBUG LOG","Executing EXEC") + self.block("Before exec") self.client.execute_command("EXEC") + self.client.execute_command("DEBUG LOG", "Completed EXEC, starting SAVE command") + self.block("after exec") self.client.execute_command("save") + self.block("after save") + self.client.execute_command("ft._debug pausepoint reset block_mutation_queue") - i = self.client.info("search") - assert i["search_rdb_save_multi_exec_entries"] == len(records) - self.client.execute_command("ft._debug pausepoint reset block_mutation_queue") + i = self.client.info("search") + assert i["search_rdb_save_multi_exec_entries"] == num_records - verify_data(self.client, index) + # verify_data(self.client, index) os.environ["SKIPLOGCLEAN"] = "1" self.server.restart(remove_rdb=False) - verify_data(self.client, index) + # verify_data(self.client, index) self.client.execute_command("CONFIG SET search.info-developer-visible yes") i = self.client.info("search") print("Info: ", i) reads = [ i["search_rdb_load_multi_exec_entries"], ] - assert reads == [len(records)] + assert reads == [num_records] def test_saverestore_backfill(self): # From f0466e8d1891fb8acfed0378b44c45b2efca2de1 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 12 Nov 2025 15:25:47 +0000 Subject: [PATCH 29/33] more debug Signed-off-by: Allen Samuels --- integration/test_saverestore.py | 6 ++---- src/commands/ft_debug.cc | 1 + vmsdk/src/debug.cc | 1 + 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py index e236d439e..389c3ae21 100644 --- a/integration/test_saverestore.py +++ b/integration/test_saverestore.py @@ -240,7 +240,7 @@ def test_mutation_queue(self): def block(self, label): x = self.client.execute_command("ft._debug pausepoint test block_mutation_queue") - self.client.execute_command("debug log",f"pausepoint block:{x} @ {label}") + self.client.execute_command(*["debug", "log",f"pausepoint block:{x} @ {label}"]) def test_multi_exec_queue(self): self.client.execute_command("ft._debug PAUSEPOINT SET block_mutation_queue") @@ -255,13 +255,11 @@ def test_multi_exec_queue(self): self.client.execute_command("MULTI") self.block("after multi") for i in range(2): - self.client.execute_command("DEBUG LOG", f"Writing record: {i}") + self.block("before write") index.write_data(self.client, i, records[i]) self.block("After write") - self.client.execute_command("DEBUG LOG","Executing EXEC") self.block("Before exec") self.client.execute_command("EXEC") - self.client.execute_command("DEBUG LOG", "Completed EXEC, starting SAVE command") self.block("after exec") self.client.execute_command("save") diff --git a/src/commands/ft_debug.cc b/src/commands/ft_debug.cc index 97e6a27cb..61bb6bfc8 100644 --- a/src/commands/ft_debug.cc +++ b/src/commands/ft_debug.cc @@ -66,6 +66,7 @@ absl::Status PausePointControlCmd(ValkeyModuleCtx *ctx, } std::string point; VMSDK_RETURN_IF_ERROR(vmsdk::ParseParamValue(itr, point)); + point = absl::AsciiStrToUpper(point); VMSDK_RETURN_IF_ERROR(CheckEndOfArgs(itr)); if (keyword == "TEST") { auto result = vmsdk::debug::PausePointWaiters(point); diff --git a/vmsdk/src/debug.cc b/vmsdk/src/debug.cc index 21f4fc0fe..a1ad12ebb 100644 --- a/vmsdk/src/debug.cc +++ b/vmsdk/src/debug.cc @@ -100,6 +100,7 @@ absl::StatusOr PausePointWaiters(absl::string_view point) { absl::MutexLock lock(&pause_point_lock); auto it = pause_point_waiters.find(point); if (it == pause_point_waiters.end()) { + VMSDK_LOG(WARNING, nullptr) << "PAUSEPOINT: " << point << " not found"; return absl::NotFoundError("Pause Point not found"); } else { VMSDK_LOG(WARNING, nullptr) From 13e6583b5996b9956ba03a1f81d5c4d12cbea414 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Wed, 12 Nov 2025 18:06:13 +0000 Subject: [PATCH 30/33] more test output Signed-off-by: Allen Samuels --- integration/test_saverestore.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py index 389c3ae21..dacea6627 100644 --- a/integration/test_saverestore.py +++ b/integration/test_saverestore.py @@ -253,15 +253,13 @@ def test_multi_exec_queue(self): # self.block("Before multi") self.client.execute_command("MULTI") - self.block("after multi") for i in range(2): - self.block("before write") index.write_data(self.client, i, records[i]) - self.block("After write") - self.block("Before exec") self.client.execute_command("EXEC") self.block("after exec") + + self.client.execute_command("save") self.block("after save") self.client.execute_command("ft._debug pausepoint reset block_mutation_queue") @@ -272,6 +270,7 @@ def test_multi_exec_queue(self): # verify_data(self.client, index) os.environ["SKIPLOGCLEAN"] = "1" + self.block("BEFORE RESTART") self.server.restart(remove_rdb=False) # verify_data(self.client, index) self.client.execute_command("CONFIG SET search.info-developer-visible yes") From 9426112fe66e8f0d9da11d8392d0c44d0315e73d Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Thu, 13 Nov 2025 02:34:08 +0000 Subject: [PATCH 31/33] more debugging output Signed-off-by: Allen Samuels --- integration/run.sh | 4 ++-- integration/test_saverestore.py | 18 +++++++----------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/integration/run.sh b/integration/run.sh index 2eb6e63fe..e905cb2cf 100755 --- a/integration/run.sh +++ b/integration/run.sh @@ -143,8 +143,8 @@ mkdir -p ${LOGS_DIR} function run_pytest() { zap valkey-server - LOG_INFO "Running: ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=sys --cache-clear -v ${ROOT_DIR}/integration/" - timeout 10m ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=sys --cache-clear -v ${ROOT_DIR}/integration/ + LOG_INFO "Running: ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=no --cache-clear -v ${ROOT_DIR}/integration/" + timeout 10m ${PYTHON_PATH} -m pytest ${FILTER_ARGS} --capture=no --cache-clear -v ${ROOT_DIR}/integration/ RUN_SUCCESS=$? } diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py index dacea6627..d866d17aa 100644 --- a/integration/test_saverestore.py +++ b/integration/test_saverestore.py @@ -247,39 +247,35 @@ def test_multi_exec_queue(self): self.client.execute_command("CONFIG SET search.info-developer-visible yes") index.create(self.client, True) records = make_data() - num_records = 2 # len(records) # # Now, load the data as a multi/exec... But this won't block us. # self.block("Before multi") self.client.execute_command("MULTI") - for i in range(2): + for i in range(len(records)): index.write_data(self.client, i, records[i]) self.client.execute_command("EXEC") self.block("after exec") - - self.client.execute_command("save") - self.block("after save") self.client.execute_command("ft._debug pausepoint reset block_mutation_queue") - i = self.client.info("search") - assert i["search_rdb_save_multi_exec_entries"] == num_records - - # verify_data(self.client, index) + assert i["search_rdb_save_multi_exec_entries"] == len(records) + self.block("before verify") + verify_data(self.client, index) os.environ["SKIPLOGCLEAN"] = "1" self.block("BEFORE RESTART") self.server.restart(remove_rdb=False) - # verify_data(self.client, index) + self.block("AFTER RESTART") + verify_data(self.client, index) self.client.execute_command("CONFIG SET search.info-developer-visible yes") i = self.client.info("search") print("Info: ", i) reads = [ i["search_rdb_load_multi_exec_entries"], ] - assert reads == [num_records] + assert reads == [len(records)] def test_saverestore_backfill(self): # From 9fb131d04fa7e01e397d11b729dc066b354d32eb Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Thu, 13 Nov 2025 04:41:02 +0000 Subject: [PATCH 32/33] more debugging Signed-off-by: Allen Samuels --- integration/test_saverestore.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/integration/test_saverestore.py b/integration/test_saverestore.py index d866d17aa..378ce9336 100644 --- a/integration/test_saverestore.py +++ b/integration/test_saverestore.py @@ -238,8 +238,14 @@ def test_mutation_queue(self): ] assert reads == [len(records)] + def get_pausepoint(self, p): + try: + return int(self.client.execute_command(f"ft._debug pausepoint test {p}")) + except: + return 0 + def block(self, label): - x = self.client.execute_command("ft._debug pausepoint test block_mutation_queue") + x = self.get_pausepoint("block_mutation_queue") self.client.execute_command(*["debug", "log",f"pausepoint block:{x} @ {label}"]) def test_multi_exec_queue(self): @@ -260,6 +266,10 @@ def test_multi_exec_queue(self): self.client.execute_command("save") self.client.execute_command("ft._debug pausepoint reset block_mutation_queue") + while self.get_pausepoint("block_mutation_queue") > 0: + self.block("waiting for pausepoint block_mutation_queue to clear") + time.sleep(0.1) + i = self.client.info("search") assert i["search_rdb_save_multi_exec_entries"] == len(records) self.block("before verify") From 3c03a5c83af43faed76fc3f8ffa86c5771a0e714 Mon Sep 17 00:00:00 2001 From: Allen Samuels Date: Fri, 14 Nov 2025 16:47:47 +0000 Subject: [PATCH 33/33] fix test_CMD, disable stability test Signed-off-by: Allen Samuels --- integration/test_cancel.py | 4 ++-- testing/integration/run.sh | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/integration/test_cancel.py b/integration/test_cancel.py index 54eeedcad..dd65716db 100644 --- a/integration/test_cancel.py +++ b/integration/test_cancel.py @@ -175,12 +175,12 @@ def test_timeoutCMD(self): client.execute_command("FT._DEBUG PAUSEPOINT SET Cancel") == b"OK" ) - assert(client.execute_command("FT._DEBUG PAUSEPOINT LIST") == [b"Cancel", []]) + assert(client.execute_command("FT._DEBUG PAUSEPOINT LIST") == [b"CANCEL", []]) hnsw_result = search(client, "hnsw", True, 2) waiters.wait_for_true(lambda: client.execute_command("FT._DEBUG PAUSEPOINT TEST Cancel") > 0) w = client.execute_command("FT._DEBUG PAUSEPOINT LIST") - assert(w[0] == b'Cancel') + assert(w[0] == b'CANCEL') assert(len(w[1]) > 0) assert ( client.execute_command("FT._DEBUG PAUSEPOINT RESET Cancel") diff --git a/testing/integration/run.sh b/testing/integration/run.sh index 5155d0e38..03819435f 100755 --- a/testing/integration/run.sh +++ b/testing/integration/run.sh @@ -4,9 +4,10 @@ ROOT_DIR=$(readlink -f $(dirname $0)) WORKSPACE_HOME=$(readlink -f ${ROOT_DIR}/../..) BUILD_CONFIG=release -TEST=all +# TEST=all +TEST=vector_search_integration CLEAN="no" -VALKEY_VERSION="8.1.1" +VALKEY_VERSION="9.0.0" VALKEY_JSON_VERSION="unstable" DUMP_TEST_ERRORS_STDOUT="no"