Skip to content

Commit 3bb5f03

Browse files
authored
[MOD-14732] fix race condition on hybrid (#9029)
* fix race condition on hybrid * remove unrelated file * use Strong/WeakRef instead of refCount * fix the real issue, count proper numShards * fix by setting numShards from IO thread * rephrase * fix potential problem if shards can be 0 * use initialized flag * update comment * remove unused numShards
1 parent d141b63 commit 3bb5f03

4 files changed

Lines changed: 31 additions & 15 deletions

File tree

src/coord/hybrid/dist_hybrid.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,8 @@ void HybridRequest_buildMRCommand(RedisModuleString **argv, int argc,
255255
MRCommand_appendVsim(xcmd, argv, argc, vsimOffset, &kArgIndex);
256256

257257
// Calculate and apply effective K for KNN queries if SHARD_K_RATIO is set
258+
// TODO: Potentially edit in IO thread where numShards is actually known.
259+
// There is a risk that, by the time the I/O thread sends the command, the number of shards has changed, making the effective K inaccurate.
258260
if (vq && vq->type == VECSIM_QT_KNN) {
259261
double shardWindowRatio = vq->knn.shardWindowRatio;
260262
if (shardWindowRatio < MAX_SHARD_WINDOW_RATIO && numShards > 1) {
@@ -713,11 +715,10 @@ static int HybridRequest_executePlan(HybridRequest *hreq, struct ConcurrentCmdCt
713715

714716
// Get the command from the RPNet (it was set during prepareForExecution)
715717
MRCommand *cmd = &searchRPNet->cmd;
716-
int numShards = ConcurrentCmdCtx_GetNumShards(cmdCtx);
717718
cmd->coordStartTime = hreq->profileClocks.coordStartTime;
718719

719720
const RSOomPolicy oomPolicy = hreq->reqConfig.oomPolicy;
720-
if (!ProcessHybridCursorMappings(cmd, numShards, searchMappingsRef, vsimMappingsRef, hreq->tailPipeline->qctx.err, oomPolicy)) {
721+
if (!ProcessHybridCursorMappings(cmd, searchMappingsRef, vsimMappingsRef, hreq->tailPipeline->qctx.err, oomPolicy)) {
721722
// Handle error
722723
StrongRef_Release(searchMappingsRef);
723724
StrongRef_Release(vsimMappingsRef);
@@ -852,7 +853,7 @@ void RSExecDistHybrid(RedisModuleCtx *ctx, RedisModuleString **argv, int argc,
852853
// Store coordinator start time for dispatch time tracking
853854
hreq->profileClocks.coordStartTime = ConcurrentCmdCtx_GetCoordStartTime(cmdCtx);
854855

855-
// Get numShards captured from main thread for thread-safe access
856+
// Get numShards captured from main thread for thread-safe access and to compute effective K
856857
size_t numShards = ConcurrentCmdCtx_GetNumShards(cmdCtx);
857858

858859
if (HybridRequest_prepareForExecution(hreq, ctx, argv, argc, sp, numShards, &status) != REDISMODULE_OK) {

src/coord/hybrid/hybrid_cursor_mappings.c

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ typedef struct {
2626
pthread_mutex_t *mutex; // Mutex for array access and completion tracking
2727
pthread_cond_t *completionCond; // Condition variable for completion signaling
2828
int numShards; // Total number of expected shards
29+
bool initialized; // Whether numShards has been set by the IO thread
2930
} processCursorMappingCallbackContext;
3031

3132
void CursorMapping_Release(CursorMapping *mapping) {
@@ -191,6 +192,19 @@ static void processCursorMappingCallback(MRIteratorCallbackCtx *ctx, MRReply *re
191192
MRReply_Free(rep);
192193
}
193194

195+
// Init callback for the private data: sets numShards to the actual number of shards in the cluster, which is also the number of expected responses.
196+
static void processCursorMappingInit(void *privateData, MRIterator *it) {
197+
processCursorMappingCallbackContext *ctx = (processCursorMappingCallbackContext *)privateData;
198+
int actualNumShards = (int)MRIterator_GetNumShards(it);
199+
pthread_mutex_lock(ctx->mutex);
200+
ctx->numShards = actualNumShards;
201+
ctx->initialized = true;
202+
ctx->errors = array_new(QueryError, actualNumShards);
203+
// Signal so the coordinator can re-check the wait condition.
204+
pthread_cond_signal(ctx->completionCond);
205+
pthread_mutex_unlock(ctx->mutex);
206+
}
207+
194208
static inline void cleanupCtx(processCursorMappingCallbackContext *ctx) {
195209
pthread_mutex_destroy(ctx->mutex);
196210
pthread_cond_destroy(ctx->completionCond);
@@ -202,7 +216,7 @@ static inline void cleanupCtx(processCursorMappingCallbackContext *ctx) {
202216
rm_free(ctx);
203217
}
204218

205-
bool ProcessHybridCursorMappings(const MRCommand *cmd, int numShards, StrongRef searchMappingsRef, StrongRef vsimMappingsRef, QueryError *status, const RSOomPolicy oomPolicy) {
219+
bool ProcessHybridCursorMappings(const MRCommand *cmd, StrongRef searchMappingsRef, StrongRef vsimMappingsRef, QueryError *status, const RSOomPolicy oomPolicy) {
206220
CursorMappings *searchMappings = StrongRef_Get(searchMappingsRef);
207221
CursorMappings *vsimMappings = StrongRef_Get(vsimMappingsRef);
208222
RS_ASSERT(array_len(searchMappings->mappings) == 0 && array_len(vsimMappings->mappings) == 0);
@@ -217,18 +231,22 @@ bool ProcessHybridCursorMappings(const MRCommand *cmd, int numShards, StrongRef
217231
pthread_cond_init(ctx->completionCond, NULL);
218232

219233
// Setup callback context
220-
*ctx = (processCursorMappingCallbackContext){
234+
*ctx = (processCursorMappingCallbackContext) {
221235
.searchMappings = StrongRef_Clone(searchMappingsRef),
222236
.vsimMappings = StrongRef_Clone(vsimMappingsRef),
223-
.errors = array_new(QueryError, numShards),
237+
.errors = NULL,
224238
.responseCount = 0,
225239
.mutex = ctx->mutex,
226240
.completionCond = ctx->completionCond,
227-
.numShards = numShards
228-
};
241+
.numShards = 0,
242+
.initialized = false
243+
};
229244

230245
// Start iteration (ctx is cleaned up manually in cleanupCtx, no destructor needed)
231-
MRIterator *it = MR_IterateWithPrivateData(cmd, processCursorMappingCallback, ctx, NULL, NULL, iterStartCb, NULL);
246+
// processCursorMappingInit is called from iterStartCb to update ctx->numShards
247+
// with the actual shard count from the live topology, preventing use-after-free
248+
// when topology changes during shard migration.
249+
MRIterator *it = MR_IterateWithPrivateData(cmd, processCursorMappingCallback, ctx, NULL, processCursorMappingInit, iterStartCb, NULL);
232250
if (!it) {
233251
// Cleanup on error
234252
QueryError_SetWithoutUserDataFmt(status, QUERY_ERROR_CODE_GENERIC, "Failed to communicate with shards");
@@ -237,8 +255,8 @@ bool ProcessHybridCursorMappings(const MRCommand *cmd, int numShards, StrongRef
237255
}
238256
// Wait for all callbacks to complete
239257
pthread_mutex_lock(ctx->mutex);
240-
// initialize count with response counts in case some shards already sent a response
241-
for (size_t count = ctx->responseCount; count < numShards; count = ctx->responseCount) {
258+
// Wait until the IO thread has initialized numShards and all responses arrive.
259+
while (!ctx->initialized || ctx->responseCount < ctx->numShards) {
242260
pthread_cond_wait(ctx->completionCond, ctx->mutex);
243261
}
244262
pthread_mutex_unlock(ctx->mutex);

src/coord/hybrid/hybrid_cursor_mappings.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,13 @@ typedef struct QueryError QueryError;
4242
* Handles shard errors by recording them in the status parameter while continuing to process all shards.
4343
* Returns true even if all shards fail with warnings (e.g., OOM), resulting in empty mapping arrays and allowing the caller to handle the warnings.
4444
* @param cmd The MRCommand to execute
45-
* @param numShards Expected number of shards (determines expected callbacks)
4645
* @param searchMappings Empty array to populate with search cursor mappings
4746
* @param vsimMappings Empty array to populate with vector similarity cursor mappings
4847
* @param status QueryError pointer to store warning/error information
4948
* @param oomPolicy OOM policy to determine error handling behavior
5049
* @return true if processing completed (even with warnings), false on fatal errors; status will contain error/warning information
5150
*/
52-
bool ProcessHybridCursorMappings(const MRCommand *cmd,int numShards, StrongRef searchMappings, StrongRef vsimMappings, QueryError *status, RSOomPolicy oomPolicy);
51+
bool ProcessHybridCursorMappings(const MRCommand *cmd, StrongRef searchMappings, StrongRef vsimMappings, QueryError *status, RSOomPolicy oomPolicy);
5352

5453
/**
5554
* Release resources associated with a cursor mapping

tests/pytests/test_asm.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -764,8 +764,6 @@ def test_add_shard_and_migrate_hybrid():
764764

765765
@skip(cluster=False, min_shards=2)
766766
def test_add_shard_and_migrate_hybrid_BG():
767-
# TODO: MOD-14732 - Skipped due to flaky crash (SIGSEGV) during hybrid cursor migration.
768-
raise SkipTest()
769767
env = Env(clusterNodeTimeout=cluster_node_timeout, moduleArgs='WORKERS 2')
770768
add_shard_and_migrate_test(env, 'FT.HYBRID')
771769

0 commit comments

Comments
 (0)