@@ -359,6 +359,9 @@ ALWAYS_INLINE PARSER_RC pluginsd_replay_rrdset_collection_state(char **words, si
359359ALWAYS_INLINE PARSER_RC pluginsd_replay_end (char * * words , size_t num_words , PARSER * parser ) {
360360 if (num_words < 7 ) { // accepts 7, but the 7th is optional
361361 nd_log (NDLS_DAEMON , NDLP_ERR , "REPLAY: malformed " PLUGINSD_KEYWORD_REPLAY_END " command" );
362+ RRDSET * st = pluginsd_get_scope_chart (parser );
363+ if (st )
364+ st -> replication_empty_response_count = 0 ;
362365 return PARSER_RC_ERROR ;
363366 }
364367
@@ -402,6 +405,10 @@ ALWAYS_INLINE PARSER_RC pluginsd_replay_end(char **words, size_t num_words, PARS
402405
403406 parser -> user .data_collections_count ++ ;
404407
408+ // Reset empty response counter when we receive actual data
409+ if (parser -> user .replay .rset_enabled && st )
410+ st -> replication_empty_response_count = 0 ;
411+
405412 if (parser -> user .replay .rset_enabled && st -> rrdhost -> receiver ) {
406413 time_t now = now_realtime_sec ();
407414 time_t started = st -> rrdhost -> receiver -> replication .first_time_s ;
@@ -433,6 +440,9 @@ ALWAYS_INLINE PARSER_RC pluginsd_replay_end(char **words, size_t num_words, PARS
433440 st -> replay .log_next_data_collection = true;
434441#endif
435442
443+ if (start_streaming )
444+ st -> replication_empty_response_count = 0 ;
445+
436446 if (start_streaming ) {
437447#ifdef REPLICATION_TRACKING
438448 st -> stream .rcv .who = REPLAY_WHO_FINISHED ;
@@ -465,6 +475,111 @@ ALWAYS_INLINE PARSER_RC pluginsd_replay_end(char **words, size_t num_words, PARS
465475 return PARSER_RC_OK ;
466476 }
467477
478+ // ========================================================================
479+ // SAFETY NET: Detect stuck replication loops
480+ // ========================================================================
481+ //
482+ // We received start_streaming=false, which means we need to send another
483+ // replication request. However, we need to detect if we're stuck in an
484+ // infinite retry loop where no progress is being made.
485+ //
486+ // This can happen when:
487+ // 1. Parent already has newer data than child
488+ // 2. Child keeps splitting responses due to buffer constraints
489+ // 3. Network issues causing repeated empty/failed responses
490+
491+ // Check parent's current retention to detect if we're already caught up
492+ time_t local_first_entry = 0 , local_last_entry = 0 ;
493+ rrdset_get_retention_of_tier_for_collected_chart (
494+ st , & local_first_entry , & local_last_entry , now_realtime_sec (), 0 );
495+
496+ // Detect suspicious pattern: parent requested data but is already caught up
497+ // This indicates we're in a loop where child keeps splitting responses
498+ // even though parent doesn't need more data.
499+ bool parent_already_caught_up = (local_last_entry >= last_entry_child );
500+ bool requested_non_empty_range = (first_entry_requested != 0 || last_entry_requested != 0 );
501+ bool is_suspicious_response = (requested_non_empty_range && parent_already_caught_up );
502+
503+ bool should_check_for_stuck_replication = false;
504+
505+ // Track consecutive suspicious responses - applies to all builds
506+ if (is_suspicious_response ) {
507+ st -> replication_empty_response_count ++ ;
508+ // After 3 consecutive suspicious responses, we need to investigate
509+ if (st -> replication_empty_response_count >= 3 ) {
510+ should_check_for_stuck_replication = true;
511+ }
512+ } else {
513+ // Reset counter if this was a legitimate response (parent still catching up)
514+ st -> replication_empty_response_count = 0 ;
515+ }
516+
517+ if (should_check_for_stuck_replication ) {
518+ // We already have local_first_entry and local_last_entry from above
519+
520+ // Check multiple conditions to ensure we're truly stuck:
521+ //
522+ // Condition 1: Parent has data that covers or exceeds child's retention
523+ // (We already checked this in parent_already_caught_up, but verify again)
524+ bool parent_has_equal_or_newer_data = (local_last_entry >= last_entry_child );
525+
526+ // Calculate the gap for logging purposes
527+ time_t gap_to_child = (last_entry_child > local_last_entry ) ?
528+ (last_entry_child - local_last_entry ) : 0 ;
529+
530+ // Condition 2: Parent's data is reasonably recent
531+ time_t wall_clock = now_realtime_sec ();
532+ bool parent_data_is_recent = (local_last_entry > 0 &&
533+ (wall_clock - local_last_entry ) < 300 );
534+
535+ // Only finish replication if parent has equal or newer data than child
536+ // Do NOT terminate if there's any gap, as that would cause data loss
537+ if (parent_has_equal_or_newer_data ) {
538+
539+ // Log with appropriate level based on confidence
540+ ND_LOG_FIELD_PRIORITY level = (parent_has_equal_or_newer_data && parent_data_is_recent ) ?
541+ NDLP_INFO : NDLP_WARNING ;
542+
543+ nd_log (NDLS_DAEMON , level ,
544+ "PLUGINSD REPLAY: 'host:%s/chart:%s' detected stuck replication loop. "
545+ "Parent last entry: %llu, Child last entry: %llu, Gap: %llu seconds, "
546+ "Empty responses: %u. Forcing replication to finish." ,
547+ rrdhost_hostname (host ), rrdset_id (st ),
548+ (unsigned long long )local_last_entry ,
549+ (unsigned long long )last_entry_child ,
550+ (unsigned long long )gap_to_child ,
551+ (unsigned int )st -> replication_empty_response_count
552+ );
553+
554+ st -> replication_empty_response_count = 0 ;
555+
556+ // IMPORTANT: Mark as finished and decrement counter NOW, before sending final request.
557+ // This prevents infinite loops even if child continues to respond with start_streaming=false.
558+ // The next REPLAY_END will see FINISHED flag and handle accordingly.
559+ RRDSET_FLAGS old = rrdset_flag_set_and_clear (
560+ st , RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED ,
561+ RRDSET_FLAG_RECEIVER_REPLICATION_IN_PROGRESS | RRDSET_FLAG_SYNC_CLOCK );
562+
563+ if (!(old & RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED )) {
564+ if (rrdhost_receiver_replicating_charts_minus_one (st -> rrdhost ) == 0 )
565+ pulse_host_status (host , PULSE_HOST_STATUS_RCV_RUNNING , 0 );
566+ }
567+
568+ pluginsd_clear_scope_chart (parser , PLUGINSD_KEYWORD_REPLAY_END );
569+ host -> stream .rcv .status .replication .percent = 100.0 ;
570+ worker_set_metric (WORKER_RECEIVER_JOB_REPLICATION_COMPLETION , host -> stream .rcv .status .replication .percent );
571+
572+ // Send one final request to notify child. If child responds with start_streaming=true,
573+ // it will start streaming. If it responds with start_streaming=false, the next
574+ // REPLAY_END will see the FINISHED flag and log a warning but not loop forever.
575+ bool ok = replicate_chart_request (send_to_plugin , parser , host , st ,
576+ first_entry_child , last_entry_child , child_world_time ,
577+ 0 , 0 ); // prev_wanted = 0,0 to trigger empty request path
578+
579+ return ok ? PARSER_RC_OK : PARSER_RC_ERROR ;
580+ }
581+ }
582+
468583#ifdef REPLICATION_TRACKING
469584 st -> stream .rcv .who = REPLAY_WHO_ME ;
470585#endif
0 commit comments