Skip to content

Commit 3cc5974

Browse files
committed
fix(discord): add circuit breaker for WebSocket resume loop (#13180)
When the Discord WebSocket connection enters a stall loop (connects but never receives HELLO), the existing zombie timeout handler would disconnect and reconnect indefinitely. The library's reconnectAttempts counter resets on every WebSocket open event, so the maxAttempts circuit breaker is never reached.

This adds an application-level circuit breaker:
- Track consecutive stalls (WS opens but no HELLO within 30s)
- After 5 consecutive stalls, invalidate the session state (sessionId, resumeGatewayUrl) and force a fresh IDENTIFY instead of resume
- Log stall count on each attempt for observability
- Reset counter on successful HELLO receipt

This breaks the infinite resume loop observed in production where 1400+ reconnect attempts occurred over 12+ hours with a stale session token.

Fixes: #13180
1 parent 4e9f933 commit 3cc5974

File tree

1 file changed

+49
-7
lines changed

1 file changed

+49
-7
lines changed

src/discord/monitor/provider.ts

Lines changed: 49 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -634,10 +634,24 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) {
634634
abortSignal?.addEventListener("abort", onAbort, { once: true });
635635
}
636636
// Timeout to detect zombie connections where HELLO is never received.
637+
// Includes a circuit breaker: after MAX_STALL_RETRIES consecutive stalls without
638+
// a successful HELLO, invalidate the session and do a full re-identify instead of
639+
// resuming with a stale session token forever. See openclaw/openclaw#13180.
637640
const HELLO_TIMEOUT_MS = 30000;
641+
const MAX_STALL_RETRIES = 5;
638642
let helloTimeoutId: ReturnType<typeof setTimeout> | undefined;
643+
let consecutiveStalls = 0;
639644
const onGatewayDebug = (msg: unknown) => {
640645
const message = String(msg);
646+
// Reset stall counter on successful HELLO (dispatched as "Received HELLO" or opcode 10).
647+
if (message.includes("heartbeat_interval") || message.includes("Received HELLO")) {
648+
consecutiveStalls = 0;
649+
if (helloTimeoutId) {
650+
clearTimeout(helloTimeoutId);
651+
helloTimeoutId = undefined;
652+
}
653+
return;
654+
}
641655
if (!message.includes("WebSocket connection opened")) {
642656
return;
643657
}
@@ -646,13 +660,41 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) {
646660
}
647661
helloTimeoutId = setTimeout(() => {
648662
if (!gateway?.isConnected) {
649-
runtime.log?.(
650-
danger(
651-
`connection stalled: no HELLO received within ${HELLO_TIMEOUT_MS}ms, forcing reconnect`,
652-
),
653-
);
654-
gateway?.disconnect();
655-
gateway?.connect(false);
663+
consecutiveStalls++;
664+
if (consecutiveStalls >= MAX_STALL_RETRIES) {
665+
runtime.log?.(
666+
danger(
667+
`connection stalled ${consecutiveStalls} consecutive times — circuit breaker tripped, forcing fresh identify (see #13180)`,
668+
),
669+
);
670+
// Invalidate the stale session so the library does a fresh IDENTIFY
671+
// instead of trying to resume with a dead session token.
672+
if (gateway) {
673+
(
674+
gateway as unknown as {
675+
state: {
676+
sessionId: string | null;
677+
resumeGatewayUrl: string | null;
678+
sequence: number | null;
679+
};
680+
}
681+
).state.sessionId = null;
682+
(
683+
gateway as unknown as { state: { resumeGatewayUrl: string | null } }
684+
).state.resumeGatewayUrl = null;
685+
}
686+
consecutiveStalls = 0;
687+
gateway?.disconnect();
688+
gateway?.connect(false);
689+
} else {
690+
runtime.log?.(
691+
danger(
692+
`connection stalled: no HELLO received within ${HELLO_TIMEOUT_MS}ms, forcing reconnect (${consecutiveStalls}/${MAX_STALL_RETRIES} before circuit breaker)`,
693+
),
694+
);
695+
gateway?.disconnect();
696+
gateway?.connect(false);
697+
}
656698
}
657699
helloTimeoutId = undefined;
658700
}, HELLO_TIMEOUT_MS);

0 commit comments

Comments
 (0)