You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
fix(mcp_catalog): retry Start on existing unstarted entries
The idempotent guard at the top of handleEnable short-circuited every
re-enable on a registered entry, regardless of whether the previous
Start had actually completed. This created two dead-ends the model
could not escape on its own:
- The AuthorizationRequired branch tells the model to call enable
again to surface a fresh OAuth dialog, but the second call was
intercepted by the guard with an "already enabled but not yet
connected" message. The dialog never re-appeared.
- A context.Canceled mid-handshake (Ctrl+C, soft per-turn cancel)
left an unstarted entry behind. A subsequent enable hit the same
guard. Recovery relied on Toolset.Stop firing between turns to
clean the entry up — an invariant the catalog can't enforce.
Both paths converge on the same fix: detect the existing-but-unstarted
case in the guard and re-attempt Start on the wrapper. The fresh-enable
path keeps its existing handler-attachment and tools_changed notify;
the retry path reuses the wrapper as-is, avoiding a spurious
tool-surface-change signal.
StartableToolSet.Start is idempotent and single-flight, so the retry
re-invokes the inner connect without re-running the bookkeeping when
the previous attempt left the wrapper in the not-started state.
// Two paths land here: a fresh enable (no entry yet) and a re-enable
720
+
// of an existing entry whose previous Start did not complete (deferred
721
+
// auth-required fallback, cancellation, or any other transient
722
+
// failure). Both must converge on a Start attempt, otherwise the
723
+
// model-facing retry instructions in the failure branches dead-end at
724
+
// the guard.
719
725
t.mu.Lock()
720
-
ifexisting, exists:=t.enabled[id]; exists {
721
-
started:=existing.IsStarted()
726
+
wrapped, alreadyEnabled:=t.enabled[id]
727
+
ifalreadyEnabled&&wrapped.IsStarted() {
722
728
t.mu.Unlock()
723
-
// Idempotent re-enable. If the toolset is already started, its
724
-
// tools are live; otherwise the previous enable hit a deferred-
725
-
// auth fallback and the user can retry via disable+enable.
726
-
ifstarted {
727
-
returntools.ResultSuccess(fmt.Sprintf(
728
-
"server %q is already enabled and connected. Its tools (names starting with %q) are live; proceed with the user's original request using them.",
729
-
id, id+"_")), nil
730
-
}
729
+
// Live entry — nothing to do.
731
730
returntools.ResultSuccess(fmt.Sprintf(
732
-
"server %q is already enabled but not yet connected (likely waiting on user authorization). If the user has just authorized, retry their original request; otherwise tell them the connection still needs authorization.",
733
-
id)), nil
731
+
"server %q is already enabled and connected. Its tools (names starting with %q) are live; proceed with the user's original request using them.",
732
+
id, id+"_")), nil
734
733
}
735
734
736
-
// Create the MCP toolset with the pre-computed headers.
737
-
// The nil third arg (*latest.RemoteOAuthConfig) is intentional: every
738
-
// server currently in the catalog works with default Dynamic Client
739
-
// Registration and the runtime's default callback. If a future entry
740
-
// needs custom scopes / a fixed client_id / a non-default callback,
741
-
// extend Auth in servers.go and plumb the resulting *RemoteOAuthConfig
@@ -816,17 +833,21 @@ func (t *Toolset) handleEnableStartError(ctx context.Context, id string, server
816
833
"user declined the authorization dialog for %q (%s). No tools were activated — do NOT claim the server is connected and do NOT call any %q tools. Tell the user the request needs them to authorize the connection. If the user then says \"yes\", \"retry\", or re-asks for the same thing, call %s for %q again to surface a fresh authorization dialog.",
817
834
id, server.Title, id+"_", ToolNameEnable, id))
818
835
casemcp.IsAuthorizationRequired(err):
819
-
slog.DebugContext(ctx, "Remote MCP server enable deferred: authorization required, leaving in enabled set for next interactive Tools() to retry",
836
+
slog.DebugContext(ctx, "Remote MCP server enable deferred: authorization required, leaving in enabled set for next interactive Tools() / enable to retry",
820
837
"id", id, "error", err)
821
838
returntools.ResultSuccess(fmt.Sprintf(
822
839
"enable requested for %q (%s); authorization is required and the host will surface the dialog. On your next turn, if tools whose names start with %q appear in your available tools, proceed with the user's original request using them. If NO such tools appear, the user dismissed the dialog — tell them the request needs them to authorize, and call %s for %q again if they want to retry.",
823
840
id, server.Title, id+"_", ToolNameEnable, id))
824
841
caseerrors.Is(err, context.Canceled):
825
-
// Don't roll back: cancellation is the runtime tearing down the
826
-
// turn (Ctrl+C, parent cancellation). The cleanup paths upstream
827
-
// (Toolset.Stop) own the lifecycle here.
842
+
// Don't roll back. If this is full-session cancellation (Ctrl+C,
843
+
// parent shutdown), Toolset.Stop will tear the entry down with
844
+
// the rest of the session; if it's softer (per-turn cancel /
845
+
// timeout), the next enable's top-of-function guard re-attempts
846
+
// Start on the existing unstarted wrapper, so the model's retry
847
+
// instruction still has somewhere to land.
828
848
returntools.ResultError(fmt.Sprintf(
829
-
"enable cancelled for %q before the connection completed.", id))
849
+
"enable cancelled for %q before the connection completed. Call %s for %q again to retry once the user is ready.",
require.True(t, res.IsError, "cancelled Start must surface a tool error: %s", res.Output)
686
+
assert.Contains(t, res.Output, "cancelled",
687
+
"the error must name cancellation so the model can explain to the user")
688
+
assert.Contains(t, res.Output, ToolNameEnable,
689
+
"the error must instruct the model how to retry")
690
+
691
+
ts.mu.RLock()
692
+
_, stillEnabled:=ts.enabled[id]
693
+
ts.mu.RUnlock()
694
+
require.True(t, stillEnabled,
695
+
"context.Canceled must leave the entry so the next enable can retry — the alternative would be relying on Toolset.Stop firing between turns, which is a fragile invariant")
696
+
697
+
// Second enable: must drive Start again, not short-circuit at the
698
+
// guard with a "still not connected" message. Otherwise the user is
0 commit comments