fix(mcp_catalog): retry Start on existing unstarted entries

trungutt · trungutt · commit 33d73f569fae · 2026-06-03T08:15:29.000+02:00
The idempotent guard at the top of handleEnable short-circuited every
re-enable on a registered entry, regardless of whether the previous
Start had actually completed. This created two dead-ends the model
could not escape on its own:

  - The AuthorizationRequired branch tells the model to call enable
    again to surface a fresh OAuth dialog, but the second call was
    intercepted by the guard with an "already enabled but not yet
    connected" message. The dialog never re-appeared.

  - A context.Canceled mid-handshake (Ctrl+C, soft per-turn cancel)
    left an unstarted entry behind. A subsequent enable hit the same
    guard. Recovery relied on Toolset.Stop firing between turns to
    clean the entry up — an invariant the catalog can't enforce.

Both paths converge on the same fix: detect the existing-but-unstarted
case in the guard and re-attempt Start on the wrapper. The fresh-enable
path keeps its existing handler-attachment and tools_changed notify;
the retry path reuses the wrapper as-is, avoiding a spurious
tool-surface-change signal.

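StartableToolSet.Start is idempotent and single-flight, so when the
previous attempt left the wrapper in the not-started state the retry
re-invokes the inner connect. The fresh-enable bookkeeping (toolset
creation, handler attachment, t.enabled registration, tools_changed
notify) is skipped on that path.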
diff --git a/pkg/tools/builtin/mcpcatalog/mcpcatalog.go b/pkg/tools/builtin/mcpcatalog/mcpcatalog.go
@@ -716,60 +716,65 @@ func (t *Toolset) handleEnable(ctx context.Context, args EnableArgs) (*tools.Too
 			id, strings.Join(missing, ", "), ToolNameEnable)), nil
 	}
 
+	// Two paths land here: a fresh enable (no entry yet) and a re-enable
+	// of an existing entry whose previous Start did not complete (deferred
+	// auth-required fallback, cancellation, or any other transient
+	// failure). Both must converge on a Start attempt, otherwise the
+	// model-facing retry instructions in the failure branches dead-end at
+	// the guard.
 	t.mu.Lock()
-	if existing, exists := t.enabled[id]; exists {
-		started := existing.IsStarted()
+	wrapped, alreadyEnabled := t.enabled[id]
+	if alreadyEnabled && wrapped.IsStarted() {
 		t.mu.Unlock()
-		// Idempotent re-enable. If the toolset is already started, its
-		// tools are live; otherwise the previous enable hit a deferred-
-		// auth fallback and the user can retry via disable+enable.
-		if started {
-			return tools.ResultSuccess(fmt.Sprintf(
-				"server %q is already enabled and connected. Its tools (names starting with %q) are live; proceed with the user's original request using them.",
-				id, id+"_")), nil
-		}
+		// Live entry — nothing to do.
 		return tools.ResultSuccess(fmt.Sprintf(
-			"server %q is already enabled but not yet connected (likely waiting on user authorization). If the user has just authorized, retry their original request; otherwise tell them the connection still needs authorization.",
-			id)), nil
+			"server %q is already enabled and connected. Its tools (names starting with %q) are live; proceed with the user's original request using them.",
+			id, id+"_")), nil
 	}
 
-	// Create the MCP toolset with the pre-computed headers.
-	// The nil third arg (*latest.RemoteOAuthConfig) is intentional: every
-	// server currently in the catalog works with default Dynamic Client
-	// Registration and the runtime's default callback. If a future entry
-	// needs custom scopes / a fixed client_id / a non-default callback,
-	// extend Auth in servers.go and plumb the resulting *RemoteOAuthConfig
-	// through here.
-	mcpToolset := mcp.NewRemoteToolset(id, server.URL, server.Transport, headers, nil)
-
-	// Re-attach the captured handlers so OAuth flows behave identically to
-	// a YAML-declared mcp.remote toolset. Apply BEFORE wrapping so we hit
-	// the *mcp.Toolset's typed setters directly without a tools.As walk.
-	if t.elicitationHandler != nil {
-		mcpToolset.SetElicitationHandler(t.elicitationHandler)
-	}
-	if t.oauthSuccessHandler != nil {
-		mcpToolset.SetOAuthSuccessHandler(t.oauthSuccessHandler)
-	}
-	if t.toolsChangedHandler != nil {
-		mcpToolset.SetToolsChangedHandler(t.toolsChangedHandler)
-	}
-	if t.managedOAuthSet {
-		mcpToolset.SetManagedOAuth(t.managedOAuth)
-	}
-	if t.unmanagedOAuthRedirectURI != "" {
-		mcpToolset.SetUnmanagedOAuthRedirectURI(t.unmanagedOAuthRedirectURI)
-	}
+	var notify func()
+	if !alreadyEnabled {
+		// Create the MCP toolset with the pre-computed headers.
+		// The nil third arg (*latest.RemoteOAuthConfig) is intentional:
+		// every server currently in the catalog works with default
+		// Dynamic Client Registration and the runtime's default callback.
+		// If a future entry needs custom scopes / a fixed client_id / a
+		// non-default callback, extend Auth in servers.go and plumb the
+		// resulting *RemoteOAuthConfig through here.
+		mcpToolset := mcp.NewRemoteToolset(id, server.URL, server.Transport, headers, nil)
+
+		// Re-attach the captured handlers so OAuth flows behave
+		// identically to a YAML-declared mcp.remote toolset. Apply
+		// BEFORE wrapping so we hit the *mcp.Toolset's typed setters
+		// directly without a tools.As walk.
+		if t.elicitationHandler != nil {
+			mcpToolset.SetElicitationHandler(t.elicitationHandler)
+		}
+		if t.oauthSuccessHandler != nil {
+			mcpToolset.SetOAuthSuccessHandler(t.oauthSuccessHandler)
+		}
+		if t.toolsChangedHandler != nil {
+			mcpToolset.SetToolsChangedHandler(t.toolsChangedHandler)
+		}
+		if t.managedOAuthSet {
+			mcpToolset.SetManagedOAuth(t.managedOAuth)
+		}
+		if t.unmanagedOAuthRedirectURI != "" {
+			mcpToolset.SetUnmanagedOAuthRedirectURI(t.unmanagedOAuthRedirectURI)
+		}
 
-	wrapped := tools.NewStartable(mcpToolset)
-	t.enabled[id] = wrapped
-	notify := t.toolsChangedHandler
+		wrapped = tools.NewStartable(mcpToolset)
+		t.enabled[id] = wrapped
+		notify = t.toolsChangedHandler
+	}
 	t.mu.Unlock()
 
 	// Notify the runtime that the meta-tool surface changed (disable /
 	// reset_auth become visible the moment the entry lands in t.enabled).
 	// Done BEFORE the synchronous Start so the TUI can reflect the
 	// pending server while the (potentially slow) OAuth dialog is open.
+	// Only the first-time enable notifies; a retry on an existing entry
+	// has no surface change to report.
 	if notify != nil {
 		notify()
 	}
@@ -784,6 +789,10 @@ func (t *Toolset) handleEnable(ctx context.Context, args EnableArgs) (*tools.Too
 	// handleEnable returns, the toolset is started, the runtime's next
 	// getTools() picks up its tools, and the model can call them in its
 	// follow-up turn — no second user message required.
+	//
+	// StartableToolSet.Start is idempotent and single-flight: on a retry
+	// path it re-invokes the inner Start when the previous attempt left
+	// the wrapper in the not-started state.
 	if err := t.startToolset(ctx, wrapped); err != nil {
 		return t.handleEnableStartError(ctx, id, server, wrapped, err), nil
 	}
@@ -796,18 +805,26 @@ func (t *Toolset) handleEnable(ctx context.Context, args EnableArgs) (*tools.Too
 // handleEnableStartError translates a failed Start() into a model-facing
 // result and, where appropriate, rolls back the t.enabled bookkeeping so
 // the next Tools() enumeration doesn't replay the same failing handshake.
-// The three branches mirror the cases the model needs to distinguish:
+// The four branches mirror the cases the model needs to distinguish:
 //
 //   - OAuthDeclined → user actively dismissed the dialog. Drop the entry
 //     (mirrors the existing Tools() handling) and tell the model to ask
-//     before retrying.
+//     before retrying. A subsequent enable for the same id will land on
+//     a fresh entry and run a fresh OAuth flow.
 //   - AuthorizationRequired → defensive fallback for the (rare) case where
 //     the elicitation bridge isn't wired up yet. Keep the entry so the
-//     next interactive Tools() call can surface the OAuth dialog, and
-//     fall back to the legacy "tools appear next turn" wording.
-//   - any other error (transport, server refused, context cancelled) →
-//     drop the entry and surface the underlying message so the model can
-//     decide what to tell the user.
+//     next interactive Tools() call can surface the OAuth dialog. A
+//     subsequent enable for the same id is funnelled through
+//     handleEnable's top-of-function guard, which sees an unstarted
+//     entry and re-attempts Start — that is what makes the model-facing
+//     "call enable again" instruction below actually work.
+//   - context.Canceled → leave the entry. If the cancellation tore down
+//     the whole session, Toolset.Stop will clean it up; if it was a
+//     softer per-turn cancellation, the next enable's top-of-function
+//     guard re-attempts Start on the existing wrapper.
+//   - any other error (transport, server refused, …) → drop the entry
+//     and surface the underlying message so the model can decide what to
+//     tell the user. A subsequent enable lands on a fresh entry.
 func (t *Toolset) handleEnableStartError(ctx context.Context, id string, server Server, wrapped *tools.StartableToolSet, err error) *tools.ToolCallResult {
 	switch {
 	case mcp.IsOAuthDeclined(err):
@@ -816,17 +833,21 @@ func (t *Toolset) handleEnableStartError(ctx context.Context, id string, server
 			"user declined the authorization dialog for %q (%s). No tools were activated — do NOT claim the server is connected and do NOT call any %q tools. Tell the user the request needs them to authorize the connection. If the user then says \"yes\", \"retry\", or re-asks for the same thing, call %s for %q again to surface a fresh authorization dialog.",
 			id, server.Title, id+"_", ToolNameEnable, id))
 	case mcp.IsAuthorizationRequired(err):
-		slog.DebugContext(ctx, "Remote MCP server enable deferred: authorization required, leaving in enabled set for next interactive Tools() to retry",
+		slog.DebugContext(ctx, "Remote MCP server enable deferred: authorization required, leaving in enabled set for next interactive Tools() / enable to retry",
 			"id", id, "error", err)
 		return tools.ResultSuccess(fmt.Sprintf(
 			"enable requested for %q (%s); authorization is required and the host will surface the dialog. On your next turn, if tools whose names start with %q appear in your available tools, proceed with the user's original request using them. If NO such tools appear, the user dismissed the dialog — tell them the request needs them to authorize, and call %s for %q again if they want to retry.",
 			id, server.Title, id+"_", ToolNameEnable, id))
 	case errors.Is(err, context.Canceled):
-		// Don't roll back: cancellation is the runtime tearing down the
-		// turn (Ctrl+C, parent cancellation). The cleanup paths upstream
-		// (Toolset.Stop) own the lifecycle here.
+		// Don't roll back. Full-session cancellation (Ctrl+C, parent
+		// shutdown): Toolset.Stop tears the entry down with the rest of
+		// the session. Softer per-turn cancel: the next enable's guard
+		// re-attempts Start on the existing unstarted wrapper, so the
+		// model's retry instruction still has somewhere to land. Deadline
+		// expiry (context.DeadlineExceeded) is not matched by this case.
 		return tools.ResultError(fmt.Sprintf(
-			"enable cancelled for %q before the connection completed.", id))
+			"enable cancelled for %q before the connection completed. Call %s for %q again to retry once the user is ready.",
+			id, ToolNameEnable, id))
 	default:
 		t.disableAfterDecline(ctx, id, wrapped)
 		return tools.ResultError(fmt.Sprintf(
diff --git a/pkg/tools/builtin/mcpcatalog/mcpcatalog_test.go b/pkg/tools/builtin/mcpcatalog/mcpcatalog_test.go
@@ -161,13 +161,18 @@ func TestEnableDisableLifecycle(t *testing.T) {
 	ts.mu.RUnlock()
 	assert.True(t, exists)
 
-	// Re-enable: idempotent, no extra change notification. The success
-	// re-enable must tell the model the tools are still live so it does
-	// not stop to "set up" the connection again.
+	// Re-enable on a registered-but-still-unstarted entry: the guard at
+	// the top of handleEnable falls through to the Start retry path
+	// (otherwise the retry instructions emitted by the AuthorizationRequired
+	// / Canceled branches would dead-end at the guard). No extra
+	// tools_changed notification: the entry was already in t.enabled.
 	res, err = ts.handleEnable(ctx, EnableArgs{ID: oauthID})
 	require.NoError(t, err)
-	assert.Contains(t, res.Output, "already enabled")
-	assert.Equal(t, int32(1), changes.Load())
+	require.False(t, res.IsError, "re-enable: %s", res.Output)
+	assert.Contains(t, res.Output, "enabled",
+		"re-enable must report success — the Start retry succeeded under stubStartOK")
+	assert.Equal(t, int32(1), changes.Load(),
+		"re-enable of an existing entry must not fire tools-changed again")
 
 	// Search now reports it as enabled.
 	res, err = ts.handleSearch(ctx, SearchArgs{Query: oauthID})
@@ -579,6 +584,127 @@ func TestEnableSyncStartTransportError(t *testing.T) {
 		"a failed enable must roll back t.enabled so the next Tools() call does not replay the failure")
 }
 
+// flakyStartToolSet is a Startable test fake whose Start fails N times
+// before returning nil. It exists so the retry-on-second-enable regression
+// tests can assert that the model's retry instructions actually drive a
+// fresh Start attempt instead of dead-ending at the idempotent guard.
+type flakyStartToolSet struct {
+	startCalls atomic.Int32
+	failures   int   // number of times Start should fail before succeeding
+	failWith   error // sentinel returned on each failure
+}
+
+func (f *flakyStartToolSet) Tools(context.Context) ([]tools.Tool, error) {
+	return nil, nil
+}
+
+func (f *flakyStartToolSet) Start(context.Context) error {
+	n := f.startCalls.Add(1)
+	if int(n) <= f.failures {
+		return f.failWith
+	}
+	return nil
+}
+
+func (f *flakyStartToolSet) Stop(context.Context) error { return nil }
+
+// TestEnableRetriesStartOnExistingUnstartedEntry is the regression test
+// for the AuthorizationRequired-branch dead-end: the model-facing retry
+// instruction must actually drive a second Start attempt. Before the
+// fix, the guard at the top of handleEnable short-circuited every retry
+// into the "already enabled but not yet connected" message without
+// invoking the startToolset seam, so the OAuth dialog never re-surfaced
+// and the only escape was disable+enable.
+func TestEnableRetriesStartOnExistingUnstartedEntry(t *testing.T) {
+	ts := New(stubEnv{vars: map[string]string{}})
+
+	id := firstOAuthServerID(t, ts)
+	fake := &flakyStartToolSet{
+		failures: 1,
+		failWith: &mcptools.AuthorizationRequiredError{URL: ts.byID[id].URL},
+	}
+	ts.startToolset = func(ctx context.Context, _ *tools.StartableToolSet) error {
+		return fake.Start(ctx)
+	}
+
+	var changes atomic.Int32
+	ts.SetToolsChangedHandler(func() { changes.Add(1) })
+
+	// First enable: defensive AuthorizationRequired fallback. Entry stays
+	// in t.enabled, message tells the model to call enable again.
+	res, err := ts.handleEnable(t.Context(), EnableArgs{ID: id})
+	require.NoError(t, err)
+	require.False(t, res.IsError, "deferred-auth must not be returned as an error: %s", res.Output)
+	assert.Contains(t, res.Output, "next turn",
+		"the AuthRequired branch must surface its deferred-auth wording on first failure")
+
+	ts.mu.RLock()
+	_, stillEnabled := ts.enabled[id]
+	ts.mu.RUnlock()
+	require.True(t, stillEnabled, "AuthRequired must leave the entry in t.enabled for retry")
+
+	// Second enable on the same id: the guard must NOT short-circuit on
+	// the existing unstarted entry. It must invoke Start again so the
+	// model's "call enable again" instruction actually drives a retry.
+	res, err = ts.handleEnable(t.Context(), EnableArgs{ID: id})
+	require.NoError(t, err)
+	require.False(t, res.IsError, "retry: %s", res.Output)
+	assert.Contains(t, res.Output, "enabled",
+		"re-enable must report success once the underlying Start succeeds")
+	assert.Contains(t, res.Output, id+"_",
+		"re-enable must reference the tool-name prefix so the model knows the tools are live")
+
+	assert.Equal(t, int32(2), fake.startCalls.Load(),
+		"re-enable must invoke Start a second time — the AuthRequired fallback's retry instruction depends on it")
+	// Only the first enable registers the entry; the retry reuses it, so
+	// the tools-changed notification fires exactly once across both calls.
+	assert.Equal(t, int32(1), changes.Load(),
+		"re-enable of an existing entry must NOT re-fire tools-changed — it would falsely signal a tool-surface change")
+}
+
+// TestEnableRetriesStartAfterCancellation is the regression test for the
+// context.Canceled branch: when Start is cancelled, the entry stays in
+// t.enabled, and a subsequent enable must drive Start again rather than
+// dead-ending at the guard. This covers the soft per-turn cancellation
+// case where Toolset.Stop does not run between the cancelled enable and
+// the retry.
+func TestEnableRetriesStartAfterCancellation(t *testing.T) {
+	ts := New(stubEnv{vars: map[string]string{}})
+
+	id := firstOAuthServerID(t, ts)
+	fake := &flakyStartToolSet{failures: 1, failWith: context.Canceled}
+	ts.startToolset = func(ctx context.Context, _ *tools.StartableToolSet) error {
+		return fake.Start(ctx)
+	}
+
+	// First enable: Start returns context.Canceled. The handler must
+	// surface a tool error AND leave the entry behind so a retry has
+	// somewhere to land.
+	res, err := ts.handleEnable(t.Context(), EnableArgs{ID: id})
+	require.NoError(t, err)
+	require.True(t, res.IsError, "cancelled Start must surface a tool error: %s", res.Output)
+	assert.Contains(t, res.Output, "cancelled",
+		"the error must name cancellation so the model can explain to the user")
+	assert.Contains(t, res.Output, ToolNameEnable,
+		"the error must instruct the model how to retry")
+
+	ts.mu.RLock()
+	_, stillEnabled := ts.enabled[id]
+	ts.mu.RUnlock()
+	require.True(t, stillEnabled,
+		"context.Canceled must leave the entry so the next enable can retry — the alternative would be relying on Toolset.Stop firing between turns, which is a fragile invariant")
+
+	// Second enable: must drive Start again, not short-circuit at the
+	// guard with a "still not connected" message. Otherwise the user is
+	// stuck and the only escape is disable+enable.
+	res, err = ts.handleEnable(t.Context(), EnableArgs{ID: id})
+	require.NoError(t, err)
+	require.False(t, res.IsError, "retry after cancel: %s", res.Output)
+	assert.Contains(t, res.Output, "enabled")
+	assert.Equal(t, int32(2), fake.startCalls.Load(),
+		"the retry must invoke Start a second time — the post-cancel recovery depends on it")
+}
+
 func TestListEnabled(t *testing.T) {
 	ts := New(stubEnv{vars: map[string]string{}})
 	stubStartOK(ts)