Skip to content

Commit 33d73f5

Browse files
committed
fix(mcp_catalog): retry Start on existing unstarted entries
The idempotent guard at the top of handleEnable short-circuited every re-enable on a registered entry, regardless of whether the previous Start had actually completed. This created two dead-ends the model could not escape on its own: - The AuthorizationRequired branch tells the model to call enable again to surface a fresh OAuth dialog, but the second call was intercepted by the guard with an "already enabled but not yet connected" message. The dialog never re-appeared. - A context.Canceled mid-handshake (Ctrl+C, soft per-turn cancel) left an unstarted entry behind. A subsequent enable hit the same guard. Recovery relied on Toolset.Stop firing between turns to clean the entry up — an invariant the catalog can't enforce. Both paths converge on the same fix: detect the existing-but-unstarted case in the guard and re-attempt Start on the wrapper. The fresh-enable path keeps its existing handler-attachment and tools_changed notify; the retry path reuses the wrapper as-is, avoiding a spurious tool-surface-change signal. StartableToolSet.Start is idempotent and single-flight, so the retry re-invokes the inner connect without re-running the bookkeeping when the previous attempt left the wrapper in the not-started state.
1 parent 822d654 commit 33d73f5

2 files changed

Lines changed: 206 additions & 59 deletions

File tree

pkg/tools/builtin/mcpcatalog/mcpcatalog.go

Lines changed: 75 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -716,60 +716,65 @@ func (t *Toolset) handleEnable(ctx context.Context, args EnableArgs) (*tools.Too
716716
id, strings.Join(missing, ", "), ToolNameEnable)), nil
717717
}
718718

719+
// Two paths land here: a fresh enable (no entry yet) and a re-enable
720+
// of an existing entry whose previous Start did not complete (deferred
721+
// auth-required fallback, cancellation, or any other transient
722+
// failure). Both must converge on a Start attempt, otherwise the
723+
// model-facing retry instructions in the failure branches dead-end at
724+
// the guard.
719725
t.mu.Lock()
720-
if existing, exists := t.enabled[id]; exists {
721-
started := existing.IsStarted()
726+
wrapped, alreadyEnabled := t.enabled[id]
727+
if alreadyEnabled && wrapped.IsStarted() {
722728
t.mu.Unlock()
723-
// Idempotent re-enable. If the toolset is already started, its
724-
// tools are live; otherwise the previous enable hit a deferred-
725-
// auth fallback and the user can retry via disable+enable.
726-
if started {
727-
return tools.ResultSuccess(fmt.Sprintf(
728-
"server %q is already enabled and connected. Its tools (names starting with %q) are live; proceed with the user's original request using them.",
729-
id, id+"_")), nil
730-
}
729+
// Live entry — nothing to do.
731730
return tools.ResultSuccess(fmt.Sprintf(
732-
"server %q is already enabled but not yet connected (likely waiting on user authorization). If the user has just authorized, retry their original request; otherwise tell them the connection still needs authorization.",
733-
id)), nil
731+
"server %q is already enabled and connected. Its tools (names starting with %q) are live; proceed with the user's original request using them.",
732+
id, id+"_")), nil
734733
}
735734

736-
// Create the MCP toolset with the pre-computed headers.
737-
// The nil third arg (*latest.RemoteOAuthConfig) is intentional: every
738-
// server currently in the catalog works with default Dynamic Client
739-
// Registration and the runtime's default callback. If a future entry
740-
// needs custom scopes / a fixed client_id / a non-default callback,
741-
// extend Auth in servers.go and plumb the resulting *RemoteOAuthConfig
742-
// through here.
743-
mcpToolset := mcp.NewRemoteToolset(id, server.URL, server.Transport, headers, nil)
744-
745-
// Re-attach the captured handlers so OAuth flows behave identically to
746-
// a YAML-declared mcp.remote toolset. Apply BEFORE wrapping so we hit
747-
// the *mcp.Toolset's typed setters directly without a tools.As walk.
748-
if t.elicitationHandler != nil {
749-
mcpToolset.SetElicitationHandler(t.elicitationHandler)
750-
}
751-
if t.oauthSuccessHandler != nil {
752-
mcpToolset.SetOAuthSuccessHandler(t.oauthSuccessHandler)
753-
}
754-
if t.toolsChangedHandler != nil {
755-
mcpToolset.SetToolsChangedHandler(t.toolsChangedHandler)
756-
}
757-
if t.managedOAuthSet {
758-
mcpToolset.SetManagedOAuth(t.managedOAuth)
759-
}
760-
if t.unmanagedOAuthRedirectURI != "" {
761-
mcpToolset.SetUnmanagedOAuthRedirectURI(t.unmanagedOAuthRedirectURI)
762-
}
735+
var notify func()
736+
if !alreadyEnabled {
737+
// Create the MCP toolset with the pre-computed headers.
738+
// The nil third arg (*latest.RemoteOAuthConfig) is intentional:
739+
// every server currently in the catalog works with default
740+
// Dynamic Client Registration and the runtime's default callback.
741+
// If a future entry needs custom scopes / a fixed client_id / a
742+
// non-default callback, extend Auth in servers.go and plumb the
743+
// resulting *RemoteOAuthConfig through here.
744+
mcpToolset := mcp.NewRemoteToolset(id, server.URL, server.Transport, headers, nil)
745+
746+
// Re-attach the captured handlers so OAuth flows behave
747+
// identically to a YAML-declared mcp.remote toolset. Apply
748+
// BEFORE wrapping so we hit the *mcp.Toolset's typed setters
749+
// directly without a tools.As walk.
750+
if t.elicitationHandler != nil {
751+
mcpToolset.SetElicitationHandler(t.elicitationHandler)
752+
}
753+
if t.oauthSuccessHandler != nil {
754+
mcpToolset.SetOAuthSuccessHandler(t.oauthSuccessHandler)
755+
}
756+
if t.toolsChangedHandler != nil {
757+
mcpToolset.SetToolsChangedHandler(t.toolsChangedHandler)
758+
}
759+
if t.managedOAuthSet {
760+
mcpToolset.SetManagedOAuth(t.managedOAuth)
761+
}
762+
if t.unmanagedOAuthRedirectURI != "" {
763+
mcpToolset.SetUnmanagedOAuthRedirectURI(t.unmanagedOAuthRedirectURI)
764+
}
763765

764-
wrapped := tools.NewStartable(mcpToolset)
765-
t.enabled[id] = wrapped
766-
notify := t.toolsChangedHandler
766+
wrapped = tools.NewStartable(mcpToolset)
767+
t.enabled[id] = wrapped
768+
notify = t.toolsChangedHandler
769+
}
767770
t.mu.Unlock()
768771

769772
// Notify the runtime that the meta-tool surface changed (disable /
770773
// reset_auth become visible the moment the entry lands in t.enabled).
771774
// Done BEFORE the synchronous Start so the TUI can reflect the
772775
// pending server while the (potentially slow) OAuth dialog is open.
776+
// Only the first-time enable notifies; a retry on an existing entry
777+
// has no surface change to report.
773778
if notify != nil {
774779
notify()
775780
}
@@ -784,6 +789,10 @@ func (t *Toolset) handleEnable(ctx context.Context, args EnableArgs) (*tools.Too
784789
// handleEnable returns, the toolset is started, the runtime's next
785790
// getTools() picks up its tools, and the model can call them in its
786791
// follow-up turn — no second user message required.
792+
//
793+
// StartableToolSet.Start is idempotent and single-flight: on a retry
794+
// path it re-invokes the inner Start when the previous attempt left
795+
// the wrapper in the not-started state.
787796
if err := t.startToolset(ctx, wrapped); err != nil {
788797
return t.handleEnableStartError(ctx, id, server, wrapped, err), nil
789798
}
@@ -796,18 +805,26 @@ func (t *Toolset) handleEnable(ctx context.Context, args EnableArgs) (*tools.Too
796805
// handleEnableStartError translates a failed Start() into a model-facing
797806
// result and, where appropriate, rolls back the t.enabled bookkeeping so
798807
// the next Tools() enumeration doesn't replay the same failing handshake.
799-
// The three branches mirror the cases the model needs to distinguish:
808+
// The four branches mirror the cases the model needs to distinguish:
800809
//
801810
// - OAuthDeclined → user actively dismissed the dialog. Drop the entry
802811
// (mirrors the existing Tools() handling) and tell the model to ask
803-
// before retrying.
812+
// before retrying. A subsequent enable for the same id will land on
813+
// a fresh entry and run a fresh OAuth flow.
804814
// - AuthorizationRequired → defensive fallback for the (rare) case where
805815
// the elicitation bridge isn't wired up yet. Keep the entry so the
806-
// next interactive Tools() call can surface the OAuth dialog, and
807-
// fall back to the legacy "tools appear next turn" wording.
808-
// - any other error (transport, server refused, context cancelled) →
809-
// drop the entry and surface the underlying message so the model can
810-
// decide what to tell the user.
816+
// next interactive Tools() call can surface the OAuth dialog. A
817+
// subsequent enable for the same id is funnelled through
818+
// handleEnable's top-of-function guard, which sees an unstarted
819+
// entry and re-attempts Start — that is what makes the model-facing
820+
// "call enable again" instruction below actually work.
821+
// - context.Canceled → leave the entry. If the cancellation tore down
822+
// the whole session, Toolset.Stop will clean it up; if it was a
823+
// softer per-turn cancellation, the next enable's top-of-function
824+
// guard re-attempts Start on the existing wrapper.
825+
// - any other error (transport, server refused, …) → drop the entry
826+
// and surface the underlying message so the model can decide what to
827+
// tell the user. A subsequent enable lands on a fresh entry.
811828
func (t *Toolset) handleEnableStartError(ctx context.Context, id string, server Server, wrapped *tools.StartableToolSet, err error) *tools.ToolCallResult {
812829
switch {
813830
case mcp.IsOAuthDeclined(err):
@@ -816,17 +833,21 @@ func (t *Toolset) handleEnableStartError(ctx context.Context, id string, server
816833
"user declined the authorization dialog for %q (%s). No tools were activated — do NOT claim the server is connected and do NOT call any %q tools. Tell the user the request needs them to authorize the connection. If the user then says \"yes\", \"retry\", or re-asks for the same thing, call %s for %q again to surface a fresh authorization dialog.",
817834
id, server.Title, id+"_", ToolNameEnable, id))
818835
case mcp.IsAuthorizationRequired(err):
819-
slog.DebugContext(ctx, "Remote MCP server enable deferred: authorization required, leaving in enabled set for next interactive Tools() to retry",
836+
slog.DebugContext(ctx, "Remote MCP server enable deferred: authorization required, leaving in enabled set for next interactive Tools() / enable to retry",
820837
"id", id, "error", err)
821838
return tools.ResultSuccess(fmt.Sprintf(
822839
"enable requested for %q (%s); authorization is required and the host will surface the dialog. On your next turn, if tools whose names start with %q appear in your available tools, proceed with the user's original request using them. If NO such tools appear, the user dismissed the dialog — tell them the request needs them to authorize, and call %s for %q again if they want to retry.",
823840
id, server.Title, id+"_", ToolNameEnable, id))
824841
case errors.Is(err, context.Canceled):
825-
// Don't roll back: cancellation is the runtime tearing down the
826-
// turn (Ctrl+C, parent cancellation). The cleanup paths upstream
827-
// (Toolset.Stop) own the lifecycle here.
842+
// Don't roll back. If this is full-session cancellation (Ctrl+C,
843+
// parent shutdown), Toolset.Stop will tear the entry down with
844+
// the rest of the session; if it's softer (per-turn cancel /
845+
// timeout), the next enable's top-of-function guard re-attempts
846+
// Start on the existing unstarted wrapper, so the model's retry
847+
// instruction still has somewhere to land.
828848
return tools.ResultError(fmt.Sprintf(
829-
"enable cancelled for %q before the connection completed.", id))
849+
"enable cancelled for %q before the connection completed. Call %s for %q again to retry once the user is ready.",
850+
id, ToolNameEnable, id))
830851
default:
831852
t.disableAfterDecline(ctx, id, wrapped)
832853
return tools.ResultError(fmt.Sprintf(

pkg/tools/builtin/mcpcatalog/mcpcatalog_test.go

Lines changed: 131 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -161,13 +161,18 @@ func TestEnableDisableLifecycle(t *testing.T) {
161161
ts.mu.RUnlock()
162162
assert.True(t, exists)
163163

164-
// Re-enable: idempotent, no extra change notification. The success
165-
// re-enable must tell the model the tools are still live so it does
166-
// not stop to "set up" the connection again.
164+
// Re-enable on a registered-but-still-unstarted entry: the guard at
165+
// the top of handleEnable falls through to the Start retry path
166+
// (otherwise the retry instructions emitted by the AuthorizationRequired
167+
// / Canceled branches would dead-end at the guard). No extra
168+
// tools_changed notification: the entry was already in t.enabled.
167169
res, err = ts.handleEnable(ctx, EnableArgs{ID: oauthID})
168170
require.NoError(t, err)
169-
assert.Contains(t, res.Output, "already enabled")
170-
assert.Equal(t, int32(1), changes.Load())
171+
require.False(t, res.IsError, "re-enable: %s", res.Output)
172+
assert.Contains(t, res.Output, "enabled",
173+
"re-enable must report success — the Start retry succeeded under stubStartOK")
174+
assert.Equal(t, int32(1), changes.Load(),
175+
"re-enable of an existing entry must not fire tools-changed again")
171176

172177
// Search now reports it as enabled.
173178
res, err = ts.handleSearch(ctx, SearchArgs{Query: oauthID})
@@ -579,6 +584,127 @@ func TestEnableSyncStartTransportError(t *testing.T) {
579584
"a failed enable must roll back t.enabled so the next Tools() call does not replay the failure")
580585
}
581586

587+
// flakyStartToolSet is a Startable test fake whose Start fails N times
588+
// before returning nil. It exists so the retry-on-second-enable regression
589+
// tests can assert that the model's retry instructions actually drive a
590+
// fresh Start attempt instead of dead-ending at the idempotent guard.
591+
type flakyStartToolSet struct {
592+
startCalls atomic.Int32
593+
failures int // number of times Start should fail before succeeding
594+
failWith error // sentinel returned on each failure
595+
}
596+
597+
func (f *flakyStartToolSet) Tools(context.Context) ([]tools.Tool, error) {
598+
return nil, nil
599+
}
600+
601+
func (f *flakyStartToolSet) Start(context.Context) error {
602+
n := f.startCalls.Add(1)
603+
if int(n) <= f.failures {
604+
return f.failWith
605+
}
606+
return nil
607+
}
608+
609+
func (f *flakyStartToolSet) Stop(context.Context) error { return nil }
610+
611+
// TestEnableRetriesStartOnExistingUnstartedEntry is the regression test
612+
// for the AuthorizationRequired-branch dead-end the reviewer flagged: the
613+
// model-facing retry instruction must actually drive a second Start
614+
// attempt. Before the fix, the top-of-handleEnable guard short-circuited
615+
// every retry into the "already enabled but not yet connected" message
616+
// without invoking the seam, so the OAuth dialog never re-surfaced and
617+
// the only escape was disable+enable.
618+
func TestEnableRetriesStartOnExistingUnstartedEntry(t *testing.T) {
619+
ts := New(stubEnv{vars: map[string]string{}})
620+
621+
id := firstOAuthServerID(t, ts)
622+
fake := &flakyStartToolSet{
623+
failures: 1,
624+
failWith: &mcptools.AuthorizationRequiredError{URL: ts.byID[id].URL},
625+
}
626+
ts.startToolset = func(ctx context.Context, _ *tools.StartableToolSet) error {
627+
return fake.Start(ctx)
628+
}
629+
630+
var changes atomic.Int32
631+
ts.SetToolsChangedHandler(func() { changes.Add(1) })
632+
633+
// First enable: defensive AuthorizationRequired fallback. Entry stays
634+
// in t.enabled, message tells the model to call enable again.
635+
res, err := ts.handleEnable(t.Context(), EnableArgs{ID: id})
636+
require.NoError(t, err)
637+
require.False(t, res.IsError, "deferred-auth must not be returned as an error: %s", res.Output)
638+
assert.Contains(t, res.Output, "next turn",
639+
"the AuthRequired branch must surface its deferred-auth wording on first failure")
640+
641+
ts.mu.RLock()
642+
_, stillEnabled := ts.enabled[id]
643+
ts.mu.RUnlock()
644+
require.True(t, stillEnabled, "AuthRequired must leave the entry in t.enabled for retry")
645+
646+
// Second enable on the same id: the guard must NOT short-circuit on
647+
// the existing unstarted entry. It must invoke Start again so the
648+
// model's "call enable again" instruction actually drives a retry.
649+
res, err = ts.handleEnable(t.Context(), EnableArgs{ID: id})
650+
require.NoError(t, err)
651+
require.False(t, res.IsError, "retry: %s", res.Output)
652+
assert.Contains(t, res.Output, "enabled",
653+
"re-enable must report success once the underlying Start succeeds")
654+
assert.Contains(t, res.Output, id+"_",
655+
"re-enable must reference the tool-name prefix so the model knows the tools are live")
656+
657+
assert.Equal(t, int32(2), fake.startCalls.Load(),
658+
"re-enable must invoke Start a second time — the AuthRequired fallback's retry instruction depends on it")
659+
// Only the first enable registers the entry; the retry reuses it, so
660+
// the tools-changed notification fires exactly once across both calls.
661+
assert.Equal(t, int32(1), changes.Load(),
662+
"re-enable of an existing entry must NOT re-fire tools-changed — it would falsely signal a tool-surface change")
663+
}
664+
665+
// TestEnableRetriesStartAfterCancellation is the regression test for the
666+
// context.Canceled branch the reviewer flagged: when Start is cancelled,
667+
// the entry stays in t.enabled, and a subsequent enable must drive Start
668+
// again rather than dead-ending at the guard. This covers the soft
669+
// per-turn cancellation case where Toolset.Stop has not (and will not)
670+
// run between the cancelled enable and the retry.
671+
func TestEnableRetriesStartAfterCancellation(t *testing.T) {
672+
ts := New(stubEnv{vars: map[string]string{}})
673+
674+
id := firstOAuthServerID(t, ts)
675+
fake := &flakyStartToolSet{failures: 1, failWith: context.Canceled}
676+
ts.startToolset = func(ctx context.Context, _ *tools.StartableToolSet) error {
677+
return fake.Start(ctx)
678+
}
679+
680+
// First enable: Start returns context.Canceled. The handler must
681+
// surface a tool error AND leave the entry behind so a retry has
682+
// somewhere to land.
683+
res, err := ts.handleEnable(t.Context(), EnableArgs{ID: id})
684+
require.NoError(t, err)
685+
require.True(t, res.IsError, "cancelled Start must surface a tool error: %s", res.Output)
686+
assert.Contains(t, res.Output, "cancelled",
687+
"the error must name cancellation so the model can explain to the user")
688+
assert.Contains(t, res.Output, ToolNameEnable,
689+
"the error must instruct the model how to retry")
690+
691+
ts.mu.RLock()
692+
_, stillEnabled := ts.enabled[id]
693+
ts.mu.RUnlock()
694+
require.True(t, stillEnabled,
695+
"context.Canceled must leave the entry so the next enable can retry — the alternative would be relying on Toolset.Stop firing between turns, which is a fragile invariant")
696+
697+
// Second enable: must drive Start again, not short-circuit at the
698+
// guard with a "still not connected" message. Otherwise the user is
699+
// stuck and the only escape is disable+enable.
700+
res, err = ts.handleEnable(t.Context(), EnableArgs{ID: id})
701+
require.NoError(t, err)
702+
require.False(t, res.IsError, "retry after cancel: %s", res.Output)
703+
assert.Contains(t, res.Output, "enabled")
704+
assert.Equal(t, int32(2), fake.startCalls.Load(),
705+
"the retry must invoke Start a second time — the post-cancel recovery depends on it")
706+
}
707+
582708
func TestListEnabled(t *testing.T) {
583709
ts := New(stubEnv{vars: map[string]string{}})
584710
stubStartOK(ts)

0 commit comments

Comments
 (0)