Skip to content

Commit a76e810

Browse files
authored
fix(gateway): harden token fallback/reconnect behavior and docs (openclaw#42507)
* fix(gateway): harden token fallback and auth reconnect handling * docs(gateway): clarify auth retry and token-drift recovery * fix(gateway): tighten auth reconnect gating across clients * fix: harden gateway token retry (openclaw#42507) (thanks @joshavant)
1 parent ff2e7a2 commit a76e810

File tree

21 files changed

+1188
-80
lines changed

21 files changed

+1188
-80
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ Docs: https://docs.openclaw.ai
7878
- ACP/sessions_spawn: implicitly stream `mode="run"` ACP spawns to parent only for eligible subagent orchestrator sessions (heartbeat `target: "last"` with a usable session-local route), restoring parent progress relays without thread binding. (#42404) Thanks @davidguttman.
7979
- Sessions/reset model recompute: clear stale runtime model, context-token, and system-prompt metadata before session resets recompute the replacement session, so resets pick up current defaults and explicit overrides instead of reusing old runtime model state. (#41173) thanks @PonyX-lab.
8080
- Browser/Browserbase 429 handling: surface stable no-retry rate-limit guidance without buffering discarded HTTP 429 response bodies from remote browser services. (#40491) thanks @mvanhorn.
81+
- Gateway/auth: allow one trusted device-token retry on shared-token mismatch with recovery hints to prevent reconnect churn during token drift. (#42507) Thanks @joshavant.
8182

8283
## 2026.3.8
8384

apps/shared/OpenClawKit/Sources/OpenClawKit/GatewayChannel.swift

Lines changed: 168 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,41 @@ private let defaultOperatorConnectScopes: [String] = [
131131
"operator.pairing",
132132
]
133133

134+
/// Namespace for the string codes this client compares against the
/// `details.code` value of a failed gateway connect response.
private enum GatewayConnectErrorCodes {
135+
// Shared-token mismatch: the one code that can trigger the bounded device-token retry.
static let authTokenMismatch = "AUTH_TOKEN_MISMATCH"
136+
// Device-token mismatch: after a failed retry this causes the stored device token to be cleared.
static let authDeviceTokenMismatch = "AUTH_DEVICE_TOKEN_MISMATCH"
137+
// The remaining codes are all classified as non-recoverable by
// `GatewayConnectAuthError.isNonRecoverable`.
static let authTokenMissing = "AUTH_TOKEN_MISSING"
138+
static let authPasswordMissing = "AUTH_PASSWORD_MISSING"
139+
static let authPasswordMismatch = "AUTH_PASSWORD_MISMATCH"
140+
static let authRateLimited = "AUTH_RATE_LIMITED"
141+
static let pairingRequired = "PAIRING_REQUIRED"
142+
static let controlUiDeviceIdentityRequired = "CONTROL_UI_DEVICE_IDENTITY_REQUIRED"
143+
static let deviceIdentityRequired = "DEVICE_IDENTITY_REQUIRED"
144+
}
145+
146+
/// Error thrown when the gateway answers a connect request with `ok == false`.
///
/// Carries the server message plus the optional recovery hints read from the
/// response's `details` dictionary, so reconnect logic can branch on them.
private struct GatewayConnectAuthError: LocalizedError {
147+
/// Human-readable failure message; also surfaced as `errorDescription`.
let message: String
148+
/// Value of `details.code` from the response, or `nil` when it was absent or not a string.
let detailCode: String?
149+
/// Value of `details.canRetryWithDeviceToken` from the response (`false` when absent).
let canRetryWithDeviceToken: Bool
150+
151+
var errorDescription: String? { self.message }
152+
153+
/// `true` when `detailCode` is one of the codes listed below.
/// A `nil` or unlisted code falls through to `default` and yields `false`.
var isNonRecoverable: Bool {
154+
switch self.detailCode {
155+
case GatewayConnectErrorCodes.authTokenMissing,
156+
GatewayConnectErrorCodes.authPasswordMissing,
157+
GatewayConnectErrorCodes.authPasswordMismatch,
158+
GatewayConnectErrorCodes.authRateLimited,
159+
GatewayConnectErrorCodes.pairingRequired,
160+
GatewayConnectErrorCodes.controlUiDeviceIdentityRequired,
161+
GatewayConnectErrorCodes.deviceIdentityRequired:
162+
return true
163+
default:
164+
return false
165+
}
166+
}
167+
}
168+
134169
public actor GatewayChannelActor {
135170
private let logger = Logger(subsystem: "ai.openclaw", category: "gateway")
136171
private var task: WebSocketTaskBox?
@@ -160,6 +195,9 @@ public actor GatewayChannelActor {
160195
private var watchdogTask: Task<Void, Never>?
161196
private var tickTask: Task<Void, Never>?
162197
private var keepaliveTask: Task<Void, Never>?
198+
private var pendingDeviceTokenRetry = false
199+
private var deviceTokenRetryBudgetUsed = false
200+
private var reconnectPausedForAuthFailure = false
163201
private let defaultRequestTimeoutMs: Double = 15000
164202
private let pushHandler: (@Sendable (GatewayPush) async -> Void)?
165203
private let connectOptions: GatewayConnectOptions?
@@ -232,10 +270,19 @@ public actor GatewayChannelActor {
232270
while self.shouldReconnect {
233271
guard await self.sleepUnlessCancelled(nanoseconds: 30 * 1_000_000_000) else { return } // 30s cadence
234272
guard self.shouldReconnect else { return }
273+
if self.reconnectPausedForAuthFailure { continue }
235274
if self.connected { continue }
236275
do {
237276
try await self.connect()
238277
} catch {
278+
if self.shouldPauseReconnectAfterAuthFailure(error) {
279+
self.reconnectPausedForAuthFailure = true
280+
self.logger.error(
281+
"gateway watchdog reconnect paused for non-recoverable auth failure " +
282+
"\(error.localizedDescription, privacy: .public)"
283+
)
284+
continue
285+
}
239286
let wrapped = self.wrap(error, context: "gateway watchdog reconnect")
240287
self.logger.error("gateway watchdog reconnect failed \(wrapped.localizedDescription, privacy: .public)")
241288
}
@@ -267,7 +314,12 @@ public actor GatewayChannelActor {
267314
},
268315
operation: { try await self.sendConnect() })
269316
} catch {
270-
let wrapped = self.wrap(error, context: "connect to gateway @ \(self.url.absoluteString)")
317+
let wrapped: Error
318+
if let authError = error as? GatewayConnectAuthError {
319+
wrapped = authError
320+
} else {
321+
wrapped = self.wrap(error, context: "connect to gateway @ \(self.url.absoluteString)")
322+
}
271323
self.connected = false
272324
self.task?.cancel(with: .goingAway, reason: nil)
273325
await self.disconnectHandler?("connect failed: \(wrapped.localizedDescription)")
@@ -281,6 +333,7 @@ public actor GatewayChannelActor {
281333
}
282334
self.listen()
283335
self.connected = true
336+
self.reconnectPausedForAuthFailure = false
284337
self.backoffMs = 500
285338
self.lastSeq = nil
286339
self.startKeepalive()
@@ -371,11 +424,18 @@ public actor GatewayChannelActor {
371424
(includeDeviceIdentity && identity != nil)
372425
? DeviceAuthStore.loadToken(deviceId: identity!.deviceId, role: role)?.token
373426
: nil
374-
// If we're not sending a device identity, a device token can't be validated server-side.
375-
// In that mode we always use the shared gateway token/password.
376-
let authToken = includeDeviceIdentity ? (storedToken ?? self.token) : self.token
427+
let shouldUseDeviceRetryToken =
428+
includeDeviceIdentity && self.pendingDeviceTokenRetry &&
429+
storedToken != nil && self.token != nil && self.isTrustedDeviceRetryEndpoint()
430+
if shouldUseDeviceRetryToken {
431+
self.pendingDeviceTokenRetry = false
432+
}
433+
// Keep shared credentials explicit when provided. Device token retry is attached
434+
// only on a bounded second attempt after token mismatch.
435+
let authToken = self.token ?? (includeDeviceIdentity ? storedToken : nil)
436+
let authDeviceToken = shouldUseDeviceRetryToken ? storedToken : nil
377437
let authSource: GatewayAuthSource
378-
if storedToken != nil {
438+
if authDeviceToken != nil || (self.token == nil && storedToken != nil) {
379439
authSource = .deviceToken
380440
} else if authToken != nil {
381441
authSource = .sharedToken
@@ -386,9 +446,12 @@ public actor GatewayChannelActor {
386446
}
387447
self.lastAuthSource = authSource
388448
self.logger.info("gateway connect auth=\(authSource.rawValue, privacy: .public)")
389-
let canFallbackToShared = includeDeviceIdentity && storedToken != nil && self.token != nil
390449
if let authToken {
391-
params["auth"] = ProtoAnyCodable(["token": ProtoAnyCodable(authToken)])
450+
var auth: [String: ProtoAnyCodable] = ["token": ProtoAnyCodable(authToken)]
451+
if let authDeviceToken {
452+
auth["deviceToken"] = ProtoAnyCodable(authDeviceToken)
453+
}
454+
params["auth"] = ProtoAnyCodable(auth)
392455
} else if let password = self.password {
393456
params["auth"] = ProtoAnyCodable(["password": ProtoAnyCodable(password)])
394457
}
@@ -426,11 +489,24 @@ public actor GatewayChannelActor {
426489
do {
427490
let response = try await self.waitForConnectResponse(reqId: reqId)
428491
try await self.handleConnectResponse(response, identity: identity, role: role)
492+
self.pendingDeviceTokenRetry = false
493+
self.deviceTokenRetryBudgetUsed = false
429494
} catch {
430-
if canFallbackToShared {
431-
if let identity {
432-
DeviceAuthStore.clearToken(deviceId: identity.deviceId, role: role)
433-
}
495+
let shouldRetryWithDeviceToken = self.shouldRetryWithStoredDeviceToken(
496+
error: error,
497+
explicitGatewayToken: self.token,
498+
storedToken: storedToken,
499+
attemptedDeviceTokenRetry: authDeviceToken != nil)
500+
if shouldRetryWithDeviceToken {
501+
self.pendingDeviceTokenRetry = true
502+
self.deviceTokenRetryBudgetUsed = true
503+
self.backoffMs = min(self.backoffMs, 250)
504+
} else if authDeviceToken != nil,
505+
let identity,
506+
self.shouldClearStoredDeviceTokenAfterRetry(error)
507+
{
508+
// Retry failed with an explicit device-token mismatch; clear stale local token.
509+
DeviceAuthStore.clearToken(deviceId: identity.deviceId, role: role)
434510
}
435511
throw error
436512
}
@@ -443,7 +519,13 @@ public actor GatewayChannelActor {
443519
) async throws {
444520
if res.ok == false {
445521
let msg = (res.error?["message"]?.value as? String) ?? "gateway connect failed"
446-
throw NSError(domain: "Gateway", code: 1008, userInfo: [NSLocalizedDescriptionKey: msg])
522+
let details = res.error?["details"]?.value as? [String: ProtoAnyCodable]
523+
let detailCode = details?["code"]?.value as? String
524+
let canRetryWithDeviceToken = details?["canRetryWithDeviceToken"]?.value as? Bool ?? false
525+
throw GatewayConnectAuthError(
526+
message: msg,
527+
detailCode: detailCode,
528+
canRetryWithDeviceToken: canRetryWithDeviceToken)
447529
}
448530
guard let payload = res.payload else {
449531
throw NSError(
@@ -616,19 +698,91 @@ public actor GatewayChannelActor {
616698

617699
/// Waits for the current backoff delay and then attempts a single reconnect.
///
/// Does nothing when reconnecting is disabled or paused for an auth failure.
/// On an ordinary failure it logs and calls itself again; on a failure that
/// `shouldPauseReconnectAfterAuthFailure` flags, it sets the pause flag and stops.
private func scheduleReconnect() async {
618700
guard self.shouldReconnect else { return }
701+
guard !self.reconnectPausedForAuthFailure else { return }
619702
// `backoffMs` is in milliseconds: convert to seconds here, to nanoseconds below.
let delay = self.backoffMs / 1000
620703
// Double the backoff for the next attempt, capped at 30000 ms.
self.backoffMs = min(self.backoffMs * 2, 30000)
621704
guard await self.sleepUnlessCancelled(nanoseconds: UInt64(delay * 1_000_000_000)) else { return }
622705
// Re-check both flags: they may have changed while this call was suspended in the sleep.
guard self.shouldReconnect else { return }
706+
guard !self.reconnectPausedForAuthFailure else { return }
623707
do {
624708
try await self.connect()
625709
} catch {
710+
if self.shouldPauseReconnectAfterAuthFailure(error) {
711+
self.reconnectPausedForAuthFailure = true
712+
self.logger.error(
713+
"gateway reconnect paused for non-recoverable auth failure " +
714+
"\(error.localizedDescription, privacy: .public)"
715+
)
716+
// Stop here: no further attempt is scheduled from this path.
return
717+
}
626718
let wrapped = self.wrap(error, context: "gateway reconnect")
627719
self.logger.error("gateway reconnect failed \(wrapped.localizedDescription, privacy: .public)")
628720
// Ordinary failure: try again using the already-doubled backoff.
await self.scheduleReconnect()
629721
}
630722
}
631723

724+
/// Decides whether a failed connect attempt may be followed by one retry that
/// also sends the stored device token.
///
/// - Parameters:
///   - error: The error thrown by the failed connect attempt.
///   - explicitGatewayToken: The shared gateway token in use, if any.
///   - storedToken: The stored device token, if any.
///   - attemptedDeviceTokenRetry: Whether the failed attempt already carried the device token.
/// - Returns: `true` only when the retry budget is unused, the failed attempt was not
///   itself the retry, both tokens exist, the endpoint is trusted, and the error is a
///   `GatewayConnectAuthError` that either sets `canRetryWithDeviceToken` or has the
///   `AUTH_TOKEN_MISMATCH` detail code.
private func shouldRetryWithStoredDeviceToken(
725+
error: Error,
726+
explicitGatewayToken: String?,
727+
storedToken: String?,
728+
attemptedDeviceTokenRetry: Bool
729+
) -> Bool {
730+
// The retry budget has already been spent.
if self.deviceTokenRetryBudgetUsed {
731+
return false
732+
}
733+
// The failed attempt was already the device-token retry.
if attemptedDeviceTokenRetry {
734+
return false
735+
}
736+
// The retry needs both a shared token and a stored device token.
guard explicitGatewayToken != nil, storedToken != nil else {
737+
return false
738+
}
739+
guard self.isTrustedDeviceRetryEndpoint() else {
740+
return false
741+
}
742+
// Only structured gateway auth failures carry the hints checked below.
guard let authError = error as? GatewayConnectAuthError else {
743+
return false
744+
}
745+
return authError.canRetryWithDeviceToken ||
746+
authError.detailCode == GatewayConnectErrorCodes.authTokenMismatch
747+
}
748+
749+
/// Decides whether automatic reconnect attempts should be paused after `error`.
///
/// - Parameter error: The error thrown by a failed connect attempt.
/// - Returns: `true` for a `GatewayConnectAuthError` that is non-recoverable, or that
///   reports `AUTH_TOKEN_MISMATCH` after the device-token retry budget has been used
///   and no retry is pending. Any other error returns `false`.
private func shouldPauseReconnectAfterAuthFailure(_ error: Error) -> Bool {
750+
guard let authError = error as? GatewayConnectAuthError else {
751+
return false
752+
}
753+
if authError.isNonRecoverable {
754+
return true
755+
}
756+
// Token mismatch pauses only once the single retry is spent and none is queued.
if authError.detailCode == GatewayConnectErrorCodes.authTokenMismatch &&
757+
self.deviceTokenRetryBudgetUsed && !self.pendingDeviceTokenRetry
758+
{
759+
return true
760+
}
761+
return false
762+
}
763+
764+
/// Reports whether `error` is a gateway auth failure whose detail code is
/// `AUTH_DEVICE_TOKEN_MISMATCH`.
///
/// - Parameter error: The error thrown by a failed connect attempt.
/// - Returns: `true` only for a `GatewayConnectAuthError` with that exact detail code.
private func shouldClearStoredDeviceTokenAfterRetry(_ error: Error) -> Bool {
765+
guard let authError = error as? GatewayConnectAuthError else {
766+
return false
767+
}
768+
return authError.detailCode == GatewayConnectErrorCodes.authDeviceTokenMismatch
769+
}
770+
771+
/// Reports whether the gateway URL points at an endpoint that is trusted for the
/// bounded device-token retry.
///
/// This client currently treats loopback as the only trusted retry target.
/// Unlike the Node gateway client, it does not yet expose a pinned TLS-fingerprint
/// trust path for remote retry, so remote fallback remains disabled by default.
///
/// - Returns: `true` only when the URL host is `localhost`, the IPv6 loopback `::1`,
///   or a literal dotted-quad IPv4 address inside `127.0.0.0/8`.
private func isTrustedDeviceRetryEndpoint() -> Bool {
    guard let host = self.url.host?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased(),
          !host.isEmpty
    else {
        return false
    }
    if host == "localhost" || host == "::1" {
        return true
    }
    // Require a complete dotted quad. A bare "127." prefix test would also accept
    // DNS names such as "127.example.com", which can resolve to a remote machine
    // and would leak the cached device token to an untrusted endpoint.
    let octets = host.split(separator: ".", omittingEmptySubsequences: false)
    guard octets.count == 4, octets[0] == "127" else {
        return false
    }
    return octets.allSatisfy { octet in
        // ASCII digits only (rejects "+1", "0x7f", Unicode digits), value 0...255.
        !octet.isEmpty
            && octet.count <= 3
            && octet.allSatisfy { ("0"..."9").contains($0) }
            && UInt8(octet) != nil
    }
}
785+
632786
private nonisolated func sleepUnlessCancelled(nanoseconds: UInt64) async -> Bool {
633787
do {
634788
try await Task.sleep(nanoseconds: nanoseconds)
@@ -756,7 +910,8 @@ public actor GatewayChannelActor {
756910
return (id: id, data: data)
757911
} catch {
758912
self.logger.error(
759-
"gateway \(kind) encode failed \(method, privacy: .public) error=\(error.localizedDescription, privacy: .public)")
913+
"gateway \(kind) encode failed \(method, privacy: .public) " +
914+
"error=\(error.localizedDescription, privacy: .public)")
760915
throw error
761916
}
762917
}

docs/cli/devices.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,40 @@ Pass `--token` or `--password` explicitly. Missing explicit credentials is an er
9292
- These commands require `operator.pairing` (or `operator.admin`) scope.
9393
- `devices clear` is intentionally gated by `--yes`.
9494
- If pairing scope is unavailable on local loopback (and no explicit `--url` is passed), list/approve can use a local pairing fallback.
95+
96+
## Token drift recovery checklist
97+
98+
Use this when Control UI or other clients keep failing with `AUTH_TOKEN_MISMATCH` or `AUTH_DEVICE_TOKEN_MISMATCH`.
99+
100+
1. Confirm current gateway token source:
101+
102+
```bash
103+
openclaw config get gateway.auth.token
104+
```
105+
106+
2. List paired devices and identify the affected device id:
107+
108+
```bash
109+
openclaw devices list
110+
```
111+
112+
3. Rotate operator token for the affected device:
113+
114+
```bash
115+
openclaw devices rotate --device <deviceId> --role operator
116+
```
117+
118+
4. If rotation is not enough, remove stale pairing and approve again:
119+
120+
```bash
121+
openclaw devices remove <deviceId>
122+
openclaw devices list
123+
openclaw devices approve <requestId>
124+
```
125+
126+
5. Retry client connection with the current shared token/password.
127+
128+
Related:
129+
130+
- [Dashboard auth troubleshooting](/web/dashboard#if-you-see-unauthorized-1008)
131+
- [Gateway troubleshooting](/gateway/troubleshooting#dashboard-control-ui-connectivity)

docs/gateway/protocol.md

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,12 @@ The Gateway treats these as **claims** and enforces server-side allowlists.
206206
persisted by the client for future connects.
207207
- Device tokens can be rotated/revoked via `device.token.rotate` and
208208
`device.token.revoke` (requires `operator.pairing` scope).
209+
- Auth failures include `error.details.code` plus recovery hints:
210+
- `error.details.canRetryWithDeviceToken` (boolean)
211+
- `error.details.recommendedNextStep` (`retry_with_device_token`, `update_auth_configuration`, `update_auth_credentials`, `wait_then_retry`, `review_auth_configuration`)
212+
- Client behavior for `AUTH_TOKEN_MISMATCH`:
213+
- Trusted clients may attempt one bounded retry with a cached per-device token.
214+
- If that retry fails, clients should stop automatic reconnect loops and surface operator action guidance.
209215

210216
## Device identity + pairing
211217

@@ -217,8 +223,9 @@ The Gateway treats these as **claims** and enforces server-side allowlists.
217223
- **Local** connects include loopback and the gateway host’s own tailnet address
218224
(so same‑host tailnet binds can still auto‑approve).
219225
- All WS clients must include `device` identity during `connect` (operator + node).
220-
Control UI can omit it **only** when `gateway.controlUi.dangerouslyDisableDeviceAuth`
221-
is enabled for break-glass use.
226+
Control UI can omit it only in these modes:
227+
- `gateway.controlUi.allowInsecureAuth=true` for localhost-only insecure HTTP compatibility.
228+
- `gateway.controlUi.dangerouslyDisableDeviceAuth=true` (break-glass, severe security downgrade).
222229
- All connections must sign the server-provided `connect.challenge` nonce.
223230

224231
### Device auth migration diagnostics

docs/gateway/security/index.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -262,9 +262,14 @@ High-signal `checkId` values you will most likely see in real deployments (not e
262262
## Control UI over HTTP
263263

264264
The Control UI needs a **secure context** (HTTPS or localhost) to generate device
265-
identity. `gateway.controlUi.allowInsecureAuth` does **not** bypass secure-context,
266-
device-identity, or device-pairing checks. Prefer HTTPS (Tailscale Serve) or open
267-
the UI on `127.0.0.1`.
265+
identity. `gateway.controlUi.allowInsecureAuth` is a local compatibility toggle:
266+
267+
- On localhost, it allows Control UI auth without device identity when the page
268+
is loaded over non-secure HTTP.
269+
- It does not bypass pairing checks.
270+
- It does not relax remote (non-localhost) device identity requirements.
271+
272+
Prefer HTTPS (Tailscale Serve) or open the UI on `127.0.0.1`.
268273

269274
For break-glass scenarios only, `gateway.controlUi.dangerouslyDisableDeviceAuth`
270275
disables device identity checks entirely. This is a severe security downgrade;

0 commit comments

Comments
 (0)