Skip to content

Commit 090c15b

Browse files
committed
fix(gateway): address lifecycle review findings
1 parent 9305b66 commit 090c15b

File tree

8 files changed

+208
-20
lines changed

8 files changed

+208
-20
lines changed

src/cli/gateway-cli/run.option-collisions.test.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,10 @@ describe("gateway run option collisions", () => {
143143
]);
144144

145145
expect(forceFreePortAndWait).toHaveBeenCalledWith(18789, expect.anything());
146-
expect(waitForPortBindable).toHaveBeenCalledWith(18789, expect.anything());
146+
expect(waitForPortBindable).toHaveBeenCalledWith(
147+
18789,
148+
expect.objectContaining({ host: "127.0.0.1" }),
149+
);
147150
expect(setGatewayWsLogStyle).toHaveBeenCalledWith("full");
148151
expect(startGatewayServer).toHaveBeenCalledWith(
149152
18789,

src/cli/gateway-cli/run.ts

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,20 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
186186
defaultRuntime.error("Invalid port");
187187
defaultRuntime.exit(1);
188188
}
189+
const bindRaw = toOptionString(opts.bind) ?? cfg.gateway?.bind ?? "loopback";
190+
const bind =
191+
bindRaw === "loopback" ||
192+
bindRaw === "lan" ||
193+
bindRaw === "auto" ||
194+
bindRaw === "custom" ||
195+
bindRaw === "tailnet"
196+
? bindRaw
197+
: null;
198+
if (!bind) {
199+
defaultRuntime.error('Invalid --bind (use "loopback", "lan", "tailnet", "auto", or "custom")');
200+
defaultRuntime.exit(1);
201+
return;
202+
}
189203
if (opts.force) {
190204
try {
191205
const { killed, waitedMs, escalatedToSigkill } = await forceFreePortAndWait(port, {
@@ -209,9 +223,18 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
209223
}
210224
}
211225
// After killing, verify the port is actually bindable (handles TIME_WAIT).
226+
const bindProbeHost =
227+
bind === "loopback"
228+
? "127.0.0.1"
229+
: bind === "lan"
230+
? "0.0.0.0"
231+
: bind === "custom"
232+
? toOptionString(cfg.gateway?.customBindHost)
233+
: undefined;
212234
const bindWaitMs = await waitForPortBindable(port, {
213235
timeoutMs: 3000,
214236
intervalMs: 150,
237+
host: bindProbeHost,
215238
});
216239
if (bindWaitMs > 0) {
217240
gatewayLog.info(`force: waited ${bindWaitMs}ms for port ${port} to become bindable`);
@@ -265,21 +288,6 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
265288
defaultRuntime.exit(1);
266289
return;
267290
}
268-
const bindRaw = toOptionString(opts.bind) ?? cfg.gateway?.bind ?? "loopback";
269-
const bind =
270-
bindRaw === "loopback" ||
271-
bindRaw === "lan" ||
272-
bindRaw === "auto" ||
273-
bindRaw === "custom" ||
274-
bindRaw === "tailnet"
275-
? bindRaw
276-
: null;
277-
if (!bind) {
278-
defaultRuntime.error('Invalid --bind (use "loopback", "lan", "tailnet", "auto", or "custom")');
279-
defaultRuntime.exit(1);
280-
return;
281-
}
282-
283291
const miskeys = extractGatewayMiskeys(snapshot?.parsed);
284292
const authOverride =
285293
authMode || passwordRaw || tokenRaw || authModeRaw

src/cli/ports.test.ts

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,33 @@ describe("probePortFree", () => {
8383
});
8484

8585
describe("waitForPortBindable", () => {
86+
it("probes the provided host when waiting for bindability", async () => {
87+
const listenCalls: Array<{ port: number; host: string }> = [];
88+
const fakeServer = new EventEmitter() as unknown as net.Server;
89+
(fakeServer as unknown as { close: (cb?: () => void) => net.Server }).close = (
90+
cb?: () => void,
91+
) => {
92+
cb?.();
93+
return fakeServer;
94+
};
95+
(fakeServer as unknown as { unref: () => net.Server }).unref = () => fakeServer;
96+
(fakeServer as unknown as { listen: (...args: unknown[]) => net.Server }).listen = (
97+
...args: unknown[]
98+
) => {
99+
const [port, host] = args as [number, string];
100+
listenCalls.push({ port, host });
101+
const callback = args.find((a) => typeof a === "function") as (() => void) | undefined;
102+
setImmediate(() => callback?.());
103+
return fakeServer;
104+
};
105+
mockCreateServer.mockReturnValue(fakeServer);
106+
107+
await expect(
108+
waitForPortBindable(9999, { timeoutMs: 100, intervalMs: 10, host: "127.0.0.1" }),
109+
).resolves.toBe(0);
110+
expect(listenCalls[0]).toEqual({ port: 9999, host: "127.0.0.1" });
111+
});
112+
86113
it("propagates EACCES rejection immediately without retrying", async () => {
87114
// Every call to createServer will emit EACCES — so if waitForPortBindable retried,
88115
// mockCreateServer would be called many times. We assert it's called exactly once.

src/cli/ports.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -366,20 +366,21 @@ export function probePortFree(port: number, host = "0.0.0.0"): Promise<boolean>
366366
*/
367367
export async function waitForPortBindable(
368368
port: number,
369-
opts: { timeoutMs?: number; intervalMs?: number } = {},
369+
opts: { timeoutMs?: number; intervalMs?: number; host?: string } = {},
370370
): Promise<number> {
371371
const timeoutMs = Math.max(opts.timeoutMs ?? 3000, 0);
372372
const intervalMs = Math.max(opts.intervalMs ?? 150, 1);
373+
const host = opts.host;
373374
let waited = 0;
374375
while (waited < timeoutMs) {
375-
if (await probePortFree(port)) {
376+
if (await probePortFree(port, host)) {
376377
return waited;
377378
}
378379
await sleep(intervalMs);
379380
waited += intervalMs;
380381
}
381382
// Final attempt
382-
if (await probePortFree(port)) {
383+
if (await probePortFree(port, host)) {
383384
return waited;
384385
}
385386
throw new Error(`port ${port} still not bindable after ${waited}ms (TIME_WAIT or kernel hold)`);
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
import { EventEmitter } from "node:events";
2+
import type { Server as HttpServer } from "node:http";
3+
import { describe, expect, it, vi } from "vitest";
4+
import { GatewayLockError } from "../../infra/gateway-lock.js";
5+
import { listenGatewayHttpServer } from "./http-listen.js";
6+
7+
const sleepMock = vi.hoisted(() => vi.fn(async (_ms: number) => {}));
8+
9+
vi.mock("../../utils.js", () => ({
10+
sleep: (ms: number) => sleepMock(ms),
11+
}));
12+
13+
type ListenOutcome = { kind: "error"; code: string } | { kind: "listening" };
14+
15+
function createFakeHttpServer(outcomes: ListenOutcome[]) {
16+
class FakeHttpServer extends EventEmitter {
17+
public closeCalls = 0;
18+
private attempt = 0;
19+
20+
listen(_port: number, _host: string) {
21+
const outcome = outcomes[this.attempt] ?? { kind: "listening" };
22+
this.attempt += 1;
23+
setImmediate(() => {
24+
if (outcome.kind === "error") {
25+
const err = Object.assign(new Error(outcome.code), { code: outcome.code });
26+
this.emit("error", err);
27+
} else {
28+
this.emit("listening");
29+
}
30+
});
31+
return this;
32+
}
33+
34+
close(cb?: () => void) {
35+
this.closeCalls += 1;
36+
setImmediate(() => cb?.());
37+
return this;
38+
}
39+
}
40+
41+
return new FakeHttpServer();
42+
}
43+
44+
describe("listenGatewayHttpServer", () => {
45+
it("retries EADDRINUSE and closes server handle before retry", async () => {
46+
sleepMock.mockClear();
47+
const fake = createFakeHttpServer([
48+
{ kind: "error", code: "EADDRINUSE" },
49+
{ kind: "listening" },
50+
]);
51+
52+
await expect(
53+
listenGatewayHttpServer({
54+
httpServer: fake as unknown as HttpServer,
55+
bindHost: "127.0.0.1",
56+
port: 18789,
57+
}),
58+
).resolves.toBeUndefined();
59+
60+
expect(fake.closeCalls).toBe(1);
61+
expect(sleepMock).toHaveBeenCalledTimes(1);
62+
});
63+
64+
it("throws GatewayLockError after EADDRINUSE retries are exhausted", async () => {
65+
sleepMock.mockClear();
66+
const fake = createFakeHttpServer([
67+
{ kind: "error", code: "EADDRINUSE" },
68+
{ kind: "error", code: "EADDRINUSE" },
69+
{ kind: "error", code: "EADDRINUSE" },
70+
{ kind: "error", code: "EADDRINUSE" },
71+
{ kind: "error", code: "EADDRINUSE" },
72+
{ kind: "error", code: "EADDRINUSE" },
73+
]);
74+
75+
await expect(
76+
listenGatewayHttpServer({
77+
httpServer: fake as unknown as HttpServer,
78+
bindHost: "127.0.0.1",
79+
port: 18789,
80+
}),
81+
).rejects.toBeInstanceOf(GatewayLockError);
82+
83+
expect(fake.closeCalls).toBe(4);
84+
});
85+
86+
it("wraps non-EADDRINUSE errors as GatewayLockError", async () => {
87+
sleepMock.mockClear();
88+
const fake = createFakeHttpServer([{ kind: "error", code: "EACCES" }]);
89+
90+
await expect(
91+
listenGatewayHttpServer({
92+
httpServer: fake as unknown as HttpServer,
93+
bindHost: "127.0.0.1",
94+
port: 18789,
95+
}),
96+
).rejects.toBeInstanceOf(GatewayLockError);
97+
98+
expect(fake.closeCalls).toBe(0);
99+
});
100+
});

src/gateway/server/http-listen.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,16 @@ import { sleep } from "../../utils.js";
55
const EADDRINUSE_MAX_RETRIES = 4;
66
const EADDRINUSE_RETRY_INTERVAL_MS = 500;
77

8+
async function closeServerQuietly(httpServer: HttpServer): Promise<void> {
9+
await new Promise<void>((resolve) => {
10+
try {
11+
httpServer.close(() => resolve());
12+
} catch {
13+
resolve();
14+
}
15+
});
16+
}
17+
818
export async function listenGatewayHttpServer(params: {
919
httpServer: HttpServer;
1020
bindHost: string;
@@ -32,6 +42,7 @@ export async function listenGatewayHttpServer(params: {
3242
const code = (err as NodeJS.ErrnoException).code;
3343
if (code === "EADDRINUSE" && attempt < EADDRINUSE_MAX_RETRIES) {
3444
// Port may still be in TIME_WAIT after a recent process exit; retry.
45+
await closeServerQuietly(httpServer);
3546
await sleep(EADDRINUSE_RETRY_INTERVAL_MS);
3647
continue;
3748
}

src/infra/restart-stale-pids.test.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ const isWindows = process.platform === "win32";
88

99
const mockSpawnSync = vi.hoisted(() => vi.fn());
1010
const mockResolveGatewayPort = vi.hoisted(() => vi.fn(() => 18789));
11+
const mockRestartWarn = vi.hoisted(() => vi.fn());
1112

1213
vi.mock("node:child_process", () => ({
1314
spawnSync: (...args: unknown[]) => mockSpawnSync(...args),
@@ -22,6 +23,14 @@ vi.mock("./ports-lsof.js", () => ({
2223
resolveLsofCommandSync: vi.fn(() => "lsof"),
2324
}));
2425

26+
vi.mock("../logging/subsystem.js", () => ({
27+
createSubsystemLogger: vi.fn(() => ({
28+
warn: (...args: unknown[]) => mockRestartWarn(...args),
29+
info: vi.fn(),
30+
error: vi.fn(),
31+
})),
32+
}));
33+
2534
import { resolveLsofCommandSync } from "./ports-lsof.js";
2635
import {
2736
__testing,
@@ -37,6 +46,7 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
3746
beforeEach(() => {
3847
mockSpawnSync.mockReset();
3948
mockResolveGatewayPort.mockReset();
49+
mockRestartWarn.mockReset();
4050
mockResolveGatewayPort.mockReturnValue(18789);
4151
__testing.setSleepSyncOverride(() => {});
4252
});
@@ -56,6 +66,14 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
5666
expect(findGatewayPidsOnPortSync(18789)).toEqual([]);
5767
});
5868

69+
it("logs warning when initial lsof scan exits with status > 1", () => {
70+
mockSpawnSync.mockReturnValue({ error: null, status: 2, stdout: "", stderr: "lsof error" });
71+
expect(findGatewayPidsOnPortSync(18789)).toEqual([]);
72+
expect(mockRestartWarn).toHaveBeenCalledWith(
73+
expect.stringContaining("lsof exited with status 2"),
74+
);
75+
});
76+
5977
it("returns [] when lsof returns an error object (e.g. ENOENT)", () => {
6078
mockSpawnSync.mockReturnValue({
6179
error: new Error("ENOENT"),
@@ -64,6 +82,9 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
6482
stderr: "",
6583
});
6684
expect(findGatewayPidsOnPortSync(18789)).toEqual([]);
85+
expect(mockRestartWarn).toHaveBeenCalledWith(
86+
expect.stringContaining("lsof failed during initial stale-pid scan"),
87+
);
6788
});
6889

6990
it("parses openclaw-gateway pids and excludes the current process", () => {

src/infra/restart-stale-pids.ts

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,24 @@ export function findGatewayPidsOnPortSync(
9494
encoding: "utf8",
9595
timeout: spawnTimeoutMs,
9696
});
97-
if (res.error || res.status !== 0) {
97+
if (res.error) {
98+
const code = (res.error as NodeJS.ErrnoException).code;
99+
const detail =
100+
code && code.trim().length > 0
101+
? code
102+
: res.error instanceof Error
103+
? res.error.message
104+
: "unknown error";
105+
restartLog.warn(`lsof failed during initial stale-pid scan for port ${port}: ${detail}`);
106+
return [];
107+
}
108+
if (res.status === 1) {
109+
return [];
110+
}
111+
if (res.status !== 0) {
112+
restartLog.warn(
113+
`lsof exited with status ${res.status} during initial stale-pid scan for port ${port}; skipping stale pid check`,
114+
);
98115
return [];
99116
}
100117
return parsePidsFromLsofOutput(res.stdout);

0 commit comments

Comments
 (0)