Skip to content

Commit 63c6080

Browse files
SidQin-cyber authored and steipete committed
fix: clean stale gateway PIDs before triggerOpenClawRestart calls launchctl/systemctl
When the /restart command runs inside an embedded agent process (no SIGUSR1 listener), it falls through to triggerOpenClawRestart(), which calls `launchctl kickstart -k` directly, bypassing the pre-restart port cleanup added in #27013. If the gateway was started via TUI/CLI, the orphaned process still holds the port and the new launchd instance crash-loops.

Add synchronous stale-PID detection (lsof) and termination (SIGTERM→SIGKILL) inside triggerOpenClawRestart() itself, so every caller — including the embedded agent /restart path — gets port cleanup before the service manager restart command fires.

Closes #26736

Made-with: Cursor
1 parent 792ce7b commit 63c6080

File tree

2 files changed

+117
-0
lines changed

2 files changed

+117
-0
lines changed

src/infra/restart.test.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import { describe, expect, it } from "vitest";
2+
import { findGatewayPidsOnPortSync } from "./restart.js";
3+
4+
describe("findGatewayPidsOnPortSync", () => {
  // NOTE(review): assumes nothing on the test machine listens on port 19999.
  it("returns an empty array for a port with no listeners", () => {
    expect(findGatewayPidsOnPortSync(19999)).toEqual([]);
  });

  // Whatever is (or is not) listening on 18789, the current process must
  // never be reported.
  it("never includes the current process PID", () => {
    expect(findGatewayPidsOnPortSync(18789)).not.toContain(process.pid);
  });

  // Port 0 is used as a degenerate input; the result must still be an array.
  it("returns an array (not undefined or null) on any port", () => {
    const result = findGatewayPidsOnPortSync(0);
    expect(Array.isArray(result)).toBe(true);
  });
});

src/infra/restart.ts

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import { spawnSync } from "node:child_process";
2+
import { resolveGatewayPort } from "../config/paths.js";
23
import {
34
resolveGatewayLaunchAgentLabel,
45
resolveGatewaySystemdServiceName,
56
} from "../daemon/constants.js";
67
import { createSubsystemLogger } from "../logging/subsystem.js";
8+
import { resolveLsofCommandSync } from "./ports-lsof.js";
79

810
export type RestartAttempt = {
911
ok: boolean;
@@ -283,10 +285,106 @@ function normalizeSystemdUnit(raw?: string, profile?: string): string {
283285
return unit.endsWith(".service") ? unit : `${unit}.service`;
284286
}
285287

288+
/**
 * Synchronously list the PIDs of gateway processes listening on a TCP port.
 *
 * Runs `lsof -nP -iTCP:<port> -sTCP:LISTEN -Fpc` and reads its field output:
 * a `p<pid>` line opens a process record and a following `c<command>` line
 * names it. A PID is reported only when its command name contains "openclaw"
 * (case-insensitive) and it is not the current process.
 *
 * @param port TCP port to inspect.
 * @returns Matching PIDs in lsof output order; an empty array on win32, or
 *   when lsof fails to run or exits with a non-zero status.
 */
export function findGatewayPidsOnPortSync(port: number): number[] {
  if (process.platform === "win32") {
    return [];
  }

  const lsofBinary = resolveLsofCommandSync();
  const lsofArgs = ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"];
  const result = spawnSync(lsofBinary, lsofArgs, {
    encoding: "utf8",
    timeout: SPAWN_TIMEOUT_MS,
  });
  if (result.error || result.status !== 0) {
    return [];
  }

  // One record per "p" line; the command is filled in by a later "c" line.
  const listeners: Array<{ pid: number | undefined; command: string | undefined }> = [];
  for (const field of result.stdout.split(/\r?\n/)) {
    if (field.length === 0) {
      continue;
    }
    const value = field.slice(1);
    switch (field[0]) {
      case "p": {
        const pid = Number.parseInt(value, 10);
        listeners.push({
          pid: Number.isFinite(pid) && pid > 0 ? pid : undefined,
          command: undefined,
        });
        break;
      }
      case "c": {
        // A command line that precedes any PID line has no record to attach to.
        const latest = listeners[listeners.length - 1];
        if (latest !== undefined) {
          latest.command = value;
        }
        break;
      }
      default:
        break;
    }
  }

  const matches: number[] = [];
  for (const { pid, command } of listeners) {
    if (pid === undefined || pid === process.pid) {
      continue;
    }
    if (command !== undefined && command.toLowerCase().includes("openclaw")) {
      matches.push(pid);
    }
  }
  return matches;
}
324+
325+
/** Upper bound, in milliseconds, to wait for processes to exit after SIGTERM. */
const STALE_SIGTERM_WAIT_MS = 300;
/** Upper bound, in milliseconds, to wait for processes to exit after SIGKILL. */
const STALE_SIGKILL_WAIT_MS = 200;

/**
 * Synchronously terminate stale gateway processes.
 *
 * Sends SIGTERM to every target, waits up to STALE_SIGTERM_WAIT_MS for them to
 * exit, then sends SIGKILL to any survivors and waits up to
 * STALE_SIGKILL_WAIT_MS. Both waits end early once every target is gone.
 *
 * The waits block the calling thread with `Atomics.wait` instead of spawning an
 * external `sleep` binary, so they do not depend on PATH or on `sleep`
 * accepting fractional seconds.
 *
 * @param pids Candidate process IDs. Duplicates, non-positive or non-integer
 *   values, and the current process's own PID are ignored; PID 0 and negative
 *   PIDs would otherwise address whole process groups.
 * @returns The PIDs to which SIGTERM was successfully delivered.
 */
function terminateStaleProcessesSync(pids: number[]): number[] {
  const POLL_INTERVAL_MS = 25;

  // Block the current thread for `ms` milliseconds without a child process.
  const sleepSync = (ms: number): void => {
    Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
  };

  // Signal 0 performs the existence/permission check without sending anything.
  // Any failure means the process is gone or cannot be signalled by us.
  const isAlive = (pid: number): boolean => {
    try {
      process.kill(pid, 0);
      return true;
    } catch {
      return false;
    }
  };

  // Poll until every target has exited or `maxWaitMs` elapsed; returns survivors.
  const waitForExit = (targets: number[], maxWaitMs: number): number[] => {
    const deadline = Date.now() + maxWaitMs;
    let remaining = targets.filter(isAlive);
    while (remaining.length > 0 && Date.now() < deadline) {
      sleepSync(Math.min(POLL_INTERVAL_MS, Math.max(1, deadline - Date.now())));
      remaining = remaining.filter(isAlive);
    }
    return remaining;
  };

  const targets = [...new Set(pids)].filter(
    (pid) => Number.isInteger(pid) && pid > 0 && pid !== process.pid,
  );

  const signaled: number[] = [];
  for (const pid of targets) {
    try {
      process.kill(pid, "SIGTERM");
      signaled.push(pid);
    } catch {
      // Already gone (ESRCH) or not ours to signal (EPERM): nothing to wait for.
    }
  }
  if (signaled.length === 0) {
    return signaled;
  }

  const survivors = waitForExit(signaled, STALE_SIGTERM_WAIT_MS);
  let forced = 0;
  for (const pid of survivors) {
    try {
      process.kill(pid, "SIGKILL");
      forced += 1;
    } catch {
      // Exited between the liveness check and the SIGKILL.
    }
  }
  if (forced > 0) {
    waitForExit(survivors, STALE_SIGKILL_WAIT_MS);
  }
  return signaled;
}
360+
361+
/**
 * Inspect the gateway port and kill any stale gateway processes holding it.
 * Called before service restart commands to prevent port conflicts.
 *
 * Best-effort: any failure while resolving the port, listing listeners, or
 * signalling them is logged as a warning and reported as "nothing killed"
 * rather than propagated, so it cannot block the restart that follows.
 *
 * @returns The PIDs reported by terminateStaleProcessesSync, or an empty array
 *   when no stale gateway process was found or the cleanup failed.
 */
function cleanStaleGatewayProcessesSync(): number[] {
  try {
    const port = resolveGatewayPort(undefined, process.env);
    const stalePids = findGatewayPidsOnPortSync(port);
    if (stalePids.length === 0) {
      return [];
    }
    restartLog.warn(
      `killing ${stalePids.length} stale gateway process(es) before restart: ${stalePids.join(", ")}`,
    );
    return terminateStaleProcessesSync(stalePids);
  } catch (err: unknown) {
    // Keep the cleanup best-effort, but leave a trace of why it was skipped.
    const reason = err instanceof Error ? err.message : String(err);
    restartLog.warn(`stale gateway process cleanup failed: ${reason}`);
    return [];
  }
}
380+
286381
export function triggerOpenClawRestart(): RestartAttempt {
287382
if (process.env.VITEST || process.env.NODE_ENV === "test") {
288383
return { ok: true, method: "supervisor", detail: "test mode" };
289384
}
385+
386+
cleanStaleGatewayProcessesSync();
387+
290388
const tried: string[] = [];
291389
if (process.platform !== "darwin") {
292390
if (process.platform === "linux") {

0 commit comments

Comments
 (0)