Skip to content

Commit 8336545

Browse files
committed
fix(gateway): keep container restarts in-process
1 parent a3fd975 commit 8336545

5 files changed

Lines changed: 90 additions & 55 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ Docs: https://docs.openclaw.ai
6060
- Channels/Telegram: keep Bot API network fallbacks sticky after failed attempts and retry timed-out startup control calls once on the fallback route, so `deleteWebhook` IPv6 stalls no longer trigger slow multi-account retry storms. Fixes #73255. Thanks @ttomiczek and @sktbrd.
6161
- Gateway/models: merge explicit `models.providers.*.models` rows into the Gateway model catalog with normalized provider/model dedupe, and use normalized image-capability lookup so custom vision models keep native image attachments even when Pi discovery omits them or model ID casing differs. Fixes #64213 and #65165. Thanks @billonese and @202233a.
6262
- Gateway/reload: publish canonical post-write source config to in-process reloaders so simple config saves no longer create phantom plugin diffs or trigger unnecessary Gateway restarts. (#73267) Thanks @szsip239.
63+
- Gateway/Docker: keep config-triggered restarts in-process inside containers instead of spawning a detached child and exiting PID 1 cleanly, so Docker Swarm and other on-failure supervisors do not leave the service stuck at 0/1 replicas. Fixes #73178. Thanks @du-nguyen-IT007.
6364
- CLI/tasks: ship the task-registry control runtime in npm packages so `openclaw tasks cancel` can load ACP/subagent cancellation helpers from published builds. Fixes #68997. Thanks @1OAKDesign.
6465
- Channels/Telegram: preserve unsent generated media after partial reply streaming has already delivered the text, so `image_generate` outputs still reach Telegram as photos instead of being dropped from the final payload. Fixes #73253. Thanks @mlaihk.
6566
- Export/session: keep inline export HTML scripts and vendor libraries injected after template formatting so generated session exports open with the app code, markdown renderer, and syntax highlighter present. Fixes #41862 and #49957; carries forward #41861 and #68947. Thanks @briannewman, @martenzi, and @armanddp.

src/gateway/net.ts

Lines changed: 8 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1-
import fs from "node:fs";
21
import type { IncomingMessage } from "node:http";
32
import net from "node:net";
43
import type { GatewayBindMode } from "../config/types.gateway.js";
4+
import {
5+
__resetContainerEnvironmentCacheForTest,
6+
isContainerEnvironment,
7+
} from "../infra/container-environment.js";
58
import {
69
pickMatchingExternalInterfaceAddress,
710
readNetworkInterfaces,
@@ -228,60 +231,10 @@ export function isLocalGatewayAddress(ip: string | undefined): boolean {
228231
return false;
229232
}
230233

231-
/**
232-
* Detect whether the current process is running inside a container
233-
* (Docker, Podman, or Kubernetes).
234-
*
235-
* Uses two reliable heuristics:
236-
* 1. Presence of well-known container sentinel files such as `/.dockerenv`
237-
* (Docker) or `/run/.containerenv` (Podman).
238-
* 2. Presence of container-related cgroup entries in `/proc/1/cgroup`
239-
* (covers Docker, containerd, and Kubernetes pods).
240-
*
241-
* The result is cached after the first call so filesystem access
242-
* happens at most once per process lifetime.
243-
*/
244-
let _containerCacheResult: boolean | undefined;
245-
export function isContainerEnvironment(): boolean {
246-
if (_containerCacheResult !== undefined) {
247-
return _containerCacheResult;
248-
}
249-
_containerCacheResult = detectContainerEnvironment();
250-
return _containerCacheResult;
251-
}
252-
253-
function detectContainerEnvironment(): boolean {
254-
// 1. Check common Docker/Podman container sentinel files.
255-
for (const sentinelPath of ["/.dockerenv", "/run/.containerenv", "/var/run/.containerenv"]) {
256-
try {
257-
fs.accessSync(sentinelPath, fs.constants.F_OK);
258-
return true;
259-
} catch {
260-
// not present — continue
261-
}
262-
}
263-
// 2. /proc/1/cgroup contains docker, containerd, kubepods, or lxc markers.
264-
// Covers both cgroup v1 (/docker/<id>, /kubepods/...) and cgroup v2
265-
// (kubepods.slice, cri-containerd-<id>.scope) path formats.
266-
try {
267-
const cgroup = fs.readFileSync("/proc/1/cgroup", "utf8");
268-
if (
269-
/\/docker\/|cri-containerd-[0-9a-f]|containerd\/[0-9a-f]{64}|\/kubepods[/.]|\blxc\b/.test(
270-
cgroup,
271-
)
272-
) {
273-
return true;
274-
}
275-
} catch {
276-
// /proc may not exist (macOS, Windows) — not a container
277-
}
278-
return false;
279-
}
280-
281-
/** @internal — test-only helper to reset the cached container detection result. */
282-
export function __resetContainerCacheForTest(): void {
283-
_containerCacheResult = undefined;
284-
}
234+
export {
235+
isContainerEnvironment,
236+
__resetContainerEnvironmentCacheForTest as __resetContainerCacheForTest,
237+
};
285238

286239
/**
287240
* Resolves gateway bind host with fallback strategy.

src/infra/container-environment.ts

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import fs from "node:fs";
2+
3+
/**
4+
* Detect whether the current process is running inside a container
5+
* (Docker, Podman, containerd/Kubernetes, or LXC).
6+
*
7+
* Uses two reliable heuristics:
8+
* - Presence of common container sentinel files.
9+
* - Container-related entries in /proc/1/cgroup.
10+
*
11+
* The result is cached after the first call so filesystem access happens at
12+
* most once per process lifetime.
13+
*/
14+
let containerEnvironmentCache: boolean | undefined;
15+
16+
export function isContainerEnvironment(): boolean {
17+
if (containerEnvironmentCache !== undefined) {
18+
return containerEnvironmentCache;
19+
}
20+
containerEnvironmentCache = detectContainerEnvironment();
21+
return containerEnvironmentCache;
22+
}
23+
24+
function detectContainerEnvironment(): boolean {
25+
for (const sentinelPath of ["/.dockerenv", "/run/.containerenv", "/var/run/.containerenv"]) {
26+
try {
27+
fs.accessSync(sentinelPath, fs.constants.F_OK);
28+
return true;
29+
} catch {
30+
// Not present; try the next signal.
31+
}
32+
}
33+
34+
try {
35+
const cgroup = fs.readFileSync("/proc/1/cgroup", "utf8");
36+
if (
37+
/\/docker\/|cri-containerd-[0-9a-f]|containerd\/[0-9a-f]{64}|\/kubepods[/.]|\blxc\b/.test(
38+
cgroup,
39+
)
40+
) {
41+
return true;
42+
}
43+
} catch {
44+
// /proc may not exist (non-Linux platforms) or may be unreadable; fall through and report "not a container".
45+
}
46+
47+
return false;
48+
}
49+
50+
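/** @internal Test-only helper: clears the cached detection result so the next isContainerEnvironment() call probes the filesystem again. */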
51+
export function __resetContainerEnvironmentCacheForTest(): void {
52+
containerEnvironmentCache = undefined;
53+
}

src/infra/process-respawn.test.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { SUPERVISOR_HINT_ENV_VARS } from "./supervisor-markers.js";
44

55
const spawnMock = vi.hoisted(() => vi.fn());
66
const triggerOpenClawRestartMock = vi.hoisted(() => vi.fn());
7+
const isContainerEnvironmentMock = vi.hoisted(() => vi.fn(() => false));
78

89
vi.mock("node:child_process", async () => {
910
const { mockNodeBuiltinModule } = await import("openclaw/plugin-sdk/test-node-mocks");
@@ -17,6 +18,9 @@ vi.mock("node:child_process", async () => {
1718
vi.mock("./restart.js", () => ({
1819
triggerOpenClawRestart: (...args: unknown[]) => triggerOpenClawRestartMock(...args),
1920
}));
21+
vi.mock("./container-environment.js", () => ({
22+
isContainerEnvironment: () => isContainerEnvironmentMock(),
23+
}));
2024

2125
import {
2226
respawnGatewayProcessForUpdate,
@@ -44,6 +48,8 @@ afterEach(() => {
4448
process.execArgv = [...originalExecArgv];
4549
spawnMock.mockClear();
4650
triggerOpenClawRestartMock.mockClear();
51+
isContainerEnvironmentMock.mockReset();
52+
isContainerEnvironmentMock.mockReturnValue(false);
4753
if (originalPlatformDescriptor) {
4854
Object.defineProperty(process, "platform", originalPlatformDescriptor);
4955
}
@@ -206,6 +212,21 @@ describe("restartGatewayProcessWithFreshPid", () => {
206212
expect(spawnMock).not.toHaveBeenCalled();
207213
});
208214

215+
it("returns disabled in containers so PID 1 stays alive for in-process restart", () => {
216+
delete process.env.OPENCLAW_NO_RESPAWN;
217+
clearSupervisorHints();
218+
setPlatform("linux");
219+
isContainerEnvironmentMock.mockReturnValue(true);
220+
221+
const result = restartGatewayProcessWithFreshPid();
222+
223+
expect(result).toEqual({
224+
mode: "disabled",
225+
detail: "container: use in-process restart to keep PID 1 alive",
226+
});
227+
expect(spawnMock).not.toHaveBeenCalled();
228+
});
229+
209230
it("ignores node task script hints for gateway restart detection on Windows", () => {
210231
clearSupervisorHints();
211232
setPlatform("win32");

src/infra/process-respawn.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { spawn, type ChildProcess } from "node:child_process";
22
import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js";
3+
import { isContainerEnvironment } from "./container-environment.js";
34
import { formatErrorMessage } from "./errors.js";
45
import { triggerOpenClawRestart } from "./restart.js";
56
import { detectRespawnSupervisor } from "./supervisor-markers.js";
@@ -66,6 +67,12 @@ export function restartGatewayProcessWithFreshPid(): GatewayRespawnResult {
6667
detail: "win32: detached respawn unsupported without Scheduled Task markers",
6768
};
6869
}
70+
if (isContainerEnvironment()) {
71+
return {
72+
mode: "disabled",
73+
detail: "container: use in-process restart to keep PID 1 alive",
74+
};
75+
}
6976

7077
try {
7178
const { pid } = spawnDetachedGatewayProcess();

0 commit comments

Comments
 (0)