fix(bridge): scope CLI sessions per OpenClaw session and reset on /new
The bridge was keying claudeSessionId by agentId alone, so every Discord
channel, DM, and cron run for a single agent shared one Claude CLI
session. Two consequences in the wild:
- Cross-channel context bleed: an 8.7MB session for `developer` mixed
references from channels 1474327736242798612 and 1498579994044010566
plus the operator DM, all in one --resume thread.
- `/new` had no effect on the CLI side. OpenClaw rotated its session
file, but the bridge kept resuming (--resume) the same long-lived
claudeSessionId, eventually exceeding the model's 1M-token context limit
(debug log showed `prompt is too long: 1179616 tokens > 1000000 maximum`).
Changes:
* input-filter: extract `chat_id` from the Conversation-info
untrusted-metadata block (scanning all messages, since runtimeOnly
turns put it in the system prompt) and detect bare `/new`/`/reset`
via the BARE_SESSION_RESET_PROMPT_BASE marker. Add buildSessionKey
(`${agentId}::${chatId}`) and a resolveDispatchPrompt fallback for the
empty user message that OpenClaw sends on bare resets.
* server: use the composite session key for getSession/putSession;
on bareSessionReset, removeSession before dispatching so the CLI
starts a fresh session; on a CLI result_error (typically
prompt_too_long) drop the entry too so the next turn doesn't
re-resume into the poisoned context.
* claude/sdk-adapter: surface CLI terminal errors via a new
`result_error` event (carries reason + sessionId) so the bridge
can react instead of just streaming the synthetic
"Prompt is too long" assistant text and silently re-using the
same session.
* index: make register() synchronous (OpenClaw rejects an async
register with "plugin register must be synchronous"); replace the
pre-bind port probe with a server-level EADDRINUSE handler.
* .gitignore: ignore node_modules/ and dist/.
This commit is contained in:
@@ -1,5 +1,4 @@
|
||||
import fs from "node:fs";
|
||||
import net from "node:net";
|
||||
import path from "node:path";
|
||||
import type { OpenClawPluginApi } from "openclaw/plugin-sdk";
|
||||
import { normalizePluginConfig } from "./core/types/contractor.js";
|
||||
@@ -8,15 +7,6 @@ import { createBridgeServer } from "./web/server.js";
|
||||
import { registerCli } from "./commands/register-cli.js";
|
||||
import type http from "node:http";
|
||||
|
||||
/**
 * Checks whether `port` can currently be bound on 127.0.0.1.
 *
 * Starts a throwaway `net` server on the port: resolves `false` if that
 * server emits `error`, or `true` once it is listening and has been closed
 * again. The returned promise only ever resolves; it does not reject.
 *
 * @param port TCP port to probe on the loopback interface.
 * @returns Promise resolving to `true` if the probe bind succeeded, `false` on any listen error.
 */
function isPortFree(port: number): Promise<boolean> {
|
||||
return new Promise((resolve) => {
|
||||
const tester = net.createServer();
|
||||
tester.once("error", () => resolve(false));
|
||||
tester.once("listening", () => tester.close(() => resolve(true)));
|
||||
tester.listen(port, "127.0.0.1");
|
||||
});
|
||||
}
|
||||
|
||||
// ── GlobalThis state ─────────────────────────────────────────────────────────
|
||||
// All persistent state lives on globalThis to survive OpenClaw hot-reloads.
|
||||
// See LESSONS_LEARNED.md items 1, 3, 11.
|
||||
@@ -32,7 +22,12 @@ const OPENCLAW_CONFIG_KEY = "_contractorOpenClawConfig";
|
||||
export default {
|
||||
id: "contractor-agent",
|
||||
name: "Contractor Agent",
|
||||
async register(api: OpenClawPluginApi) {
|
||||
// OpenClaw requires register() to be synchronous — returning a Promise
|
||||
// surfaces as `Error: plugin register must be synchronous` and the plugin
|
||||
// ends up in `error` state. We avoid `await` here and instead let the
|
||||
// bridge server bind asynchronously, handling EADDRINUSE via the server's
|
||||
// `error` event when another gateway/CLI process already owns the port.
|
||||
register(api: OpenClawPluginApi): void {
|
||||
const config = normalizePluginConfig(api.pluginConfig);
|
||||
|
||||
// Resolve agent metadata for the bridge server's resolveAgent callback.
|
||||
@@ -58,9 +53,6 @@ export default {
|
||||
}
|
||||
|
||||
// ── Gateway lifecycle (start bridge server once per gateway process) ──────
|
||||
// Guard with globalThis flag AND a port probe to handle the case where the
|
||||
// gateway is already running the server while a CLI subprocess is starting up.
|
||||
// (See LESSONS_LEARNED.md item 7 — lock file / port probe pattern)
|
||||
// Always update the config accessor so hot-reloads get fresh config.
|
||||
// server.ts reads this via globalThis to build tool execution context.
|
||||
_G[OPENCLAW_CONFIG_KEY] = api.config;
|
||||
@@ -68,15 +60,6 @@ export default {
|
||||
if (!_G[LIFECYCLE_KEY]) {
|
||||
_G[LIFECYCLE_KEY] = true;
|
||||
|
||||
// Only bind if port is not already in use (avoids EADDRINUSE in CLI mode)
|
||||
const portFree = await isPortFree(config.bridgePort);
|
||||
if (!portFree) {
|
||||
api.logger.info(
|
||||
`[contractor-agent] bridge already running on port ${config.bridgePort}, skipping bind`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
const server = createBridgeServer({
|
||||
port: config.bridgePort,
|
||||
apiKey: config.bridgeApiKey,
|
||||
@@ -84,6 +67,20 @@ export default {
|
||||
resolveAgent,
|
||||
logger: api.logger,
|
||||
});
|
||||
|
||||
// EADDRINUSE → another gateway/CLI process already owns the port; that's
|
||||
// fine, we just don't double-bind. Any other error is logged but does
|
||||
// not crash registration.
|
||||
server.on("error", (err: NodeJS.ErrnoException) => {
|
||||
if (err.code === "EADDRINUSE") {
|
||||
api.logger.info(
|
||||
`[contractor-agent] bridge already running on port ${config.bridgePort}, skipping bind`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
api.logger.warn(`[contractor-agent] bridge server error: ${err.message ?? String(err)}`);
|
||||
});
|
||||
|
||||
_G[SERVER_KEY] = server;
|
||||
|
||||
api.on("gateway_stop", () => {
|
||||
|
||||
Reference in New Issue
Block a user