fix(bridge): scope CLI sessions per OpenClaw session and reset on /new

The bridge was keying claudeSessionId by agentId alone, so every Discord
channel, DM, and cron run for a single agent shared one Claude CLI
session. Two consequences in the wild:

  - Cross-channel context bleed: 8.7MB session for `developer` mixed
    references from channels 1474327736242798612 and 1498579994044010566
    plus the operator DM all in one --resume thread.
  - `/new` had no effect on the CLI side. OpenClaw rotated its session
    file, but the bridge kept `--resume`-ing the same long-lived
    claudeSessionId until it exceeded the model's 1M-token context window
    (debug log showed `prompt is too long: 1179616 tokens > 1000000 maximum`).

Changes:

  * input-filter: extract `chat_id` from the Conversation-info
    untrusted-metadata block (scanning all messages, since runtimeOnly
    turns put it in the system prompt) and detect bare `/new`/`/reset`
    via the BARE_SESSION_RESET_PROMPT_BASE marker. Add buildSessionKey
    `${agentId}::${chatId}` and resolveDispatchPrompt fallback for the
    empty user message that OpenClaw sends on bare resets.

  * server: use the composite session key for getSession/putSession;
    on bareSessionReset, removeSession before dispatching so the CLI
    starts a fresh session; on a CLI result_error (typically
    prompt_too_long) drop the entry too so the next turn doesn't
    re-resume into the poisoned context.

  * claude/sdk-adapter: surface CLI terminal errors via a new
    `result_error` event (carries reason + sessionId) so the bridge
    can react instead of just streaming the synthetic
    "Prompt is too long" assistant text and silently re-using the
    same session.

  * index: convert register() to synchronous (OpenClaw rejects async
    register with "plugin register must be synchronous"); replace the
    pre-bind port probe with a server-level EADDRINUSE handler.

  * .gitignore: ignore node_modules/ and dist/.
This commit is contained in:
zhi
2026-04-28 12:32:37 +00:00
parent 6be8d47982
commit 992f4d8703
5 changed files with 268 additions and 52 deletions

4
.gitignore vendored
View File

@@ -8,3 +8,7 @@ CLAUDE_CONTRACTOR_TEST_TOKEN
# IDE
.idea/
# Local dependencies / build output
node_modules/
dist/

View File

@@ -8,7 +8,14 @@ import { fileURLToPath } from "node:url";
export type ClaudeMessage =
| { type: "text"; text: string }
| { type: "done"; sessionId: string }
| { type: "error"; message: string };
| { type: "error"; message: string }
/**
* Terminal error from the CLI's `result` event (e.g. `is_error: true` with
* `terminal_reason: "prompt_too_long"`). The bridge uses this signal to
* drop the session-map entry so the next turn starts a fresh CLI session
* instead of `--resume`-ing into the same poisoned context.
*/
| { type: "result_error"; sessionId: string; reason: string; message: string };
export type OpenAITool = {
type: "function";
@@ -155,7 +162,9 @@ export async function* dispatchToClaude(
const rl = createInterface({ input: child.stdout!, crlfDelay: Infinity });
type CapturedResultError = { reason: string; message: string };
let capturedSessionId = "";
let capturedResultError = null as CapturedResultError | null;
const events: ClaudeMessage[] = [];
let done = false;
@@ -216,6 +225,15 @@ export async function* dispatchToClaude(
if (type === "result") {
const sessionId = (event.session_id as string) ?? "";
if (sessionId) capturedSessionId = sessionId;
// CLI signals fatal-but-graceful errors (context overflow, refusal,
// billing, etc.) via `is_error: true` on the result event. Capture the
// reason so the bridge layer can decide whether to invalidate the
// session-map entry (e.g. context overflow → drop, retry next turn).
if (event.is_error === true) {
const reason = (event.terminal_reason as string) ?? (event.subtype as string) ?? "error";
const message = (event.result as string) ?? `claude result error (${reason})`;
capturedResultError = { reason, message };
}
// `result` is the terminal stream-json event; commit the turn without
// waiting for claude's process tree to fully exit (leaked Bash grandchildren
// can otherwise hold stdout open indefinitely).
@@ -250,7 +268,18 @@ export async function* dispatchToClaude(
yield events.shift()!;
}
if (capturedSessionId) {
// Pull into a local with explicit type so TS doesn't infer the inner field
// accesses as `never` (the field is only ever assigned inside the readline
// callback above, so closure-based narrowing can't see it from this scope).
const resultErr: CapturedResultError | null = capturedResultError;
if (resultErr && capturedSessionId) {
yield {
type: "result_error",
sessionId: capturedSessionId,
reason: resultErr.reason,
message: resultErr.message,
};
} else if (capturedSessionId) {
yield { type: "done", sessionId: capturedSessionId };
} else {
const stderrSummary = stderrLines.join(" ").slice(0, 200);

View File

@@ -1,5 +1,4 @@
import fs from "node:fs";
import net from "node:net";
import path from "node:path";
import type { OpenClawPluginApi } from "openclaw/plugin-sdk";
import { normalizePluginConfig } from "./core/types/contractor.js";
@@ -8,15 +7,6 @@ import { createBridgeServer } from "./web/server.js";
import { registerCli } from "./commands/register-cli.js";
import type http from "node:http";
/**
 * Probe whether `port` can be bound on 127.0.0.1 right now.
 *
 * A throwaway TCP server is asked to listen on the port. The promise resolves
 * `true` once that probe has started listening and has been closed again, and
 * `false` if the probe emits `error` (any error, not only EADDRINUSE).
 * The promise never rejects.
 */
function isPortFree(port: number): Promise<boolean> {
  return new Promise<boolean>((settle) => {
    const probe = net.createServer();

    const onFailure = (): void => settle(false);
    const onBound = (): void => {
      // Release the port before reporting it as free.
      probe.close(() => settle(true));
    };

    probe.once("error", onFailure);
    probe.once("listening", onBound);
    probe.listen(port, "127.0.0.1");
  });
}
// ── GlobalThis state ─────────────────────────────────────────────────────────
// All persistent state lives on globalThis to survive OpenClaw hot-reloads.
// See LESSONS_LEARNED.md items 1, 3, 11.
@@ -32,7 +22,12 @@ const OPENCLAW_CONFIG_KEY = "_contractorOpenClawConfig";
export default {
id: "contractor-agent",
name: "Contractor Agent",
async register(api: OpenClawPluginApi) {
// OpenClaw requires register() to be synchronous — returning a Promise
// surfaces as `Error: plugin register must be synchronous` and the plugin
// ends up in `error` state. We avoid `await` here and instead let the
// bridge server bind asynchronously, handling EADDRINUSE via the server's
// `error` event when another gateway/CLI process already owns the port.
register(api: OpenClawPluginApi): void {
const config = normalizePluginConfig(api.pluginConfig);
// Resolve agent metadata for the bridge server's resolveAgent callback.
@@ -58,9 +53,6 @@ export default {
}
// ── Gateway lifecycle (start bridge server once per gateway process) ──────
// Guard with globalThis flag AND a port probe to handle the case where the
// gateway is already running the server while a CLI subprocess is starting up.
// (See LESSONS_LEARNED.md item 7 — lock file / port probe pattern)
// Always update the config accessor so hot-reloads get fresh config.
// server.ts reads this via globalThis to build tool execution context.
_G[OPENCLAW_CONFIG_KEY] = api.config;
@@ -68,15 +60,6 @@ export default {
if (!_G[LIFECYCLE_KEY]) {
_G[LIFECYCLE_KEY] = true;
// Only bind if port is not already in use (avoids EADDRINUSE in CLI mode)
const portFree = await isPortFree(config.bridgePort);
if (!portFree) {
api.logger.info(
`[contractor-agent] bridge already running on port ${config.bridgePort}, skipping bind`,
);
return;
}
const server = createBridgeServer({
port: config.bridgePort,
apiKey: config.bridgeApiKey,
@@ -84,6 +67,20 @@ export default {
resolveAgent,
logger: api.logger,
});
// EADDRINUSE → another gateway/CLI process already owns the port; that's
// fine, we just don't double-bind. Any other error is logged but does
// not crash registration.
server.on("error", (err: NodeJS.ErrnoException) => {
if (err.code === "EADDRINUSE") {
api.logger.info(
`[contractor-agent] bridge already running on port ${config.bridgePort}, skipping bind`,
);
return;
}
api.logger.warn(`[contractor-agent] bridge server error: ${err.message ?? String(err)}`);
});
_G[SERVER_KEY] = server;
api.on("gateway_stop", () => {

View File

@@ -10,6 +10,11 @@ function messageText(m: OpenAIMessage): string {
.join("");
}
/**
 * Drop the leading "[...]" prefix from a message and trim the result.
 *
 * Example: "[Sat 2026-04-11 08:32 GMT+1] hello" → "hello".
 *
 * NOTE(review): the pattern is not timestamp-specific — any single leading
 * bracketed group (e.g. "[urgent] ...") is removed as well. Confirm every
 * caller only passes text that OpenClaw has already prefixed.
 */
function stripOpenClawTimestampPrefix(raw: string): string {
  const leadingBracketGroup = /^\[[^\]]+\]\s*/;
  const withoutPrefix = raw.replace(leadingBracketGroup, "");
  return withoutPrefix.trim();
}
/**
* Extract the latest user message from the OpenClaw request.
*
@@ -20,14 +25,16 @@ function messageText(m: OpenAIMessage): string {
*
* OpenClaw prefixes user messages with a timestamp: "[Day YYYY-MM-DD HH:MM TZ] text"
* We strip the timestamp prefix before forwarding.
*
* Returns "" if no user messages exist or the latest user message is empty
* (e.g. a bare /new turn — see also extractRequestContext.bareSessionReset).
*/
export function extractLatestUserMessage(req: BridgeInboundRequest): string {
  // Only `user`-role messages are candidates; system/assistant turns are ignored.
  const userMessages = req.messages.filter((m) => m.role === "user");
  if (userMessages.length === 0) return "";

  // The last user message in request order is the one for this turn.
  const raw = messageText(userMessages[userMessages.length - 1]);

  // Single exit point: the shared helper strips the OpenClaw timestamp prefix
  // ("[Sat 2026-04-11 08:32 GMT+1] ") and trims. Previously an inline copy of
  // the same regex returned first, leaving the helper call unreachable.
  return stripOpenClawTimestampPrefix(raw);
}
export type RequestContext = {
@@ -37,26 +44,90 @@ export type RequestContext = {
skillsBlock: string;
/** OpenClaw context files present in the workspace (SOUL.md, IDENTITY.md, etc.) */
workspaceContextFiles: string[];
/**
* OpenClaw conversation/chat identifier scraped from the "Conversation info"
* untrusted-metadata JSON block that OpenClaw appends to user messages on
* non-direct or non-webchat surfaces (Discord channels, Discord DMs,
* Telegram, etc.).
*
* Format examples:
* - DM: "user:561921120408698910"
* - Channel: "channel:1498579994044010566"
*
* Empty when not parseable (typical for local TUI / webchat direct chats),
* in which case we fall back to keying sessions by agentId only.
*/
chatId: string;
/**
* True when this turn was triggered by `/new` (or the equivalent bare
* `/reset`) on the OpenClaw side. We detect it by looking for the literal
* marker that OpenClaw injects into the runtime prompt:
*
* "A new session was started via /new or /reset."
*
* (See `BARE_SESSION_RESET_PROMPT_BASE` in OpenClaw's
* startup-context module.)
*
* The bridge uses this to discard any prior `claudeSessionId` so we start
* a fresh Claude CLI session instead of `--resume`-ing into an old one
* that the user just asked to abandon.
*/
bareSessionReset: boolean;
};
// Substring searched for (via `includes`) in each request message to recognise
// a bare `/new` / `/reset` turn. It carries no trailing period, so it matches
// the marker sentence with or without one.
const BARE_SESSION_RESET_MARKER =
"A new session was started via /new or /reset";
/**
 * Pull the `chat_id` value out of a blob of message text.
 *
 * OpenClaw embeds an untrusted-metadata block shaped like:
 *
 *   Conversation info (untrusted metadata):
 *   ```json
 *   {
 *     "chat_id": "channel:1498579994044010566",
 *     ...
 *   }
 *   ```
 *
 * The block may sit in a user message body, in the runtime-context system
 * message, or in both. Rather than JSON.parse-ing the block (brittle against
 * truncation and nested code fences), the `"chat_id": "<value>"` pair is
 * matched directly; the value is any run of characters up to the next double
 * quote or newline.
 *
 * NOTE(review): the first `"chat_id"` pair anywhere in `text` wins, including
 * one that merely appears in user-typed content — confirm the envelope layout
 * makes that safe for session scoping.
 *
 * @returns the captured chat id, or "" when no non-empty value is found.
 */
function extractChatIdFromText(text: string): string {
  const hit = /"chat_id"\s*:\s*"([^"\n]+)"/.exec(text);
  if (hit === null) return "";
  return hit[1];
}
/**
* Parse agent ID and workspace path from the OpenClaw system prompt.
* Parse agent ID, workspace path, chat id, and the bare-session-reset flag
* out of the OpenClaw request.
*
* OpenClaw does NOT send agent ID / session key as HTTP headers — it's embedded
* in the system prompt as a "## Runtime" line:
* Runtime: agent=contractor-e2e | host=... | repo=/tmp/contractor-e2e-workspace | ...
* OpenClaw does NOT send agent ID / session key as HTTP headers — agent and
* workspace come from the system prompt's "## Runtime" line:
*
* We parse this line to extract `agent` (agent ID) and `repo` (workspace path).
* Runtime: agent=<id> | host=... | repo=<workspace> | ...
*
* Conversation info (chat_id) is injected into the user message envelope
* as untrusted metadata; we scrape it so the bridge can scope sessions per
* Discord channel / DM / etc., instead of collapsing everything for an
* agent into a single Claude CLI session.
*/
export function extractRequestContext(req: BridgeInboundRequest): RequestContext {
const empty: RequestContext = {
agentId: "",
workspace: "",
skillsBlock: "",
workspaceContextFiles: [],
chatId: "",
bareSessionReset: false,
};
const systemMsg = req.messages.find((m) => m.role === "system");
if (!systemMsg) return { agentId: "", workspace: "", skillsBlock: "", workspaceContextFiles: [] };
if (!systemMsg) return empty;
const text = messageText(systemMsg);
const systemText = messageText(systemMsg);
// Match "Runtime: agent=<id> | ... | repo=<path> | ..."
const runtimeMatch = text.match(/Runtime:\s*([^\n]+)/);
if (!runtimeMatch) return { agentId: "", workspace: "", skillsBlock: "", workspaceContextFiles: [] };
const runtimeMatch = systemText.match(/Runtime:\s*([^\n]+)/);
if (!runtimeMatch) return empty;
const runtimeLine = runtimeMatch[1];
const agentMatch = runtimeLine.match(/\bagent=([^|\s]+)/);
@@ -65,14 +136,13 @@ export function extractRequestContext(req: BridgeInboundRequest): RequestContext
// Extract <available_skills>...</available_skills> XML block.
// Expand leading "~/" in <location> paths to the actual home dir so Claude doesn't
// try /root/.openclaw/... (which fails with EACCES).
const skillsMatch = text.match(/<available_skills>[\s\S]*?<\/available_skills>/);
const skillsMatch = systemText.match(/<available_skills>[\s\S]*?<\/available_skills>/);
const home = process.env.HOME ?? "/root";
const skillsBlock = skillsMatch
? skillsMatch[0].replace(/~\//g, `${home}/`)
: "";
// Detect which OpenClaw context files are present in the workspace.
// These tell us what persona/memory files to surface to Claude.
const workspace = repoMatch?.[1] ?? "";
const CONTEXT_FILES = ["SOUL.md", "IDENTITY.md", "MEMORY.md", "AGENTS.md", "USER.md"];
const workspaceContextFiles: string[] = [];
@@ -80,16 +150,77 @@ export function extractRequestContext(req: BridgeInboundRequest): RequestContext
for (const f of CONTEXT_FILES) {
if (fs.existsSync(path.join(workspace, f))) workspaceContextFiles.push(f);
}
// Also check for memory/ directory
if (fs.existsSync(path.join(workspace, "memory"))) {
workspaceContextFiles.push("memory/");
}
}
// chat_id can appear in any message (user envelope or runtime-context
// system block). Scan from newest to oldest and take the first hit.
let chatId = "";
for (let i = req.messages.length - 1; i >= 0; i -= 1) {
const text = messageText(req.messages[i]);
if (!text) continue;
const found = extractChatIdFromText(text);
if (found) {
chatId = found;
break;
}
}
// Detect bare /new or /reset: OpenClaw injects the BARE_SESSION_RESET_PROMPT_BASE
// marker into the prompt body when the user typed `/new` (or bare `/reset`)
// with no trailing instruction.
let bareSessionReset = false;
for (const m of req.messages) {
if (messageText(m).includes(BARE_SESSION_RESET_MARKER)) {
bareSessionReset = true;
break;
}
}
return {
agentId: agentMatch?.[1] ?? "",
workspace,
skillsBlock,
workspaceContextFiles,
chatId,
bareSessionReset,
};
}
/**
 * Compose the session-map key for a CLI session.
 *
 * - No agent id → "" (no usable key).
 * - Agent id but no chat id → the agent id alone. This keeps the older
 *   "one session per agent" behavior as a backstop when chat_id could not be
 *   parsed (e.g. local TUI direct chats) instead of producing a new session
 *   on every request.
 * - Both present → `<agentId>::<chatId>`, so each OpenClaw surface (DM,
 *   channel, ...) gets its own Claude CLI session and contexts do not bleed
 *   across surfaces.
 */
export function buildSessionKey(agentId: string, chatId: string): string {
  const hasAgent = Boolean(agentId);
  const hasChat = Boolean(chatId);

  if (hasAgent && hasChat) return `${agentId}::${chatId}`;
  return hasAgent ? agentId : "";
}
/**
 * Decide which prompt text is forwarded to the CLI for this turn.
 *
 * - A non-empty latest user message is forwarded unchanged.
 * - On a bare `/new` turn OpenClaw sends an empty user message body (the
 *   greeting request lives in a runtime-context system block). The provider
 *   rejects an empty user message, so a short synthetic greeting prompt is
 *   returned instead.
 * - Otherwise "" is returned.
 *
 * @param latestMessage latest user message, timestamp prefix already stripped
 * @param ctx only `bareSessionReset` is consulted
 */
export function resolveDispatchPrompt(
  latestMessage: string,
  ctx: Pick<RequestContext, "bareSessionReset">,
): string {
  const syntheticGreetingPrompt =
    "A new session was just started. Greet the user briefly in your configured persona and ask what they'd like to do.";

  if (latestMessage) {
    return latestMessage;
  }
  return ctx.bareSessionReset ? syntheticGreetingPrompt : "";
}

View File

@@ -1,7 +1,12 @@
import http from "node:http";
import { randomUUID } from "node:crypto";
import type { BridgeInboundRequest } from "../core/types/model.js";
import { extractLatestUserMessage, extractRequestContext } from "./input-filter.js";
import {
buildSessionKey,
extractLatestUserMessage,
extractRequestContext,
resolveDispatchPrompt,
} from "./input-filter.js";
import { buildBootstrap } from "./bootstrap.js";
import { dispatchToClaude } from "../core/claude/sdk-adapter.js";
import { dispatchToGemini } from "../core/gemini/sdk-adapter.js";
@@ -10,6 +15,7 @@ import {
getSession,
putSession,
markOrphaned,
removeSession,
} from "../core/contractor/session-map-store.js";
export type BridgeServerConfig = {
@@ -100,22 +106,39 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
return;
}
// Extract agent ID and workspace from the system prompt's Runtime line.
// OpenClaw does NOT send agent/session info as HTTP headers — it's in the system prompt.
const { agentId: parsedAgentId, workspace: parsedWorkspace, skillsBlock, workspaceContextFiles } = extractRequestContext(body);
// Extract agent ID, workspace, chat id, and bare-reset signal from the
// request. OpenClaw does NOT send agent/session info as HTTP headers — it
// lives in the system prompt's Runtime line and the user envelope's
// "Conversation info" untrusted-metadata block.
const {
agentId: parsedAgentId,
workspace: parsedWorkspace,
skillsBlock,
workspaceContextFiles,
chatId,
bareSessionReset,
} = extractRequestContext(body);
const latestMessage = extractLatestUserMessage(body);
if (!latestMessage) {
// Pick the prompt to forward to the CLI. For bare /new turns OpenClaw
// submits an empty user message — we synthesize a stub prompt instead so
// the CLI has something to respond to.
const dispatchPrompt = resolveDispatchPrompt(latestMessage, { bareSessionReset });
if (!dispatchPrompt) {
sendJson(res, 400, { error: "no user message found" });
return;
}
// Use agentId as session key — one persistent Claude session per agent (v1).
// Scope the CLI session by (agentId, chat_id) so different Discord
// channels / DMs / etc. for the same agent don't pile into one Claude
// session and bleed context across surfaces. Falls back to agentId-only
// when chat_id can't be parsed (local TUI, etc.).
const agentId = parsedAgentId;
const sessionKey = agentId; // stable per-agent key
const sessionKey = buildSessionKey(agentId, chatId);
logger.info(
`[contractor-bridge] turn agentId=${agentId} workspace=${parsedWorkspace} msg=${latestMessage.substring(0, 80)}`,
`[contractor-bridge] turn agentId=${agentId} sessionKey=${sessionKey} workspace=${parsedWorkspace} bareReset=${bareSessionReset} msg=${dispatchPrompt.substring(0, 80)}`,
);
// Resolve workspace: prefer what we parsed from the system prompt (most accurate);
@@ -133,8 +156,18 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
// Detect backend from body.model: "contractor-gemini-bridge" → Gemini, else → Claude
const isGemini = typeof body.model === "string" && body.model.includes("gemini");
// Look up existing session (shared structure for both Claude and Gemini)
// Look up existing session (shared structure for both Claude and Gemini).
// On a bare /new or /reset turn we deliberately drop the existing entry so
// the CLI starts a fresh session — otherwise --resume would bring back the
// very history the user just asked to abandon.
let existingEntry = sessionKey ? getSession(workspace, sessionKey) : null;
if (bareSessionReset && existingEntry && sessionKey) {
logger.info(
`[contractor-bridge] bare /new detected — dropping prior CLI session sessionKey=${sessionKey} prevClaudeSessionId=${existingEntry.claudeSessionId}`,
);
removeSession(workspace, sessionKey);
existingEntry = null;
}
let resumeSessionId = existingEntry?.state === "active" ? existingEntry.claudeSessionId : null;
// Bootstrap is passed as the system prompt on every turn (stateless — not persisted in session files).
@@ -160,13 +193,14 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
const completionId = `chatcmpl-bridge-${randomUUID().slice(0, 8)}`;
let newSessionId = "";
let hasError = false;
let resultErrorReason: string | null = null;
const openclawTools = body.tools ?? [];
try {
const dispatchIter = isGemini
? dispatchToGemini({
prompt: latestMessage,
prompt: dispatchPrompt,
systemPrompt,
workspace,
agentId,
@@ -176,7 +210,7 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
bridgeApiKey: apiKey,
})
: dispatchToClaude({
prompt: latestMessage,
prompt: dispatchPrompt,
systemPrompt,
workspace,
agentId,
@@ -192,6 +226,15 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
sseWrite(res, buildChunk(completionId, event.text));
} else if (event.type === "done") {
newSessionId = event.sessionId;
} else if (event.type === "result_error") {
// CLI returned a terminal error (typically context overflow). The
// text was already streamed via prior `text` events; record the
// session so we can drop it below and log the reason.
logger.warn(
`[contractor-bridge] ${isGemini ? "gemini" : "claude"} result_error reason=${event.reason} sessionId=${event.sessionId} message=${event.message.substring(0, 200)}`,
);
resultErrorReason = event.reason;
newSessionId = event.sessionId;
} else if (event.type === "error") {
logger.warn(`[contractor-bridge] ${isGemini ? "gemini" : "claude"} error: ${event.message}`);
hasError = true;
@@ -208,8 +251,20 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
sseWrite(res, "[DONE]");
res.end();
// Persist session mapping (shared for both Claude and Gemini)
if (newSessionId && sessionKey && !hasError) {
// Session-map persistence:
// - Successful turn → upsert with the latest claudeSessionId so the next
// turn can `--resume` into it.
// - Terminal CLI error (context overflow etc., reported via result_error)
// → drop the entry so the next turn starts fresh instead of resuming
// into the same poisoned session and re-erroring.
// - Stream/transport error before any sessionId was captured → mark the
// prior entry orphaned (existing behavior).
if (resultErrorReason && sessionKey) {
logger.info(
`[contractor-bridge] dropping CLI session after terminal error sessionKey=${sessionKey} reason=${resultErrorReason}`,
);
removeSession(workspace, sessionKey);
} else if (newSessionId && sessionKey && !hasError) {
const now = new Date().toISOString();
putSession(workspace, {
openclawSessionKey: sessionKey,