fix(bridge): scope CLI sessions per OpenClaw session and reset on /new

The bridge was keying claudeSessionId by agentId alone, so every Discord channel, DM, and cron run for a single agent shared one Claude CLI session.

Two consequences in the wild:

- Cross-channel context bleed: an 8.7MB session for `developer` mixed references from channels 1474327736242798612 and 1498579994044010566 plus the operator DM, all in one --resume thread.
- `/new` had no effect on the CLI side. OpenClaw rotated its session file, but the bridge kept --resume-ing the same long-lived claudeSessionId, eventually crossing the 1M model context (debug log showed `prompt is too long: 1179616 tokens > 1000000 maximum`).

Changes:

* input-filter: extract `chat_id` from the Conversation-info untrusted-metadata block (scanning all messages, since runtimeOnly turns put it in the system prompt) and detect bare `/new`/`/reset` via the BARE_SESSION_RESET_PROMPT_BASE marker. Add buildSessionKey (`${agentId}::${chatId}`) and a resolveDispatchPrompt fallback for the empty user message that OpenClaw sends on bare resets.
* server: use the composite session key for getSession/putSession; on bareSessionReset, call removeSession before dispatching so the CLI starts a fresh session; on a CLI result_error (typically prompt_too_long), drop the entry too so the next turn doesn't re-resume into the poisoned context.
* claude/sdk-adapter: surface CLI terminal errors via a new `result_error` event (carries reason + sessionId) so the bridge can react instead of just streaming the synthetic "Prompt is too long" assistant text and silently re-using the same session.
* index: convert register() to synchronous (OpenClaw rejects an async register with "plugin register must be synchronous"); replace the pre-bind port probe with a server-level EADDRINUSE handler.
* .gitignore: ignore node_modules/ and dist/.
2026-04-28 12:32:37 +00:00
parent 6be8d47982
commit 992f4d8703
5 changed files with 268 additions and 52 deletions
--- a/plugin/web/input-filter.ts
+++ b/plugin/web/input-filter.ts
@@ -10,6 +10,11 @@ function messageText(m: OpenAIMessage): string {
    .join("");
 }

+/**
+ * Remove the leading "[...]" prefix that OpenClaw puts in front of user
+ * messages, then trim surrounding whitespace.
+ *
+ * NOTE(review): the pattern matches any leading bracketed segment, not only
+ * a timestamp, so a message that starts with "[...]" and carries no
+ * timestamp prefix would lose that segment — confirm OpenClaw always adds one.
+ *
+ * @param raw - Message text as received.
+ * @returns The text without the leading bracketed prefix, trimmed.
+ */
+function stripOpenClawTimestampPrefix(raw: string): string {
+  // "[Sat 2026-04-11 08:32 GMT+1] " → ""
+  return raw.replace(/^\[[^\]]+\]\s*/, "").trim();
+}
+
 /**
 * Extract the latest user message from the OpenClaw request.
 *
@@ -20,14 +25,16 @@ function messageText(m: OpenAIMessage): string {
 *
 * OpenClaw prefixes user messages with a timestamp: "[Day YYYY-MM-DD HH:MM TZ] text"
 * We strip the timestamp prefix before forwarding.
+ *
+ * Returns "" if no user messages exist or the latest user message is empty
+ * (e.g. a bare /new turn — see also extractRequestContext.bareSessionReset).
 */
 export function extractLatestUserMessage(req: BridgeInboundRequest): string {
  const userMessages = req.messages.filter((m) => m.role === "user");
  if (userMessages.length === 0) return "";

  const raw = messageText(userMessages[userMessages.length - 1]);
-  // Strip OpenClaw timestamp prefix: "[Sat 2026-04-11 08:32 GMT+1] "
-  return raw.replace(/^\[[^\]]+\]\s*/, "").trim();
+  return stripOpenClawTimestampPrefix(raw);
 }

 export type RequestContext = {
@@ -37,26 +44,90 @@ export type RequestContext = {
  skillsBlock: string;
  /** OpenClaw context files present in the workspace (SOUL.md, IDENTITY.md, etc.) */
  workspaceContextFiles: string[];
+  /**
+   * OpenClaw conversation/chat identifier scraped from the "Conversation info"
+   * untrusted-metadata JSON block that OpenClaw appends to user messages on
+   * non-direct or non-webchat surfaces (Discord channels, Discord DMs,
+   * Telegram, etc.).
+   *
+   * Format examples:
+   *   - DM:      "user:561921120408698910"
+   *   - Channel: "channel:1498579994044010566"
+   *
+   * Empty when not parseable (typical for local TUI / webchat direct chats),
+   * in which case we fall back to keying sessions by agentId only.
+   */
+  chatId: string;
+  /**
+   * True when this turn was triggered by `/new` (or the equivalent bare
+   * `/reset`) on the OpenClaw side. We detect it by looking for the literal
+   * marker that OpenClaw injects into the runtime prompt:
+   *
+   *   "A new session was started via /new or /reset."
+   *
+   * (See `BARE_SESSION_RESET_PROMPT_BASE` in OpenClaw's
+   * startup-context module.)
+   *
+   * The bridge uses this to discard any prior `claudeSessionId` so we start
+   * a fresh Claude CLI session instead of `--resume`-ing into an old one
+   * that the user just asked to abandon.
+   */
+  bareSessionReset: boolean;
 };

+const BARE_SESSION_RESET_MARKER =
+  "A new session was started via /new or /reset";
+
+function extractChatIdFromText(text: string): string {
+  // OpenClaw injects an untrusted-metadata block of the form:
+  //
+  //   Conversation info (untrusted metadata):
+  //   ```json
+  //   {
+  //     "chat_id": "channel:1498579994044010566",
+  //     ...
+  //   }
+  //   ```
+  //
+  // It can appear inside a user message body, the runtime-context system
+  // message, or both. A single regex on the `"chat_id": "..."` pair is
+  // enough — we don't need to JSON.parse the whole block (and parsing would
+  // be brittle against truncation / nested code fences).
+  //
+  // The captured value stops at the first double quote or newline, and only
+  // the first occurrence in `text` is used. Returns "" when nothing matches.
+  //
+  // NOTE(review): the match is not anchored to the "Conversation info" block,
+  // so the same literal anywhere earlier in `text` would be picked up instead.
+  const match = text.match(/"chat_id"\s*:\s*"([^"\n]+)"/);
+  return match ? match[1] : "";
+}
+
 /**
- * Parse agent ID and workspace path from the OpenClaw system prompt.
+ * Parse agent ID, workspace path, chat id, and the bare-session-reset flag
+ * out of the OpenClaw request.
 *
- * OpenClaw does NOT send agent ID / session key as HTTP headers — it's embedded
- * in the system prompt as a "## Runtime" line:
- *   Runtime: agent=contractor-e2e | host=... | repo=/tmp/contractor-e2e-workspace | ...
+ * OpenClaw does NOT send agent ID / session key as HTTP headers — agent and
+ * workspace come from the system prompt's "## Runtime" line:
 *
- * We parse this line to extract `agent` (agent ID) and `repo` (workspace path).
+ *   Runtime: agent=<id> | host=... | repo=<workspace> | ...
+ *
+ * Conversation info (chat_id) is injected into the user message envelope
+ * as untrusted metadata; we scrape it so the bridge can scope sessions per
+ * Discord channel / DM / etc., instead of collapsing everything for an
+ * agent into a single Claude CLI session.
 */
 export function extractRequestContext(req: BridgeInboundRequest): RequestContext {
+  const empty: RequestContext = {
+    agentId: "",
+    workspace: "",
+    skillsBlock: "",
+    workspaceContextFiles: [],
+    chatId: "",
+    bareSessionReset: false,
+  };
  const systemMsg = req.messages.find((m) => m.role === "system");
-  if (!systemMsg) return { agentId: "", workspace: "", skillsBlock: "", workspaceContextFiles: [] };
+  if (!systemMsg) return empty;

-  const text = messageText(systemMsg);
+  const systemText = messageText(systemMsg);

  // Match "Runtime: agent=<id> | ... | repo=<path> | ..."
-  const runtimeMatch = text.match(/Runtime:\s*([^\n]+)/);
-  if (!runtimeMatch) return { agentId: "", workspace: "", skillsBlock: "", workspaceContextFiles: [] };
+  const runtimeMatch = systemText.match(/Runtime:\s*([^\n]+)/);
+  if (!runtimeMatch) return empty;

  const runtimeLine = runtimeMatch[1];
  const agentMatch = runtimeLine.match(/\bagent=([^|\s]+)/);
@@ -65,14 +136,13 @@ export function extractRequestContext(req: BridgeInboundRequest): RequestContext
  // Extract <available_skills>...</available_skills> XML block.
  // Expand leading "~/" in <location> paths to the actual home dir so Claude doesn't
  // try /root/.openclaw/... (which fails with EACCES).
-  const skillsMatch = text.match(/<available_skills>[\s\S]*?<\/available_skills>/);
+  const skillsMatch = systemText.match(/<available_skills>[\s\S]*?<\/available_skills>/);
  const home = process.env.HOME ?? "/root";
  const skillsBlock = skillsMatch
    ? skillsMatch[0].replace(/~\//g, `${home}/`)
    : "";

  // Detect which OpenClaw context files are present in the workspace.
-  // These tell us what persona/memory files to surface to Claude.
  const workspace = repoMatch?.[1] ?? "";
  const CONTEXT_FILES = ["SOUL.md", "IDENTITY.md", "MEMORY.md", "AGENTS.md", "USER.md"];
  const workspaceContextFiles: string[] = [];
@@ -80,16 +150,77 @@ export function extractRequestContext(req: BridgeInboundRequest): RequestContext
    for (const f of CONTEXT_FILES) {
      if (fs.existsSync(path.join(workspace, f))) workspaceContextFiles.push(f);
    }
-    // Also check for memory/ directory
    if (fs.existsSync(path.join(workspace, "memory"))) {
      workspaceContextFiles.push("memory/");
    }
  }

+  // chat_id can appear in any message (user envelope or runtime-context
+  // system block). Scan from newest to oldest and take the first hit.
+  let chatId = "";
+  for (let i = req.messages.length - 1; i >= 0; i -= 1) {
+    const text = messageText(req.messages[i]);
+    if (!text) continue;
+    const found = extractChatIdFromText(text);
+    if (found) {
+      chatId = found;
+      break;
+    }
+  }
+
+  // Detect bare /new or /reset: OpenClaw injects the BARE_SESSION_RESET_PROMPT_BASE
+  // marker into the prompt body when the user typed `/new` (or bare `/reset`)
+  // with no trailing instruction.
+  let bareSessionReset = false;
+  for (const m of req.messages) {
+    if (messageText(m).includes(BARE_SESSION_RESET_MARKER)) {
+      bareSessionReset = true;
+      break;
+    }
+  }
+
  return {
    agentId: agentMatch?.[1] ?? "",
    workspace,
    skillsBlock,
    workspaceContextFiles,
+    chatId,
+    bareSessionReset,
  };
 }
+
+/**
+ * Build the per-CLI-session map key from the parsed request context.
+ *
+ * Each unique OpenClaw session (DM, channel, etc.) gets its own Claude CLI
+ * session so contexts don't bleed across surfaces. Falls back to the agent
+ * id alone when chat_id can't be parsed (e.g. local TUI direct chats), so
+ * the historical "one session per agent" behavior remains as a backstop
+ * rather than degrading to one session per *request*.
+ *
+ * @param agentId - Agent id parsed from the request; may be empty.
+ * @param chatId - Chat id parsed from the request; may be empty.
+ * @returns "" when `agentId` is empty (no usable key), `agentId` alone when
+ *   `chatId` is empty, otherwise `${agentId}::${chatId}`.
+ */
+export function buildSessionKey(agentId: string, chatId: string): string {
+  if (!agentId) return "";
+  if (!chatId) return agentId;
+  return `${agentId}::${chatId}`;
+}
+
+/**
+ * Pick the prompt to forward to the Claude CLI for this turn.
+ *
+ * Normal turns: the latest user message (timestamp prefix stripped by the
+ * caller) is returned unchanged whenever it is non-empty.
+ *
+ * Bare `/new` turns: OpenClaw sends an empty user message body alongside a
+ * runtime-context system block that asks the agent to greet the user; the
+ * provider rejects an empty user message, so when the bare-reset flag is set
+ * we return a fixed short greeting prompt instead.
+ *
+ * @param latestMessage - Latest user message text; "" when absent or empty.
+ * @param ctx - Request context; only `bareSessionReset` is read.
+ * @returns `latestMessage` if non-empty; otherwise the fixed greeting prompt
+ *   when `ctx.bareSessionReset` is true; otherwise "".
+ */
+export function resolveDispatchPrompt(
+  latestMessage: string,
+  ctx: Pick<RequestContext, "bareSessionReset">,
+): string {
+  // A real user message always wins, even on a reset turn.
+  if (latestMessage) return latestMessage;
+  if (ctx.bareSessionReset) {
+    return "A new session was just started. Greet the user briefly in your configured persona and ask what they'd like to do.";
+  }
+  // Empty message and no reset: nothing to dispatch.
+  return "";
+}
--- a/plugin/web/server.ts
+++ b/plugin/web/server.ts
@@ -1,7 +1,12 @@
 import http from "node:http";
 import { randomUUID } from "node:crypto";
 import type { BridgeInboundRequest } from "../core/types/model.js";
-import { extractLatestUserMessage, extractRequestContext } from "./input-filter.js";
+import {
+  buildSessionKey,
+  extractLatestUserMessage,
+  extractRequestContext,
+  resolveDispatchPrompt,
+} from "./input-filter.js";
 import { buildBootstrap } from "./bootstrap.js";
 import { dispatchToClaude } from "../core/claude/sdk-adapter.js";
 import { dispatchToGemini } from "../core/gemini/sdk-adapter.js";
@@ -10,6 +15,7 @@ import {
  getSession,
  putSession,
  markOrphaned,
+  removeSession,
 } from "../core/contractor/session-map-store.js";

 export type BridgeServerConfig = {
@@ -100,22 +106,39 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
      return;
    }

-    // Extract agent ID and workspace from the system prompt's Runtime line.
-    // OpenClaw does NOT send agent/session info as HTTP headers — it's in the system prompt.
-    const { agentId: parsedAgentId, workspace: parsedWorkspace, skillsBlock, workspaceContextFiles } = extractRequestContext(body);
+    // Extract agent ID, workspace, chat id, and bare-reset signal from the
+    // request. OpenClaw does NOT send agent/session info as HTTP headers — it
+    // lives in the system prompt's Runtime line and the user envelope's
+    // "Conversation info" untrusted-metadata block.
+    const {
+      agentId: parsedAgentId,
+      workspace: parsedWorkspace,
+      skillsBlock,
+      workspaceContextFiles,
+      chatId,
+      bareSessionReset,
+    } = extractRequestContext(body);
    const latestMessage = extractLatestUserMessage(body);

-    if (!latestMessage) {
+    // Pick the prompt to forward to the CLI. For bare /new turns OpenClaw
+    // submits an empty user message — we synthesize a stub prompt instead so
+    // the CLI has something to respond to.
+    const dispatchPrompt = resolveDispatchPrompt(latestMessage, { bareSessionReset });
+
+    if (!dispatchPrompt) {
      sendJson(res, 400, { error: "no user message found" });
      return;
    }

-    // Use agentId as session key — one persistent Claude session per agent (v1).
+    // Scope the CLI session by (agentId, chat_id) so different Discord
+    // channels / DMs / etc. for the same agent don't pile into one Claude
+    // session and bleed context across surfaces. Falls back to agentId-only
+    // when chat_id can't be parsed (local TUI, etc.).
    const agentId = parsedAgentId;
-    const sessionKey = agentId; // stable per-agent key
+    const sessionKey = buildSessionKey(agentId, chatId);

    logger.info(
-      `[contractor-bridge] turn agentId=${agentId} workspace=${parsedWorkspace} msg=${latestMessage.substring(0, 80)}`,
+      `[contractor-bridge] turn agentId=${agentId} sessionKey=${sessionKey} workspace=${parsedWorkspace} bareReset=${bareSessionReset} msg=${dispatchPrompt.substring(0, 80)}`,
    );

    // Resolve workspace: prefer what we parsed from the system prompt (most accurate);
@@ -133,8 +156,18 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
    // Detect backend from body.model: "contractor-gemini-bridge" → Gemini, else → Claude
    const isGemini = typeof body.model === "string" && body.model.includes("gemini");

-    // Look up existing session (shared structure for both Claude and Gemini)
+    // Look up existing session (shared structure for both Claude and Gemini).
+    // On a bare /new or /reset turn we deliberately drop the existing entry so
+    // the CLI starts a fresh session — otherwise --resume would bring back the
+    // very history the user just asked to abandon.
    let existingEntry = sessionKey ? getSession(workspace, sessionKey) : null;
+    if (bareSessionReset && existingEntry && sessionKey) {
+      logger.info(
+        `[contractor-bridge] bare /new detected — dropping prior CLI session sessionKey=${sessionKey} prevClaudeSessionId=${existingEntry.claudeSessionId}`,
+      );
+      removeSession(workspace, sessionKey);
+      existingEntry = null;
+    }
    let resumeSessionId = existingEntry?.state === "active" ? existingEntry.claudeSessionId : null;

    // Bootstrap is passed as the system prompt on every turn (stateless — not persisted in session files).
@@ -160,13 +193,14 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
    const completionId = `chatcmpl-bridge-${randomUUID().slice(0, 8)}`;
    let newSessionId = "";
    let hasError = false;
+    let resultErrorReason: string | null = null;

    const openclawTools = body.tools ?? [];

    try {
      const dispatchIter = isGemini
        ? dispatchToGemini({
-            prompt: latestMessage,
+            prompt: dispatchPrompt,
            systemPrompt,
            workspace,
            agentId,
@@ -176,7 +210,7 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
            bridgeApiKey: apiKey,
          })
        : dispatchToClaude({
-            prompt: latestMessage,
+            prompt: dispatchPrompt,
            systemPrompt,
            workspace,
            agentId,
@@ -192,6 +226,15 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
          sseWrite(res, buildChunk(completionId, event.text));
        } else if (event.type === "done") {
          newSessionId = event.sessionId;
+        } else if (event.type === "result_error") {
+          // CLI returned a terminal error (typically context overflow). The
+          // text was already streamed via prior `text` events; record the
+          // session so we can drop it below and log the reason.
+          logger.warn(
+            `[contractor-bridge] ${isGemini ? "gemini" : "claude"} result_error reason=${event.reason} sessionId=${event.sessionId} message=${event.message.substring(0, 200)}`,
+          );
+          resultErrorReason = event.reason;
+          newSessionId = event.sessionId;
        } else if (event.type === "error") {
          logger.warn(`[contractor-bridge] ${isGemini ? "gemini" : "claude"} error: ${event.message}`);
          hasError = true;
@@ -208,8 +251,20 @@ export function createBridgeServer(config: BridgeServerConfig): http.Server {
    sseWrite(res, "[DONE]");
    res.end();

-    // Persist session mapping (shared for both Claude and Gemini)
-    if (newSessionId && sessionKey && !hasError) {
+    // Session-map persistence:
+    //  - Successful turn → upsert with the latest claudeSessionId so the next
+    //    turn can `--resume` into it.
+    //  - Terminal CLI error (context overflow etc., reported via result_error)
+    //    → drop the entry so the next turn starts fresh instead of resuming
+    //    into the same poisoned session and re-erroring.
+    //  - Stream/transport error before any sessionId was captured → mark the
+    //    prior entry orphaned (existing behavior).
+    if (resultErrorReason && sessionKey) {
+      logger.info(
+        `[contractor-bridge] dropping CLI session after terminal error sessionKey=${sessionKey} reason=${resultErrorReason}`,
+      );
+      removeSession(workspace, sessionKey);
+    } else if (newSessionId && sessionKey && !hasError) {
      const now = new Date().toISOString();
      putSession(workspace, {
        openclawSessionKey: sessionKey,