Files
ContractorAgent/plugin/index.ts
zhi 992f4d8703 fix(bridge): scope CLI sessions per OpenClaw session and reset on /new
The bridge was keying claudeSessionId by agentId alone, so every Discord
channel, DM, and cron run for a single agent shared one Claude CLI
session. Two consequences in the wild:

  - Cross-channel context bleed: 8.7MB session for `developer` mixed
    references from channels 1474327736242798612 and 1498579994044010566
    plus the operator DM all in one --resume thread.
  - `/new` had no effect on the CLI side. OpenClaw rotated its session
    file but the bridge kept --resume-ing the same long-lived
    claudeSessionId, eventually crossing the 1M model context (debug log
    showed `prompt is too long: 1179616 tokens > 1000000 maximum`).

Changes:

  * input-filter: extract `chat_id` from the Conversation-info
    untrusted-metadata block (scanning all messages, since runtimeOnly
    turns put it in the system prompt) and detect bare `/new`/`/reset`
    via the BARE_SESSION_RESET_PROMPT_BASE marker. Add buildSessionKey
    `${agentId}::${chatId}` and resolveDispatchPrompt fallback for the
    empty user message that OpenClaw sends on bare resets.

  * server: use the composite session key for getSession/putSession;
    on bareSessionReset, removeSession before dispatching so the CLI
    starts a fresh session; on a CLI result_error (typically
    prompt_too_long) drop the entry too so the next turn doesn't
    re-resume into the poisoned context.

  * claude/sdk-adapter: surface CLI terminal errors via a new
    `result_error` event (carries reason + sessionId) so the bridge
    can react instead of just streaming the synthetic
    "Prompt is too long" assistant text and silently re-using the
    same session.

  * index: make register() synchronous (OpenClaw rejects an async
    register() with "plugin register must be synchronous"); replace the
    pre-bind port probe with a server-level EADDRINUSE handler.

  * .gitignore: ignore node_modules/ and dist/.
2026-04-28 12:32:37 +00:00

99 lines
4.3 KiB
TypeScript

import fs from "node:fs";
import path from "node:path";
import type { OpenClawPluginApi } from "openclaw/plugin-sdk";
import { normalizePluginConfig } from "./core/types/contractor.js";
import { resolveContractorAgentMetadata } from "./core/contractor/metadata-resolver.js";
import { createBridgeServer } from "./web/server.js";
import { registerCli } from "./commands/register-cli.js";
import type http from "node:http";
// ── GlobalThis state ─────────────────────────────────────────────────────────
// Nothing that must persist is kept in module scope: it is stored on
// globalThis under the keys below so it survives OpenClaw hot-reloads.
// Background: LESSONS_LEARNED.md items 1, 3, 11.

/** Key for the live OpenClaw config accessor (getter fn) shared via globalThis. */
const OPENCLAW_CONFIG_KEY = "_contractorOpenClawConfig";
/** Key under which the bridge server created by register() is stored. */
const SERVER_KEY = "_contractorAgentBridgeServer";
/** Key of the flag recording that the gateway lifecycle has been wired up. */
const LIFECYCLE_KEY = "_contractorAgentLifecycleRegistered";
/** globalThis viewed as a string-keyed bag of unknown values. */
const _G = globalThis as Record<string, unknown>;
// ── Plugin entry ─────────────────────────────────────────────────────────────
export default {
  id: "contractor-agent",
  name: "Contractor Agent",

  /**
   * Plugin entry point invoked by OpenClaw.
   *
   * OpenClaw requires register() to be synchronous — returning a Promise
   * surfaces as `Error: plugin register must be synchronous` and the plugin
   * ends up in `error` state. We avoid `await` here and instead let the
   * bridge server bind asynchronously, handling EADDRINUSE via the server's
   * `error` event when another gateway/CLI process already owns the port.
   *
   * Side effects: publishes `api.config` on globalThis, creates the bridge
   * server at most once per process (guarded by a globalThis flag), hooks
   * `gateway_stop`, and registers the CLI commands.
   *
   * @param api - Plugin API handle supplied by OpenClaw.
   */
  register(api: OpenClawPluginApi): void {
    const config = normalizePluginConfig(api.pluginConfig);

    /**
     * Callback handed to the bridge server to look up an agent's workspace.
     * Re-reads `$HOME/.openclaw/openclaw.json` on each call, so edits to that
     * file are picked up without re-registering the plugin.
     *
     * @param agentId - Id to look up in `agents.list` of openclaw.json.
     * @param _sessionKey - Unused; present to match the callback signature.
     * @returns `{ workspace }` for a known contractor agent, otherwise `null`
     *          (unknown agent, no metadata, or the config could not be read).
     */
    function resolveAgent(agentId: string, _sessionKey: string) {
      try {
        const configPath = path.join(
          process.env.HOME ?? "/root",
          ".openclaw",
          "openclaw.json",
        );
        const raw = JSON.parse(fs.readFileSync(configPath, "utf8")) as {
          agents?: { list?: Array<{ id: string; workspace?: string; model?: string }> };
        };
        const agent = raw.agents?.list?.find((a) => a.id === agentId);
        if (!agent) return null;
        const meta = resolveContractorAgentMetadata(agent, config.permissionMode);
        if (!meta) return null;
        return { workspace: meta.workspace };
      } catch (err: unknown) {
        // Still best-effort (the caller just sees null), but leave a trace so
        // an unreadable/malformed openclaw.json is distinguishable from an
        // agent that simply is not configured.
        const reason = err instanceof Error ? err.message : String(err);
        api.logger.warn(
          `[contractor-agent] failed to resolve agent "${agentId}": ${reason}`,
        );
        return null;
      }
    }

    // ── Gateway lifecycle (start bridge server once per gateway process) ──────
    // Always update the config accessor so hot-reloads get fresh config.
    // server.ts reads this via globalThis to build tool execution context.
    _G[OPENCLAW_CONFIG_KEY] = api.config;

    if (!_G[LIFECYCLE_KEY]) {
      const server = createBridgeServer({
        port: config.bridgePort,
        apiKey: config.bridgeApiKey,
        permissionMode: config.permissionMode,
        resolveAgent,
        logger: api.logger,
      });
      // Only mark the lifecycle as registered once the server object exists.
      // If createBridgeServer throws, the flag stays unset and the next
      // register() call (e.g. after a hot-reload) can try again instead of
      // being locked out for the rest of the process.
      _G[LIFECYCLE_KEY] = true;
      _G[SERVER_KEY] = server;

      // EADDRINUSE → another gateway/CLI process already owns the port; that's
      // fine, we just don't double-bind. Any other error is logged but does
      // not crash registration.
      server.on("error", (err: NodeJS.ErrnoException) => {
        if (err.code === "EADDRINUSE") {
          api.logger.info(
            `[contractor-agent] bridge already running on port ${config.bridgePort}, skipping bind`,
          );
          return;
        }
        api.logger.warn(`[contractor-agent] bridge server error: ${err.message ?? String(err)}`);
      });

      api.on("gateway_stop", () => {
        const s = _G[SERVER_KEY] as http.Server | undefined;
        if (!s) return;
        // Capture the state before close(): if the bind never succeeded
        // (EADDRINUSE), this process did not own the bridge, so claiming we
        // stopped it would be misleading.
        const wasListening = s.listening === true;
        s.close();
        if (wasListening) {
          api.logger.info("[contractor-agent] bridge server stopped");
        }
      });
    }

    // ── CLI ───────────────────────────────────────────────────────────────────
    registerCli(api);
    api.logger.info(`[contractor-agent] plugin registered (bridge port: ${config.bridgePort})`);
  },
};