From 6ae795eea810358cbb1feaf4753bef2a54731b93 Mon Sep 17 00:00:00 2001
From: hzhang <hzhang@hangman-lab.top>
Date: Sat, 11 Apr 2026 21:28:56 +0100
Subject: [PATCH] chore: add .gitignore and untrack internal Claude dev notes

- Add .gitignore covering internal Claude dev notes, .idea, credential files
- Untrack docs/claude/LESSONS_LEARNED.md (internal dev notes, not part of
  the plugin repo); docs/claude/OPENCLAW_PLUGIN_DEV.md is only ignored here

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .gitignore                     |  10 +
 docs/claude/LESSONS_LEARNED.md | 478 ---------------------------------
 2 files changed, 10 insertions(+), 478 deletions(-)
 create mode 100644 .gitignore
 delete mode 100644 docs/claude/LESSONS_LEARNED.md
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..da70c69
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+
+# Internal Claude dev notes — not for repo
+docs/claude/LESSONS_LEARNED.md
+docs/claude/OPENCLAW_PLUGIN_DEV.md
+
+# Credentials and tokens
+CLAUDE_CONTRACTOR_TEST_TOKEN
+
+# IDE
+.idea/
diff --git a/docs/claude/LESSONS_LEARNED.md b/docs/claude/LESSONS_LEARNED.md
deleted file mode 100644
index 38a6fd5..0000000
--- a/docs/claude/LESSONS_LEARNED.md
+++ /dev/null
@@ -1,478 +0,0 @@
-# OpenClaw 插件开发经验教训
-
-> 记录插件开发过程中踩过的坑，供后续迭代参考。最初源自 Dirigent，后续经验来自 Yonexus。
-
----
-
-## 1. OpenClaw 热重载与模块状态
-
-**问题**：OpenClaw 每次热重载（hot-reload）会把插件模块放入新的 VM 隔离上下文，模块级变量全部重置。
-
-```typescript
-// ❌ 错误：热重载后 Map 被清空，turn 状态丢失
-const channelStates = new Map<string, ChannelTurnState>();
-```
-
-**解法**：把需要跨热重载持久化的状态挂在 `globalThis` 上。
-
-```typescript
-// ✅ 正确：globalThis 绑定在 Node.js 进程层面，热重载不影响
-function channelStates(): Map<string, ChannelTurnState> {
-  if (!(_G._tmChannelStates instanceof Map))
-    _G._tmChannelStates = new Map();
-  return _G._tmChannelStates as Map<string, ChannelTurnState>;
-}
-```
-
-**规则**：
-- 业务状态（turn state、speaker list、pending turns）→ `globalThis`
-- 热重载内部的临时变量（局部锁、dedup set）→ `globalThis`（理由同上）
-- 无状态工具函数 → 普通模块变量即可
-
----
-
-## 2. Hook 事件重复触发（Event Deduplication）
-
-**问题**：OpenClaw 热重载会把新的 handler 叠加在旧的 handler 上，同一事件（如 `agent_end`、`before_model_resolve`）被多个 handler 实例处理，导致：
-- Turn 被推进两次
-- Speaker 被重复 suppress
-- Schedule trigger 重复发送
-
-**解法**：用挂在 `globalThis` 上的 `WeakSet`（事件对象）或 `Set`（runId）做去重。
-
-```typescript
-// before_model_resolve：事件对象去重（WeakSet 自动 GC）
-const processed = new WeakSet<object>();
-api.on("before_model_resolve", async (event) => {
-  if (processed.has(event as object)) return;
-  processed.add(event as object);
-  // ...
-});
-
-// agent_end：runId 去重（Set + 上限淘汰）
-const processedRunIds = new Set<string>();
-api.on("agent_end", async (event) => {
-  const runId = (event as any).runId;
-  if (processedRunIds.has(runId)) return;
-  processedRunIds.add(runId);
-  if (processedRunIds.size > 500) {
-    processedRunIds.delete(processedRunIds.values().next().value);
-  }
-  // ...
-});
-```
-
-**规则**：所有 hook handler 必须有去重逻辑，dedup 结构本身也要挂在 `globalThis`。
-
----
-
-## 3. Gateway 生命周期事件与 Agent 会话事件的区别
-
-**问题**：`gateway_start` / `gateway_stop` 是全局事件，只触发一次。但 `register()` 每次热重载都会被调用，导致 `gateway_start` handler 被重复注册，sidecar 被重复启动。
-
-**解法**：用 `globalThis` flag 保证只注册一次。
-
-```typescript
-const _G = globalThis as Record<string, unknown>;
-const LIFECYCLE_KEY = "_dirigentGatewayLifecycleRegistered";
-
-if (!_G[LIFECYCLE_KEY]) {
-  _G[LIFECYCLE_KEY] = true;
-  startSideCar(...);
-  api.on("gateway_stop", () => stopSideCar(...));
-}
-```
-
-**规则**：
-- `gateway_start` / `gateway_stop` handler → `globalThis` flag 保护
-- `before_model_resolve` / `agent_end` / `message_received` → 每次 `register()` 都注册，但靠 event dedup 防止重复处理
-
----
-
-## 4. ChannelStore 文件缓存陷阱
-
-**问题**：`ChannelStore` 懒加载文件（第一次读后设 `loaded=true` 不再重读）。如果在 gateway 运行期间直接编辑 `dirigent-channels.json`，已存在的 `ChannelStore` 实例不会感知变化，`getMode()` 对新增 channel 返回 `"none"`，导致 turn management 完全失效（before_model_resolve 看到 `mode === "none"` 直接 return，不做任何 suppress）。
-
-**现象**：新 channel 里所有 agent 同时响应，日志里没有任何 `before_model_resolve` 的 suppressing 或 anchor set 日志。
-
-**解法（当前）**：编辑 `dirigent-channels.json` 后必须 `openclaw gateway restart`。
-
-**更好的长期方案**：`ChannelStore` 应该在 `setMode()`/`setLockedMode()` 时通知所有实例，或改用 `fs.watch()` 监听文件变化，或每次 `getMode()` 都从文件读（对 read 频率低的场景可以接受）。
-
----
-
-## 5. Discord 权限 Overwrite 的 type 字段
-
-**问题**：设置 channel permission overwrite 时，`type` 字段含义：
-- `type: 0` → 针对 **role**（角色）
-- `type: 1` → 针对 **member**（成员/用户）
-
-将 bot 用户 ID 作为 member overwrite 时必须用 `type: 1`，用 `type: 0` 会返回错误或静默失败（Discord 会把 ID 当 role 处理）。
-
-```typescript
-// ✅ 正确
-{ id: botUserId, type: 1, allow: "68608", deny: "0" }
-```
-
-**常用 permission bitmask**：
-- VIEW_CHANNEL = 1024 (1 << 10)
-- SEND_MESSAGES = 2048 (1 << 11)
-- READ_MESSAGE_HISTORY = 65536 (1 << 16)
-- 三者合计 = 68608
-
----
-
-## 6. AgentTool 的 execute API（非 handler）
-
-**问题**：OpenClaw Plugin SDK 要求 tool 使用 `execute: async (toolCallId, params) => {}` 接口，不是 `handler:`。如果需要 `ctx.agentId`，要使用工厂函数形式。
-
-```typescript
-// ✅ 正确
-api.registerTool({
-  name: "my-tool",
-  // ...schema...
-  execute: async (toolCallId, params) => {
-    // toolCallId 是 string，params 是入参对象
-    return { result: "ok" };
-  },
-});
-
-// ✅ 需要 agentId 时
-api.registerTool((ctx) => ({
-  name: "my-tool",
-  execute: async (toolCallId, params) => {
-    const agentId = ctx.agentId;
-    // ...
-  },
-}));
-```
-
----
-
-## 7. Sidecar 锁文件防重复启动
-
-**问题**：gateway 重启或热重载时 `startSideCar()` 可能被多次调用，导致多个 sidecar 进程竞争同一端口。
-
-**解法**：写 lock 文件（`/tmp/dirigent-sidecar.lock`），启动前检查文件是否存在且对应进程仍在运行。
-
-```typescript
-const lockFile = "/tmp/dirigent-sidecar.lock";
-if (fs.existsSync(lockFile)) {
-  const pid = Number(fs.readFileSync(lockFile, "utf8").trim());
-  if (isProcessAlive(pid)) {
-    logger.info("sidecar already running, skipping");
-    return;
-  }
-}
-// 启动 sidecar，写 lock file
-```
-
----
-
-## 8. 并发 advanceSpeaker 竞争
-
-**问题**：两个 VM 上下文的 `agent_end` handler 可能同时执行，两者都通过了 runId 去重（runId 不同），都调用 `advanceSpeaker`，导致 speaker index 被推进两次。
-
-**解法**：在 `advanceSpeaker` 入口加 per-channel 锁（`Set<string>` 挂在 `globalThis`）。
-
-```typescript
-if (advancingChannels.has(channelId)) return;  // 已有并发调用，跳过
-advancingChannels.add(channelId);
-try {
-  await advanceSpeaker(...);
-} finally {
-  advancingChannels.delete(channelId);
-}
-```
-
----
-
-## 9. isTurnPending 的生命周期边界
-
-**问题**：`clearTurnPending` 的位置影响正确性：
-- 太早（在 `advanceSpeaker` 前清除）→ 下一个 wakeup 可能被误判为合法 turn，在 cycle boundary 期间 index 尚未更新导致 speaker 错误
-- 太晚无问题，但在 `pollForTailMatch` 期间必须保持 `isTurnPending=true`，否则 re-trigger 会被当作合法 turn 重入
-
-**正确位置**：`advanceSpeaker` 完成后、`triggerNextSpeaker` 前。
-
----
-
-## 10. Discord Gateway 重连后的消息丢失
-
-**问题**：Gateway 重启后，bot 重新连接 Discord WS 有延迟（10–30s）。如果在 bot 完全连接前就发送 schedule trigger（`<@bot_id>➡️`），bot 会错过该消息（WS 不推送历史消息）。
-
-**现象**：发送了 trigger，channel 里能看到消息，但 bot 没有响应。
-
-**解法**：
-1. Gateway 重启后等待所有 bot 的 `discord client initialized` 日志出现再发种子消息
-2. 或手动补发 trigger
-
-**长期方案**：sidecar 可以暴露一个 `/status` 接口，等待所有 Discord 账号连接就绪后再允许外部发消息。
-
----
-
-## 11. 连接型插件的热重载陷阱（Yonexus）
-
-**问题**：Yonexus.Client / Yonexus.Server 是"连接型插件"——插件本身管理一条持久 WebSocket 连接（或监听端口）。如果用模块级变量做启动防重复保护：
-
-```typescript
-// ❌ 错误：热重载后新 VM 上下文重置，_started = false → 第二个 runtime 被创建
-let _started = false;
-export function createPlugin(api) {
-  if (_started) return;
-  _started = true;
-  const runtime = createRuntime(...);
-  runtime.start();
-}
-```
-
-热重载后：
-- **服务端**：第二个 runtime 尝试 bind 同一端口 → EADDRINUSE → `runtime.start()` 抛出 → 被 `.catch` 静默吞掉，但 `globalThis.__yonexusServer` 已被覆盖为指向新的（未启动的）transport → `sendRule()` 永远返回 false
-- **客户端**：第二个 runtime 成功建立了新的 WebSocket 连接，与旧连接并存，产生重复认证
-
-**解法**：
-```typescript
-// ✅ 正确：用 globalThis 保护，热重载后新 VM 上下文也能看到 flag
-const _G = globalThis as Record<string, unknown>;
-const STARTED_KEY = "_yonexusClientStarted";
-
-export function createPlugin(api) {
-  if (_G[STARTED_KEY]) {
-    // 热重载时更新 __yonexusClient 指向仍在运行的旧 runtime（存在 globalThis 上）
-    // 无需重新启动
-    return;
-  }
-  _G[STARTED_KEY] = true;
-  // ... 创建并启动 runtime
-}
-```
-
-如果需要让热重载后新注册的 hook/rule 生效，还需把 `ruleRegistry`、`onXxxCallbacks` 等也存到 `globalThis`，而不是在函数体内每次新建。
-
-**规则**：
-- 任何管理持久连接/监听端口的插件，其启动 flag 必须放 `globalThis`
-- 相关的 registry、回调数组也应放 `globalThis`，否则热重载后 `__pluginId` API 对象被覆盖，旧 runtime 的回调数组失去引用
-
----
-
-## 12. WebSocket 服务端 Transport 的消息路由竞态（Yonexus）
-
-**问题**：Server transport 在 `ws.on("message")` 里通过 identifier 查 `_connections` 得到 `ClientConnection`：
-
-```typescript
-// ❌ 危险：当 ws_new 还在 tempConnections，但 _connections["test-client"] 指向即将关闭的 ws_old 时
-const connection = identifier ? this._connections.get(identifier) ?? tempConn : tempConn;
-```
-
-**场景**：
-1. `ws_old`（外部测试脚本）已认证，`_connections["test-client"] = ws_old`
-2. `ws_new`（插件重连）发 hello → 进入 tempConnections，assignedIdentifier = "test-client"
-3. 插件发 `auth_request` → message handler 查 `_connections.get("test-client")` → 返回 ws_old
-4. `promoteToAuthenticated("test-client", ws_old)` → ws_old 不在 tempConnections → 返回 false
-5. `onClientAuthenticated` 仍然触发 → `_connections.get("test-client")` = ws_old（已关闭）→ `sendRule` 返回 false
-
-**解法**：消息路由时，如果发送方 `ws` 仍在 `tempConnections`，直接用 `tempConn`（持有正确 ws 引用的本地对象），**不再** fallback 到 `_connections`：
-
-```typescript
-// ✅ 正确：按 ws 引用路由，不按 identifier 路由
-if (this.tempConnections.has(ws)) {
-  this.options.onMessage(tempConn, message);
-  return;
-}
-// ws 已 promote，从 _connections 中找
-let connection = tempConn;
-for (const [, conn] of this._connections) {
-  if (conn.ws === ws) { connection = conn; break; }
-}
-this.options.onMessage(connection, message);
-```
-
-**附加修复**：`promoteToAuthenticated` 的返回值不应被忽略。只有 promote 成功时才触发 `onClientAuthenticated`：
-
-```typescript
-const promoted = transport.promoteToAuthenticated(identifier, connection.ws);
-if (promoted) {
-  options.onClientAuthenticated?.(identifier);
-}
-```
-
-**规则**：WebSocket 服务端的消息路由应始终以**发送方的 ws 对象引用**为准，不以 identifier 查映射表。identifier 可能在 tempConnections 和 _connections 之间的过渡期产生歧义。
-
----
-
-## 13. 服务端 Session 竞态 → 客户端 re-hello 恢复（Yonexus）
-
-**问题**：服务端在已认证连接关闭时（`onDisconnect`）删除对应的 session。如果另一个客户端连接（同 identifier）的 `auth_request` 恰好在 session 被删除之后到达，服务端返回 `auth_failed("not_paired")`，即使客户端持有有效 secret。
-
-**场景**：
-1. 测试脚本 ws_1 已认证 → session["test-client"] 存在
-2. 插件 ws_2 发送 hello → session["test-client"] 被覆写（socket = ws_2）
-3. 测试脚本 ws_1 关闭 → `handleDisconnect("test-client")` → `sessions.delete("test-client")`
-4. 插件 ws_2 发 `auth_request` → session 不存在 → `auth_failed("not_paired")`
-5. 插件有 secret，但 `auth_required` 状态没有 re-hello 逻辑 → 永远卡住
-
-**解法**：客户端收到 `auth_failed("not_paired")` 且持有有效 secret 时，重新发送 hello 以在服务端创建新 session，然后重试认证：
-
-```typescript
-if (payload.reason === "not_paired" && hasClientSecret(this.clientState)) {
-  this.sendHello();   // 重建 session，触发 hello_ack("auth_required") → sendAuthRequest()
-  return;
-}
-```
-
-**规则**：客户端凡是遇到"自己有凭据但服务端找不到 session"的错误，都应尝试重走 hello 流程，而不是直接进入 `auth_required` 等待用户干预。
-
----
-
-## 14. ContractorAgent 开发经验
-
-### 14.1 `openclaw.json` runtime.type 只允许 "embedded" 或 "acp"
-
-**问题**：写入自定义 `runtime.type: "contractor"` 到 `openclaw.json` 会导致 gateway 启动时 schema 校验失败：
-
-```
-agents.list.N.runtime: Invalid input (allowed: "embedded", "acp")
-```
-
-**解法**：用 `model` 字段识别 contractor agent（model 为 `"contractor-agent/contractor-claude-bridge"`），不写 custom runtime type。
-
----
-
-### 14.2 `openclaw gateway restart` 不一定真正重启
-
-**问题**：`openclaw gateway restart` 命令输出 "Restarted systemd service" 但有时不实际重启（PID 不变）。这会导致部署新插件代码后仍然运行旧版本。
-
-**解法**：用 `systemctl --user restart openclaw-gateway.service` 替代。可用 `ss -tlnp | grep <port>` 或 `journalctl` 确认 PID 是否变化。
-
----
-
-### 14.3 `claude -p --output-format stream-json` 需要 `--verbose`
-
-**问题**：`claude -p --output-format stream-json` 没有 `--verbose` 时报错：
-
-```
-Error: When using --print, --output-format=stream-json requires --verbose
-```
-
-**解法**：始终加 `--verbose` 标志。
-
----
-
-### 14.4 async plugin register 的问题
-
-**问题**：OpenClaw 不等待 `register()` 返回的 promise：
-
-```
-[plugins] plugin register returned a promise; async registration is ignored (plugin=contractor-agent)
-```
-
-实际上，`register()` 里 `await` 之后的代码（如 `isPortFree` check + `createBridgeServer`）仍然异步执行，但 OpenClaw 不等待它们完成。Bridge server 的 `listen` 回调是异步的，所以 `[contractor-bridge] sidecar listening` 日志出现在 `[contractor-agent] plugin registered` 之后——这是正常的。关键是不要把 CLI 注册（`registerCli`）放在 `await` 后面，否则可能因为竞态而失效。
-
-**规则**：把 `registerCli(api)` 和其他同步注册调用移到 `await` 之前，或确认它们在 async 流程中最终被执行。
-
----
-
-### 14.5 `ctx.program.command()` vs 导入 Commander（版本冲突）
-
-**问题**：在插件里 `import { Command } from "commander"` 后用 `ctx.program.addCommand(new Command(...))` 会导致：
-
-```
-TypeError: subCommand._prepareForParse is not a function
-```
-
-原因：插件的 `commander` 包和 OpenClaw 内部的 `commander` 包版本不同，`Command` 实例不兼容。
-
-**解法**：只用 `ctx.program.command("subcommand")` 链式调用，不导入 Commander，不创建 `new Command()`。依赖列表也不需要 `commander`。
-
----
-
-### 14.6 Node.js `Readable` push 模式的 async iteration 问题
-
-**问题**：用 `new Readable({ read() {} })` 作为 claude 输出的行缓冲中介，然后 `for await (const raw of lines)` 读取——在 gateway 环境中可靠性存疑（本地测试失败，可能是 push 时序或 Buffer 编码问题）。
-
-**解法**：用 `readline.createInterface({ input: child.stdout })` + 事件驱动的 Promise 队列。`readline` 的行分割行为经过 Node.js 官方测试，比手动 `split("\n")` + Readable 更可靠。
-
----
-
-### 14.7 `--mcp-config <configs...>` 是变参——不能把 prompt 放在它后面
-
-**问题**：`claude -p ... --mcp-config /tmp/mcp.json "You are operating..."` 会把 prompt 当作第二个 MCP config 文件，报错：
-
-```
-Error: Invalid MCP configuration:
-MCP config file not found: /tmp/workspace/You are operating as a contractor agent...
-```
-
-原因：`--mcp-config` 接受多个值（`<configs...>`），贪婪消费后续所有非 flag 参数，把 prompt 当成了第二个路径。
-
-**解法**：把 prompt 放在 `-p` 的紧接下一个位置（所有 flag 之前），`--mcp-config` 放在最后：
-
-```typescript
-const args = [
-  "-p",
-  prompt,          // ← 紧跟 -p，在所有 flag 之前
-  "--output-format", "stream-json",
-  "--verbose",
-  ...
-  "--mcp-config", mcpConfigPath,  // ← 放在最后，不影响 prompt
-];
-```
-
----
-
-### 14.8 OpenClaw 不会自动向 agent 发送工具列表——需要配置 `tools.profile`
-
-**问题**：contractor-e2e agent 的请求 body 里 `tools: []`——OpenClaw 没有把 `contractor_echo` 等工具发给 bridge，导致 MCP proxy 无工具可暴露。
-
-原因：OpenClaw 只对配置了工具 profile 的 agent 才在请求中携带工具定义。
-
-**解法**：在 `openclaw.json` 里给 contractor agent 加 `"tools": { "profile": "full" }`。在代码里，`markAgentAsClaudeContractor()` 负责在创建时自动写入这个字段。
-
----
-
-### 14.9 Claude Code 有自己的身份系统提示，与 SOUL.md 冲突轻微
-
-**情况**：Claude Code 的内置 system prompt 第一句是 `"You are Claude Code, Anthropic's official CLI for Claude."`。当 contractor bridge 通过 `-p` 传入 bootstrap（user turn），并让 Claude 读取 SOUL.md 时，存在身份层次冲突：
-
-```
-[SYSTEM] You are Claude Code...          ← Claude Code 内置
-[USER]   You are a contractor agent...   ← 我们的 bootstrap
-         Please embody SOUL.md...
-```
-
-**结论**：冲突轻微且可绕过。实测中 SOUL.md 指令能有效覆盖 Claude Code 的弱身份声明（agent 自称 "Rook ♜" 而非 "Claude Code"）。原因是 Claude Code 的身份声明较通用，user-turn 的 bootstrap + SOUL.md Read 优先级足够。
-
-**更干净的方案**（未实现）：用 `--append-system-prompt` 把 contractor 身份推到 system 层：
-```typescript
-args.push("--append-system-prompt",
-  `You are operating as contractor agent (${agentId}) in OpenClaw. Embody SOUL.md.`
-);
-```
-
-**规则**：
-- 不要在 contractor workspace 创建 `CLAUDE.md`（Claude Code 会自动注入，与 SOUL.md 产生更强冲突）
-- SOUL.md/IDENTITY.md 通过 bootstrap 的 Read 指令机制生效即可
-- Claude Code auto-memory（`~/.claude/projects/.../memory/`）和 OpenClaw memory（`workspace/memory/`）路径不同，并行运行互不干扰
-
----
-
-### 14.10 OpenClaw 工具执行：MCP proxy 的"全局注册表"模式
-
-**问题**：bridge 运行在 OpenClaw gateway 进程内，但 OpenClaw plugin SDK 没有暴露 `callTool(name, args)` API。要从 bridge 的 `/mcp/execute` 端点调用其他插件注册的工具，没有官方途径。
-
-**解法**：用 `globalThis` 共享注册表：
-
-```typescript
-// 在 tool-test/index.ts（或任意插件）
-const _G = globalThis as Record<string, unknown>;
-if (!(_G["_contractorToolHandlers"] instanceof Map)) {
-  _G["_contractorToolHandlers"] = new Map<string, (args: unknown) => Promise<string>>();
-}
-(_G["_contractorToolHandlers"] as Map<..., ...>).set("contractor_echo", async (args) => {
-  return `[contractor_echo] ${(args as any).message}`;
-});
-```
-
-bridge 的 `/mcp/execute` 端点从 `globalThis._contractorToolHandlers` 查找并调用处理函数。
-
-**限制**：工具实现需要主动"双注册"（OpenClaw 的 `api.registerTool()` + globalThis 注册表）。不支持远程工具（运行在其他进程的工具）。完整方案待后续研究。