Files
Fabric.OpenclawPlugin/src/inbound.ts
hzhang c5a33c33ec feat(triage): per-channel serial queue + HF on_call gate + observer skip
Three behavioral changes to inbound message handling to support the
new triage flow:

## 1. Per-channel serial queue

Replaces `void this.dispatch(...)` (fire-and-forget) with a per-channel
chain so consecutive messages on the same channel are processed strictly
in order — no concurrent model turns for the same channel. Other
channels remain independent (parallelism preserved across channels).

Implementation: `Map<channelId, Promise>` where each new task awaits
the previous. The map entry self-cleans when the chain settles AND
no newer task has overwritten it.

## 2. HF on_call gate (triage + wake=true only)

Before dispatching a triage wake to the on-duty agent, hit HF
`GET /calendar/agent/status?agent_id=...`. If the agent isn't
currently on_call, the message is pushed to a per-agent gated queue
instead of dispatched — no model turn fires.

Status check is cached for 5s to amortise across rapid triage bursts.

When a subsequent triage message arrives and the agent IS on_call by
that point, the gated queue drains FIFO (re-enqueued through the same
per-channel chain so order is kept) before the new message dispatches.

Drained queue is in-memory only; on gateway restart the underlying
Fabric messages get re-fetched via the connect-time history sweep.

## 3. Triage observer skip (wake=false)

Triage messages that arrive with wakeup=false are admin observers — by
spec they MUST NOT enter the agent's session history. Skipped entirely
(no recordInboundSession call). The next time this agent legitimately
wakes for triage, their context contains only past wakeups + their own
outgoing messages — no observer-side chatter from other agents.

For NON-triage channels the legacy "record-as-history" stays — those
keep their full channel conversation available for later wakes.

## Env

- HF_API_BASE_URL  — defaults `https://monitor.hangman-lab.top`
- HF_CLAW_IDENTIFIER — defaults to `os.hostname()`

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 22:17:39 +01:00

576 lines
25 KiB
TypeScript

import { promises as fs } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { io, type Socket } from 'socket.io-client';
import { dispatchInboundReplyWithBase } from 'openclaw/plugin-sdk/inbound-reply-dispatch';
import type { FabricClient, FabricSession } from './fabric-client.js';
import type { IdentityRegistry } from './identity.js';
import { resolveCoalesce } from './accounts.js';
import { enqueueDelivery, flushFabricForChannel } from './coalesce.js';
// COMPAT NOTE (openclaw v2026.5.7): the inbound path mirrors how bundled
// channels (nextcloud-talk) drive the kernel:
// core = PluginRuntime (from setRuntime)
// route = core.channel.routing.resolveAgentRoute(...)
// ctx = core.channel.reply.finalizeInboundContext(...) // has SessionKey
// dispatch= dispatchInboundReplyWithBase({ cfg, route, ctxPayload, core, deliver })
// `core.channel.*` is accessed loosely so unrelated SDK drift won't break us.
type Core = {
channel: {
routing: { resolveAgentRoute: (p: unknown) => { agentId: string; sessionKey: string; accountId?: string } };
session: {
resolveStorePath: (store: unknown, o: { agentId: string }) => string;
recordInboundSession: (p: {
storePath: string;
sessionKey: string;
ctx: unknown;
createIfMissing?: boolean;
onRecordError: (e: unknown) => void;
}) => Promise<unknown>;
};
reply: { finalizeInboundContext: (p: Record<string, unknown>) => unknown };
};
};
type Logger = { info: (m: string) => void; warn: (m: string) => void; error?: (m: string) => void };
type FabricAttachment = { url: string; name?: string; mimeType?: string };
type FabricMessage = {
messageId: string;
seq: number;
content: string;
authorUserId?: string;
createdAt?: string;
channelId?: string;
attachments?: FabricAttachment[];
wakeup?: boolean;
// x-type of the channel (sent on message.created). 'dm' bypasses the
// wakeup gate: any message that isn't the agent's own is delivered.
xType?: string;
};
export class FabricInbound {
private sockets: Socket[] = [];
private seen = new Set<string>();
// Timers that periodically re-sync channel membership per (agent, guild).
// Without this, the agent's socket.io subscriptions are a snapshot taken
// at connect time — any channel the agent joins later (e.g. a fresh DM
// created by another user) is unreachable until the gateway restarts.
private channelSyncTimers: NodeJS.Timeout[] = [];
// Resync cadence. Backend doesn't push a `channel.joined` event, so we
// poll. 60s keeps the lag bounded without hammering the backend.
private static readonly CHANNEL_SYNC_INTERVAL_MS = 60_000;
// Guild access tokens are short-lived (~15 min). The socket survives via
// socket.io reconnect, but the token captured at connect time goes stale,
// so HTTP calls (attachment download, posting the reply) start 401ing.
// Re-login per agent on a short TTL to keep a fresh token.
private tokenCache = new Map<string, { session: FabricSession; at: number }>();
private static readonly TOKEN_TTL_MS = 8 * 60 * 1000;
// Per-channel serial work queue. Every inbound socket message for a
// channel awaits the previous task for that same channel, so model
// turns never interleave. Map key = channelId; value is the tail of
// the chain (an in-flight promise the next task awaits).
//
// Why per-channel and not per-agent: a single agent may sit in
// several triage / general channels; we want each channel to flow at
// its own speed but the SAME channel's traffic to be strictly serial.
// For dm and discuss the queue also serialises but those traditionally
// had at-most-one-in-flight anyway via the turn engine.
private channelChains = new Map<string, Promise<void>>();
// Agent.status snapshot cache (5s TTL) — keeps the HF /calendar/
// agent/status round-trip off the hot path for back-to-back triage
// messages. Short TTL because status flips are rare-but-meaningful.
private agentStatusCache = new Map<string, { onCall: boolean; at: number }>();
private static readonly AGENT_STATUS_TTL_MS = 5_000;
// Triage messages that arrived while the on-duty agent wasn't on_call
// — sit here until either (a) the agent becomes on_call and the next
// triage arrival drains them, or (b) the gateway restarts (lost; ok
// because the underlying Fabric messages are persisted and re-fetched
// on agent reconnect's history sweep).
private pendingTriageGated: Array<{
agentId: string;
g: { nodeId: string; endpoint: string };
channelId: string;
m: FabricMessage;
session: FabricSession;
}> = [];
// Schedule `task` to run after every previous task on the same
// channel has completed. Returns the promise so callers can await
// their own result if they need to; the chain itself is fire-and-
// forget from the socket.on handler.
private enqueueChannelTask(channelId: string, task: () => Promise<void>): Promise<void> {
const prev = this.channelChains.get(channelId) ?? Promise.resolve();
const next = prev.then(task).catch((err) => {
this.log.warn(`fabric: per-channel task failed channel=${channelId}: ${String(err)}`);
});
this.channelChains.set(channelId, next);
// Best-effort cleanup so the Map doesn't grow without bound for
// long-running gateways: drop the entry when the chain settles, but
// only if it's still the latest reference (newer enqueue may have
// overwritten it in the meantime).
void next.finally(() => {
if (this.channelChains.get(channelId) === next) {
this.channelChains.delete(channelId);
}
});
return next;
}
// Hit HF backend to check whether `agentId` is currently on_call.
// Cached for 5s. Failures (network, 404, etc.) are treated as "not
// on_call" — triage stays gated rather than risking a confused wake.
private async checkAgentOnCall(agentId: string): Promise<boolean> {
const cached = this.agentStatusCache.get(agentId);
if (cached && Date.now() - cached.at < FabricInbound.AGENT_STATUS_TTL_MS) {
return cached.onCall;
}
const base = (process.env.HF_API_BASE_URL ?? '').trim() || 'https://monitor.hangman-lab.top';
const claw = (process.env.HF_CLAW_IDENTIFIER ?? '').trim() || (await import('os')).hostname();
let onCall = false;
try {
const url = `${base.replace(/\/$/, '')}/calendar/agent/status?agent_id=${encodeURIComponent(agentId)}`;
const res = await fetch(url, {
headers: { 'X-Agent-ID': agentId, 'X-Claw-Identifier': claw },
});
if (res.ok) {
const data = (await res.json()) as { status?: string };
onCall = (data.status ?? '').toLowerCase() === 'on_call';
}
} catch (err) {
this.log.warn(`fabric: HF status check failed agent=${agentId}: ${String(err)}`);
}
this.agentStatusCache.set(agentId, { onCall, at: Date.now() });
return onCall;
}
// FIFO drain of all triage-gated messages for `agentId` (called when
// we just learned they're on_call). Each drained message is dispatched
// through its own channel chain so per-channel serial order is kept.
private async drainGatedFor(agentId: string): Promise<void> {
const keep: typeof this.pendingTriageGated = [];
const drain: typeof this.pendingTriageGated = [];
for (const item of this.pendingTriageGated) {
if (item.agentId === agentId) drain.push(item);
else keep.push(item);
}
if (drain.length === 0) return;
this.pendingTriageGated = keep;
for (const item of drain) {
this.log.info(
`fabric: triage drain agent=${item.agentId} channel=${item.channelId} msg=${item.m.messageId}`,
);
// Re-enqueue via the per-channel chain so ordering is preserved.
this.enqueueChannelTask(item.channelId, async () => {
await this.dispatch(item.agentId, item.g, item.channelId, item.m, item.session);
});
}
}
// Return a fresh guild access token for the agent, re-authenticating with
// the agent's Fabric API key when the cached session is stale. Falls back
// to the connect-time session token if re-login fails.
private async freshGuildToken(
agentId: string,
guildNodeId: string,
fallback: FabricSession,
): Promise<string | undefined> {
const pick = (s: FabricSession) =>
s.guildAccessTokens.find((t) => t.guildNodeId === guildNodeId)?.token;
const now = Date.now();
const cached = this.tokenCache.get(agentId);
if (cached && now - cached.at < FabricInbound.TOKEN_TTL_MS) {
return pick(cached.session) ?? pick(fallback);
}
const apiKey = this.identity.findByAgentId(agentId)?.fabricApiKey;
if (apiKey) {
try {
const s = await this.client.agentLogin(apiKey);
this.tokenCache.set(agentId, { session: s, at: now });
return pick(s) ?? pick(fallback);
} catch (err) {
this.log.warn(`fabric: token refresh failed agent=${agentId}: ${String(err)}`);
}
}
return pick(fallback);
}
constructor(
private readonly core: unknown, // PluginRuntime
private readonly cfg: unknown, // OpenClawConfig
private readonly client: FabricClient,
private readonly identity: IdentityRegistry,
private readonly log: Logger,
private readonly accounts: Array<{ agentId: string; fabricApiKey: string }> = [],
) {}
async start(): Promise<void> {
const entries =
this.accounts.length > 0
? this.accounts.map((a) => ({ agentId: a.agentId, fabricApiKey: a.fabricApiKey }))
: this.identity.list();
for (const entry of entries) {
try {
const session = await this.client.agentLogin(entry.fabricApiKey);
this.identity.upsert({
agentId: entry.agentId,
fabricApiKey: entry.fabricApiKey,
fabricUserId: session.user.id,
displayName: session.user.name,
});
await this.connectAgent(entry.agentId, session);
this.log.info(`fabric: agent ${entry.agentId} connected as ${session.user.email}`);
} catch (err) {
this.log.warn(`fabric: agent ${entry.agentId} connect failed: ${String(err)}`);
}
}
}
stop(): void {
for (const t of this.channelSyncTimers) clearInterval(t);
this.channelSyncTimers = [];
for (const s of this.sockets) s.disconnect();
this.sockets = [];
}
private async connectAgent(agentId: string, session: FabricSession): Promise<void> {
const selfUserId = session.user.id;
for (const g of session.guilds) {
const tok = session.guildAccessTokens.find((t) => t.guildNodeId === g.nodeId)?.token;
if (!tok) continue;
const socket = io(`${g.endpoint}/realtime`, {
transports: ['websocket'],
auth: { token: tok },
autoConnect: false,
});
// Tracked socket.io rooms for this (agent, guild). The initial fetch
// on `connect` seeds it; the periodic resync diffs against it so we
// only emit `join_channel` for genuinely new channels (and
// `leave_channel` for ones the agent is no longer in).
const joined = new Set<string>();
const syncChannels = async (kind: 'initial' | 'resync') => {
let freshTok: string | undefined;
try {
freshTok = await this.freshGuildToken(agentId, g.nodeId, session);
} catch {
freshTok = tok;
}
const authTok = freshTok ?? tok;
try {
const res = await fetch(
`${g.endpoint}/api/channels?guildId=${encodeURIComponent(g.nodeId)}`,
{ headers: { authorization: `Bearer ${authTok}` } },
);
if (!res.ok) return;
const channels = (await res.json()) as Array<{ id: string }>;
const current = new Set(channels.map((c) => c.id));
let added = 0;
let removed = 0;
for (const id of current) {
if (!joined.has(id)) {
socket.emit('join_channel', { channelId: id });
joined.add(id);
added++;
}
}
for (const id of [...joined]) {
if (!current.has(id)) {
socket.emit('leave_channel', { channelId: id });
joined.delete(id);
removed++;
}
}
if (kind === 'initial') {
this.log.info(
`fabric: agent ${agentId} joined ${current.size} channel(s) on ${g.nodeId}`,
);
} else if (added > 0 || removed > 0) {
this.log.info(
`fabric: agent ${agentId} channel resync on ${g.nodeId}: +${added} -${removed} (now ${joined.size})`,
);
}
} catch {
/* best effort — next tick will retry */
}
};
socket.on('connect', () => {
// On every (re)connect the server forgets prior subscriptions, so
// reset our local view and seed from a fresh fetch.
joined.clear();
void syncChannels('initial');
});
// Push-based membership events from the backend (companion to
// Fabric.Backend.Guild's RealtimeGateway.emitToUser). When the
// server tells us this user was added to / removed from a
// channel, we sub/unsub the socket.io room immediately — no
// 60s wait for the polling resync. Polling remains as a safety
// net for missed events.
socket.on('channel.joined', (evt: { channelId?: string }) => {
const id = evt?.channelId;
if (!id || joined.has(id)) return;
socket.emit('join_channel', { channelId: id });
joined.add(id);
this.log.info(`fabric: agent ${agentId} channel.joined push on ${g.nodeId}: ${id} (now ${joined.size})`);
});
socket.on('channel.left', (evt: { channelId?: string }) => {
const id = evt?.channelId;
if (!id || !joined.has(id)) return;
socket.emit('leave_channel', { channelId: id });
joined.delete(id);
this.log.info(`fabric: agent ${agentId} channel.left push on ${g.nodeId}: ${id} (now ${joined.size})`);
});
const syncTimer = setInterval(
() => void syncChannels('resync'),
FabricInbound.CHANNEL_SYNC_INTERVAL_MS,
);
this.channelSyncTimers.push(syncTimer);
socket.on('message.created', (m: FabricMessage) => {
const channelId = m.channelId ?? '';
if (!channelId) return;
if (m.authorUserId && m.authorUserId === selfUserId) return;
const key = `${agentId}:${m.messageId}`;
if (this.seen.has(key)) return;
this.seen.add(key);
if (this.seen.size > 5000) this.seen.clear();
// Per-channel serial queue. Prevents concurrent model turns for
// the same channel — important for triage where a second wake
// arriving mid-reply would interleave with the in-flight one.
this.enqueueChannelTask(channelId, async () => {
// Triage on_call gate: if the on-duty agent isn't currently
// on_call per HF, don't dispatch yet — just sit on the
// per-channel queue. Subsequent triage messages will recheck;
// when the agent becomes on_call, the next arrival drains.
//
// Also handles: triage + wake=true must verify status before
// committing to a model turn. Non-triage and triage observer
// (wake=false) skip the gate.
if (m.xType === 'triage' && m.wakeup === true) {
const onCall = await this.checkAgentOnCall(agentId);
if (!onCall) {
this.log.info(
`fabric: triage wake gated (agent=${agentId} not on_call) — re-queue msg=${m.messageId}`,
);
this.pendingTriageGated.push({ agentId, g, channelId, m, session });
return;
}
// Drain any previously-gated messages (FIFO) before this one,
// now that we know the agent is on_call.
await this.drainGatedFor(agentId);
}
await this.dispatch(agentId, g, channelId, m, session);
});
});
socket.connect();
this.sockets.push(socket);
}
}
// Download a message's attachments to a temp dir using the agent's guild
// token; returns local paths/types/urls for the inbound media context.
private async fetchAttachments(
agentId: string,
endpoint: string,
token: string | undefined,
m: FabricMessage,
): Promise<{ paths: string[]; types: string[]; urls: string[] }> {
const out = { paths: [] as string[], types: [] as string[], urls: [] as string[] };
const list = m.attachments ?? [];
if (!list.length || !token) return out;
const dir = join(tmpdir(), `fabric-media-${agentId}-${m.messageId}`.replace(/[^\w.-]/g, '_'));
try {
await fs.mkdir(dir, { recursive: true });
} catch {
return out;
}
let i = 0;
for (const a of list) {
try {
const abs = a.url.startsWith('http') ? a.url : `${endpoint}${a.url}`;
const res = await fetch(abs, { headers: { authorization: `Bearer ${token}` } });
if (!res.ok) {
this.log.warn(`fabric: attachment fetch ${res.status} ${abs}`);
continue;
}
const buf = Buffer.from(await res.arrayBuffer());
const safe = (a.name ?? `file-${i}`).replace(/[^\w.-]/g, '_').slice(0, 120) || `file-${i}`;
const p = join(dir, `${i}-${safe}`);
await fs.writeFile(p, buf);
out.paths.push(p);
out.types.push(
a.mimeType ||
res.headers.get('content-type')?.split(';')[0] ||
'application/octet-stream',
);
out.urls.push(abs);
i++;
} catch (err) {
this.log.warn(`fabric: attachment fetch failed agent=${agentId}: ${String(err)}`);
}
}
if (out.paths.length)
this.log.info(`fabric: fetched ${out.paths.length} attachment(s) agent=${agentId}`);
return out;
}
private async dispatch(
agentId: string,
guild: { nodeId: string; endpoint: string },
channelId: string,
m: FabricMessage,
session: FabricSession,
): Promise<void> {
const core = this.core as Core & Record<string, unknown>;
const cfg = this.cfg as { session?: { store?: unknown } };
try {
const route = core.channel.routing.resolveAgentRoute({
cfg: this.cfg,
channel: 'fabric',
accountId: agentId,
peer: { kind: 'group', id: channelId },
});
const storePath = core.channel.session.resolveStorePath(cfg.session?.store, {
agentId: route.agentId,
});
const baseCtx: Record<string, unknown> = {
Body: m.content,
BodyForAgent: m.content,
RawBody: m.content,
CommandBody: m.content,
From: `fabric:channel:${channelId}`,
To: `fabric:${channelId}`,
SessionKey: route.sessionKey,
AccountId: route.accountId ?? agentId,
ChatType: 'group',
ConversationLabel: `fabric:${guild.nodeId}`,
SenderId: m.authorUserId ?? 'fabric',
Provider: 'fabric',
Surface: 'fabric',
MessageSid: m.messageId,
Timestamp: m.createdAt ? Date.parse(m.createdAt) : Date.now(),
OriginatingChannel: 'fabric',
OriginatingTo: `fabric:${channelId}`,
};
// Non-wakeup: Fabric has already decided this agent is NOT the speaker
// this round. Do NOT run the model and do NOT send anything back — the
// discuss/work turn engine expects silence from non-woken agents (only
// the woken speaker emits a normal message or /no-reply). We still
// record the message into the agent's session so it has the full
// channel conversation as context whenever it IS later woken.
//
// Exception: dm channels are 1:1 — there is no turn/wakeup gating;
// any message that isn't the agent's own (already filtered above) is
// always delivered to the model.
if (m.xType !== 'dm' && m.wakeup !== true) {
// Triage exception: non-wake messages (admin observer) MUST NOT
// enter the agent's session at all. The next time the agent
// wakes for a triage message, their context should contain only
// their own past wakeups + their own outgoing messages — never
// the observer-only chatter from other agents. For non-triage
// channels keep the legacy "record-as-history" so a later wake
// sees the full channel conversation.
if (m.xType === 'triage') {
this.log.info(
`fabric: triage observer skip agent=${agentId} channel=${channelId} msg=${m.messageId}`,
);
return;
}
const ctxPayload = core.channel.reply.finalizeInboundContext(baseCtx);
await core.channel.session.recordInboundSession({
storePath,
sessionKey: route.sessionKey,
ctx: ctxPayload,
createIfMissing: true,
onRecordError: (err: unknown) =>
this.log.warn(`fabric: history record failed agent=${agentId}: ${String(err)}`),
});
this.log.info(
`fabric: recorded (no wakeup, history only) agent=${agentId} channel=${channelId}`,
);
return;
}
this.log.info(`fabric: dispatch agent=${agentId} channel=${channelId}`);
const gt = await this.freshGuildToken(agentId, guild.nodeId, session);
// Fetch any uploaded files for the agent: download to a temp dir and
// hand openclaw local MediaPaths (+types) so the model receives them.
const media = await this.fetchAttachments(agentId, guild.endpoint, gt, m);
const ctxPayload = core.channel.reply.finalizeInboundContext({
...baseCtx,
// Provide ONLY local paths. The guild file URL is on a private host
// (e.g. localhost); openclaw's SSRF guard blocks re-fetching it, so
// passing MediaUrls is both redundant (we already downloaded the
// bytes) and noisy. Local MediaPaths is the reliable delivery.
...(media.paths.length
? {
MediaPaths: media.paths,
MediaTypes: media.types,
MediaPath: media.paths[0],
MediaType: media.types[0],
}
: {}),
});
await dispatchInboundReplyWithBase({
cfg: this.cfg as never,
channel: 'fabric',
accountId: agentId,
route,
storePath,
ctxPayload: ctxPayload as never,
core: this.core as never,
deliver: async (payload: { text?: string }) => {
const text = (payload?.text ?? '').trim();
this.log.info(`fabric: deliver agent=${agentId} channel=${channelId} len=${text.length}`);
if (!text || !gt) return;
// Buffer segments; the merged message is posted right after
// dispatch returns (the deterministic turn boundary, see the
// finally below). Disable per channel: channels.fabric.coalesce.
await enqueueDelivery({
channelId,
text,
coalesce: resolveCoalesce(this.cfg as never),
post: (t) =>
this.client.postMessage(guild.endpoint, gt, channelId, t, session.user.id) as Promise<void>,
log: (m) => this.log.info(m),
});
},
onRecordError: (err: unknown) =>
this.log.warn(`fabric: session record failed agent=${agentId}: ${String(err)}`),
onDispatchError: (err: unknown, info: { kind: string }) =>
this.log.warn(`fabric: ${info.kind} dispatch failed agent=${agentId}: ${String(err)}`),
// - disableBlockStreaming: Fabric has no length limit, deliver the
// whole reply as ONE message.
// - sourceReplyDeliveryMode 'automatic': OpenClaw defaults group
// chats to "message_tool_only", which SUPPRESSES auto-delivery of
// the agent's text reply (it expects the agent to call a message
// tool). Fabric already gates *when* an agent speaks via the
// per-recipient wakeup flag, so once a turn is dispatched the
// reply must always flow back through `deliver`. Forcing
// 'automatic' overrides the group default so the reply is
// delivered. (source-reply-delivery-mode: a truthy `requested`
// wins unless it's message_tool_only with no tool available.)
replyOptions: {
disableBlockStreaming: true,
sourceReplyDeliveryMode: 'automatic',
} as never,
});
this.log.info(`fabric: dispatch returned agent=${agentId} channel=${channelId}`);
} catch (err) {
this.log.warn(`fabric: dispatch failed agent=${agentId} channel=${channelId}: ${String(err)}`);
} finally {
// Deterministic per-turn boundary: dispatchInboundReplyWithBase only
// resolves AFTER every deliver() call of this turn has run, so the
// buffer now holds all segments — flush them as ONE Fabric message.
// No hooks, no timers, no idle guessing.
await flushFabricForChannel(channelId);
}
}
}