From 7dc70522d165c6b0553e6a4142e945ff71fdd497 Mon Sep 17 00:00:00 2001 From: hzhang Date: Tue, 26 May 2026 13:50:24 +0100 Subject: [PATCH] fix(inbound): refresh socket.io auth on (re)connect via callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend issues short-lived guildAccessToken (TTL=900s). The previous `auth: { token: tok }` shape captured the JWT once in connectAgent's closure: after socket.io's auto-reconnect the backend kept getting the same expired JWT and silently rejected the handshake at the application layer (RealtimeGateway logs 'socket rejected: '). The client's 'connect' event still fired (TCP succeeded) so the plugin happily ran the channel-resync, emitted join_channel into the void, and logged 'joined N channel(s)' while the backend was actually broadcasting message.created to a room with zero subscribers. End-user symptom: DMs/group messages to agents silently dropped 15 min after gateway start, with no error anywhere on the agent side. Switch to the callback form, which socket.io re-evaluates on every (re)connect — same call site we already use for the HTTP path via freshGuildToken/tokenCache. Verified in sim (commit 2acb084 + this patch): 1. Connect new DM channel + post msg -> dispatch + reply ✓ 2. `docker restart fabric-backend-guild` to force socket disconnect 3. Plugin reconnects automatically and logs 'fabric: agent recruiter joined 12 channel(s) on sim-guild-1' ✓ (without the fix this reconnect was silently rejected; sim used to log 'WARN socket rejected: ' on the guild backend) Co-Authored-By: Claude Opus 4.7 (1M context) --- dist/fabric/src/inbound.js | 19 ++++++++++++++++++- src/inbound.ts | 19 ++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/dist/fabric/src/inbound.js b/dist/fabric/src/inbound.js index f81d871..03bb9c1 100644 --- a/dist/fabric/src/inbound.js +++ b/dist/fabric/src/inbound.js @@ -261,9 +261,26 @@ export class FabricInbound { const tok = session.guildAccessTokens.find((t) => t.guildNodeId === g.nodeId)?.token; if (!tok) continue; + // Use the *callback* form of `auth` so socket.io re-evaluates the JWT + // on every (re)connect. The single-shot `auth: { token: tok }` shape + // captured the token in closure: after socket.io's silent auto-reconnect + // the backend got the same JWT that expired ~15 min into the session + // (guildAccessToken TTL = 900s) and silently rejected the handshake at + // the application layer. The client's `connect` event still fired (TCP + // succeeded), so the plugin happily ran the channel-resync, emitted + // `join_channel` into the void, and logged "joined N channel(s)" while + // the backend was actually broadcasting message.created to a room with + // zero subscribers. End user symptom: DMs to agents silently dropped. const socket = io(`${g.endpoint}/realtime`, { transports: ['websocket'], - auth: { token: tok }, + auth: (cb) => { + // Best-effort fresh token; on transient failure fall back to the + // last known good one. tokenCache also keeps HTTP calls (attachment + // download / reply post) from 401'ing in the same window. + this.freshGuildToken(agentId, g.nodeId, session) + .then((fresh) => cb({ token: fresh ?? tok })) + .catch(() => cb({ token: tok })); + }, autoConnect: false, }); // Tracked socket.io rooms for this (agent, guild). The initial fetch diff --git a/src/inbound.ts b/src/inbound.ts index 01e151b..1ce77f4 100644 --- a/src/inbound.ts +++ b/src/inbound.ts @@ -325,9 +325,26 @@ export class FabricInbound { for (const g of session.guilds) { const tok = session.guildAccessTokens.find((t) => t.guildNodeId === g.nodeId)?.token; if (!tok) continue; + // Use the *callback* form of `auth` so socket.io re-evaluates the JWT + // on every (re)connect. The single-shot `auth: { token: tok }` shape + // captured the token in closure: after socket.io's silent auto-reconnect + // the backend got the same JWT that expired ~15 min into the session + // (guildAccessToken TTL = 900s) and silently rejected the handshake at + // the application layer. The client's `connect` event still fired (TCP + // succeeded), so the plugin happily ran the channel-resync, emitted + // `join_channel` into the void, and logged "joined N channel(s)" while + // the backend was actually broadcasting message.created to a room with + // zero subscribers. End user symptom: DMs to agents silently dropped. const socket = io(`${g.endpoint}/realtime`, { transports: ['websocket'], - auth: { token: tok }, + auth: (cb) => { + // Best-effort fresh token; on transient failure fall back to the + // last known good one. tokenCache also keeps HTTP calls (attachment + // download / reply post) from 401'ing in the same window. + this.freshGuildToken(agentId, g.nodeId, session) + .then((fresh) => cb({ token: fresh ?? tok })) + .catch(() => cb({ token: tok })); + }, autoConnect: false, }); // Tracked socket.io rooms for this (agent, guild). The initial fetch