fix(inbound): refresh socket.io auth on (re)connect via callback
Backend issues short-lived guildAccessToken (TTL=900s). The previous
`auth: { token: tok }` shape captured the JWT once in connectAgent's
closure: after socket.io's auto-reconnect the backend kept getting the
same expired JWT and silently rejected the handshake at the application
layer (RealtimeGateway logs 'socket rejected: <id>'). The client's
'connect' event still fired (TCP succeeded) so the plugin happily ran
the channel-resync, emitted join_channel into the void, and logged
'joined N channel(s)' while the backend was actually broadcasting
message.created to a room with zero subscribers. End-user symptom:
DMs/group messages to agents silently dropped 15 min after gateway
start, with no error anywhere on the agent side.
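Switch to the callback form of `auth`, which socket.io re-evaluates on
every (re)connect. The callback obtains the token through
freshGuildToken/tokenCache — the same refresh path we already use for
HTTP calls — and falls back to the originally issued token if the
refresh fails or returns nothing.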
Verified in sim (commit 2acb084 + this patch):
1. Connect new DM channel + post msg -> dispatch + reply ✓
2. `docker restart fabric-backend-guild` to force socket disconnect
3. Plugin reconnects automatically and logs
'fabric: agent recruiter joined 12 channel(s) on sim-guild-1' ✓
(without the fix this reconnect was silently rejected; sim used to
log 'WARN socket rejected: <id>' on the guild backend)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
19
dist/fabric/src/inbound.js
vendored
19
dist/fabric/src/inbound.js
vendored
@@ -261,9 +261,26 @@ export class FabricInbound {
|
|||||||
const tok = session.guildAccessTokens.find((t) => t.guildNodeId === g.nodeId)?.token;
|
const tok = session.guildAccessTokens.find((t) => t.guildNodeId === g.nodeId)?.token;
|
||||||
if (!tok)
|
if (!tok)
|
||||||
continue;
|
continue;
|
||||||
|
// Use the *callback* form of `auth` so socket.io re-evaluates the JWT
|
||||||
|
// on every (re)connect. The single-shot `auth: { token: tok }` shape
|
||||||
|
// captured the token in closure: after socket.io's silent auto-reconnect
|
||||||
|
// the backend got the same JWT that expired ~15 min into the session
|
||||||
|
// (guildAccessToken TTL = 900s) and silently rejected the handshake at
|
||||||
|
// the application layer. The client's `connect` event still fired (TCP
|
||||||
|
// succeeded), so the plugin happily ran the channel-resync, emitted
|
||||||
|
// `join_channel` into the void, and logged "joined N channel(s)" while
|
||||||
|
// the backend was actually broadcasting message.created to a room with
|
||||||
|
// zero subscribers. End-user symptom: DMs to agents silently dropped.
|
||||||
const socket = io(`${g.endpoint}/realtime`, {
|
const socket = io(`${g.endpoint}/realtime`, {
|
||||||
transports: ['websocket'],
|
transports: ['websocket'],
|
||||||
auth: { token: tok },
|
auth: (cb) => {
|
||||||
|
// Best-effort fresh token; on transient failure fall back to the
|
||||||
|
// last known good one. tokenCache also keeps HTTP calls (attachment
|
||||||
|
// download / reply post) from 401'ing in the same window.
|
||||||
|
this.freshGuildToken(agentId, g.nodeId, session)
|
||||||
|
.then((fresh) => cb({ token: fresh ?? tok }))
|
||||||
|
.catch(() => cb({ token: tok }));
|
||||||
|
},
|
||||||
autoConnect: false,
|
autoConnect: false,
|
||||||
});
|
});
|
||||||
// Tracked socket.io rooms for this (agent, guild). The initial fetch
|
// Tracked socket.io rooms for this (agent, guild). The initial fetch
|
||||||
|
|||||||
@@ -325,9 +325,26 @@ export class FabricInbound {
|
|||||||
for (const g of session.guilds) {
|
for (const g of session.guilds) {
|
||||||
const tok = session.guildAccessTokens.find((t) => t.guildNodeId === g.nodeId)?.token;
|
const tok = session.guildAccessTokens.find((t) => t.guildNodeId === g.nodeId)?.token;
|
||||||
if (!tok) continue;
|
if (!tok) continue;
|
||||||
|
// Use the *callback* form of `auth` so socket.io re-evaluates the JWT
|
||||||
|
// on every (re)connect. The single-shot `auth: { token: tok }` shape
|
||||||
|
// captured the token in closure: after socket.io's silent auto-reconnect
|
||||||
|
// the backend got the same JWT that expired ~15 min into the session
|
||||||
|
// (guildAccessToken TTL = 900s) and silently rejected the handshake at
|
||||||
|
// the application layer. The client's `connect` event still fired (TCP
|
||||||
|
// succeeded), so the plugin happily ran the channel-resync, emitted
|
||||||
|
// `join_channel` into the void, and logged "joined N channel(s)" while
|
||||||
|
// the backend was actually broadcasting message.created to a room with
|
||||||
|
// zero subscribers. End-user symptom: DMs to agents silently dropped.
|
||||||
const socket = io(`${g.endpoint}/realtime`, {
|
const socket = io(`${g.endpoint}/realtime`, {
|
||||||
transports: ['websocket'],
|
transports: ['websocket'],
|
||||||
auth: { token: tok },
|
auth: (cb) => {
|
||||||
|
// Best-effort fresh token; on transient failure fall back to the
|
||||||
|
// last known good one. tokenCache also keeps HTTP calls (attachment
|
||||||
|
// download / reply post) from 401'ing in the same window.
|
||||||
|
this.freshGuildToken(agentId, g.nodeId, session)
|
||||||
|
.then((fresh) => cb({ token: fresh ?? tok }))
|
||||||
|
.catch(() => cb({ token: tok }));
|
||||||
|
},
|
||||||
autoConnect: false,
|
autoConnect: false,
|
||||||
});
|
});
|
||||||
// Tracked socket.io rooms for this (agent, guild). The initial fetch
|
// Tracked socket.io rooms for this (agent, guild). The initial fetch
|
||||||
|
|||||||
Reference in New Issue
Block a user