feat: Phase F-2 — socket.io inbound + wakeup gate + token refresh

End-to-end Fabric inbound→Plexum→Fabric outbound now works against a
live Fabric stack:

  alice posts in bt2-clean (Fabric REST)
    → guild emits message.created over socket.io
    → plugin's wakeup gate decides dispatch
    → notifications/plexum/channel/inbound to host
    → Plexum agent runs (echo provider)
    → outbound `send` tool posts via Fabric REST
    → fabrictester reply visible in channel

internal/socketio/ (~280 LOC + 2 tests):
- Minimal Engine.IO v4 + Socket.IO v5 client over websocket
- WebSocket-only transport (skip polling upgrade dance)
- AuthFunc callback re-evaluated on every (re)connect — fixes the
  stale-JWT-on-reconnect bug that the openclaw plugin documented for
  the JS client's single-shot auth, and which the available Go
  socket.io library (zishang520) doesn't address either
- PING/PONG per server-supplied interval
- Caller-driven reconnect: Connect returns on close, supervisor
  re-dials with fresh token

internal/tokens/ (~95 LOC + 9 tests):
- Per-agent session cache with 8min TTL (matches openclaw's
  TOKEN_TTL_MS); guild tokens are ~15min so 8min keeps a margin
- Invalidate forces re-login (used by inbound when CONNECT auth fires)
- GuildToken helper picks the per-guild JWT from the cached session;
  if the guild is missing from the cache, invalidate + retry once

internal/inbound/ (~290 LOC):
- Supervisor: one socket.io conn per (agent, guild); reconnect with
  fresh token on drop; ChannelSyncInterval (60s) polling + push
  channel.joined/channel.left handlers
- Wakeup gate: dm channels deliver any non-self message; other
  x_types require wakeup=true (record-only handling of non-wake,
  non-dm messages is deferred — Plexum has no history-injection
  equivalent in v1)
- Self-author filter on selfUserId from cached session
- Per-(agent,msgId) dedup bounded to 5000 entries
- Per-channel serial queue with 5s idle drain so concurrent inbounds
  on the same channel run one-at-a-time (matches openclaw plugin)
- Emits notifications/plexum/channel/inbound with session_id =
  "s_fab_<fabric_channel_id>" for stable per-channel session continuity

cmd/plexum-fabric-channel-plugin:
- Wires inbound supervisor at Init; runs in a background goroutine
  for the plugin's lifetime
- Replaces F-1's sessions map with tokens.Cache (same warm-sessions
  behavior, now backed by TTL)
- hostLogHandler: bridges slog records from inbound supervisor to
  HostAPI.Log notifications

Deferred from F-2 to F-3+:
- record-only history injection (Plexum v1 has no equivalent)
- tools.ts port (15 MCP tools — channel/canvas/sub-discussion family)
- presence-sync, command-sync, attachments, coalesce parity

Tests: 22 (5 identity + 6 config + 9 tokens + 2 socketio).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
h z
2026-05-31 15:29:01 +01:00
parent f8d43ae70e
commit 0efcdfd342
8 changed files with 1162 additions and 36 deletions

View File

@@ -16,6 +16,7 @@ import (
"encoding/json"
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"sync"
@@ -26,6 +27,8 @@ import (
"git.hangman-lab.top/hzhang/Plexum-fabric-channel-plugin/internal/config"
"git.hangman-lab.top/hzhang/Plexum-fabric-channel-plugin/internal/fabric"
"git.hangman-lab.top/hzhang/Plexum-fabric-channel-plugin/internal/identity"
"git.hangman-lab.top/hzhang/Plexum-fabric-channel-plugin/internal/inbound"
"git.hangman-lab.top/hzhang/Plexum-fabric-channel-plugin/internal/tokens"
)
// HostConfig is the plugin's own config at
@@ -47,10 +50,19 @@ type fabricPlugin struct {
bindings []config.FabricBinding
byFabric config.ByFabricChannel
client *fabric.Client
tokens *tokens.Cache
// Per-agent Session cache (refreshed lazily; full refresh in F-2).
// Goroutine handle for the inbound supervisor. Cancelled on
// plugin shutdown (we don't have an explicit shutdown signal in
// the SDK today; rely on subprocess kill).
inboundCancel context.CancelFunc
inboundDone chan struct{}
// Legacy field — kept only for back-compat with non-tokens code
// paths during the F-1 → F-2 refactor; safe to remove once nothing
// else references it. Not used anymore.
sessMu sync.Mutex
sessions map[string]*fabric.Session // agentID → session
sessions map[string]*fabric.Session
}
func (p *fabricPlugin) Manifest() plugin.Manifest {
@@ -148,6 +160,15 @@ func (p *fabricPlugin) Init(ctx context.Context, host plugin.HostAPI) error {
}
p.byFabric = config.Index(p.bindings)
// Token cache: re-login per agent on TTL miss (8min default).
p.tokens = tokens.New(0, func(loginCtx context.Context, agentID string) (*fabric.Session, error) {
entry := p.identities.Lookup(agentID)
if entry == nil || !entry.Enabled {
return nil, fmt.Errorf("agent %s: no enabled identity", agentID)
}
return p.client.AgentLogin(loginCtx, entry.FabricAPIKey)
})
host.Log("info", "fabric channel plugin initialized", map[string]any{
"center": p.cfg.CenterAPIBase,
"identity_path": idPath,
@@ -155,18 +176,78 @@ func (p *fabricPlugin) Init(ctx context.Context, host plugin.HostAPI) error {
"identities_loaded": len(p.identities.AgentIDs()),
})
// Eager validate: for every bound agent that has a channel, do a
// blocking agentLogin so we surface bad keys at startup instead of
// on first outbound. F-2 hooks socket.io subscription here too.
// Warm sessions (early bad-key detection).
if err := p.warmSessions(ctx); err != nil {
// Log + continue; outbound will retry on demand. We don't want
// to refuse plugin init just because one key is stale.
host.Log("warn", "fabric warm-sessions had errors",
map[string]any{"err": err.Error()})
}
// Phase F-2: start the inbound supervisor in a goroutine. Lives
// until p.inboundCancel fires (currently never — SDK has no
// shutdown hook; subprocess kill is the only stop signal).
if len(p.bindings) > 0 {
ctxBg, cancel := context.WithCancel(context.Background())
p.inboundCancel = cancel
p.inboundDone = make(chan struct{})
notifier := func(channelName, message, sessionID string) {
p.host.EmitNotification("notifications/plexum/channel/inbound", map[string]any{
"channel_name": channelName,
"message": message,
"session_id": sessionID,
})
}
// Bridge the supervisor's slog output to the host log: hostLogHandler
// forwards every record at INFO or above to plugin.HostAPI.Log.
logger := slog.New(&hostLogHandler{host: host, level: slog.LevelInfo})
sup := inbound.New(p.client, p.tokens, p.bindings, notifier, logger)
go func() {
defer close(p.inboundDone)
if err := sup.Run(ctxBg); err != nil {
host.Log("warn", "inbound supervisor exited", map[string]any{"err": err.Error()})
}
}()
host.Log("info", "fabric inbound supervisor started",
map[string]any{"agents": sup.AgentIDs, "bindings": len(p.bindings)})
}
return nil
}
// hostLogHandler is a small slog.Handler that forwards records to the
// plugin's HostAPI.Log. The inbound supervisor logs through slog; this
// type bridges those records to the host's log notification stream.
//
// Attributes bound with Logger.With and groups opened with
// Logger.WithGroup are retained and merged into every forwarded record,
// as the slog.Handler contract requires. Group membership is expressed
// by dotted key prefixes ("group.key") because the host receives a flat
// map.
type hostLogHandler struct {
	host  plugin.HostAPI
	level slog.Level

	// bound holds attributes attached via WithAttrs, keyed by their
	// already group-qualified names. It is never mutated after the
	// handler is created, so derived handlers may share it safely.
	bound map[string]any
	// prefix is the dotted path of the groups opened via WithGroup;
	// it is either empty or ends in ".".
	prefix string
}

// Enabled reports whether records at level l are forwarded to the host.
func (h *hostLogHandler) Enabled(_ context.Context, l slog.Level) bool { return l >= h.level }

// Handle flattens the record's attributes (plus any attributes bound
// through WithAttrs) into a map and forwards it to HostAPI.Log. It
// always returns nil.
func (h *hostLogHandler) Handle(_ context.Context, r slog.Record) error {
	attrs := make(map[string]any, len(h.bound)+r.NumAttrs())
	for k, v := range h.bound {
		attrs[k] = v
	}
	r.Attrs(func(a slog.Attr) bool {
		flattenHostLogAttr(attrs, h.prefix, a)
		return true
	})
	h.host.Log(levelString(r.Level), r.Message, attrs)
	return nil
}

// WithAttrs returns a handler that adds as to every record it forwards.
// The receiver is left untouched; with no attributes it is returned as-is.
func (h *hostLogHandler) WithAttrs(as []slog.Attr) slog.Handler {
	if len(as) == 0 {
		return h
	}
	next := *h
	// Copy rather than extend so sibling handlers derived from h do not
	// observe each other's attributes.
	next.bound = make(map[string]any, len(h.bound)+len(as))
	for k, v := range h.bound {
		next.bound[k] = v
	}
	for _, a := range as {
		flattenHostLogAttr(next.bound, h.prefix, a)
	}
	return &next
}

// WithGroup returns a handler that qualifies all subsequently added
// attribute keys with name. An empty name returns the receiver, per the
// slog.Handler contract.
func (h *hostLogHandler) WithGroup(name string) slog.Handler {
	if name == "" {
		return h
	}
	next := *h
	next.prefix = h.prefix + name + "."
	return &next
}

// flattenHostLogAttr writes a into dst under prefix. LogValuer values
// are resolved, empty attributes are skipped, and group attributes are
// expanded recursively into dotted keys (a group with an empty key is
// inlined).
func flattenHostLogAttr(dst map[string]any, prefix string, a slog.Attr) {
	a.Value = a.Value.Resolve()
	if a.Equal(slog.Attr{}) {
		return
	}
	if a.Value.Kind() == slog.KindGroup {
		sub := prefix
		if a.Key != "" {
			sub = prefix + a.Key + "."
		}
		for _, member := range a.Value.Group() {
			flattenHostLogAttr(dst, sub, member)
		}
		return
	}
	dst[prefix+a.Key] = a.Value.Any()
}
// levelString converts an slog level into the level name passed to the
// host log: "debug" below Info, "info" below Warn, "warn" below Error,
// and "error" for Error and anything above it.
func levelString(l slog.Level) string {
	if l < slog.LevelInfo {
		return "debug"
	}
	if l < slog.LevelWarn {
		return "info"
	}
	if l < slog.LevelError {
		return "warn"
	}
	return "error"
}
func (p *fabricPlugin) warmSessions(ctx context.Context) error {
// Which agents appear as a binding's AgentID?
agentsNeeded := map[string]bool{}
@@ -177,8 +258,7 @@ func (p *fabricPlugin) warmSessions(ctx context.Context) error {
var firstErr error
for agentID := range agentsNeeded {
entry, ok := enabled[agentID]
if !ok {
if _, ok := enabled[agentID]; !ok {
err := fmt.Errorf("agent %s has channels but no identity (run plexum-fabric-register --agent-id %s --api-key ...)",
agentID, agentID)
p.host.Log("warn", err.Error(), nil)
@@ -187,18 +267,15 @@ func (p *fabricPlugin) warmSessions(ctx context.Context) error {
}
continue
}
sess, err := p.client.AgentLogin(ctx, entry.FabricAPIKey)
sess, err := p.tokens.Get(ctx, agentID)
if err != nil {
err = fmt.Errorf("agent %s login: %w", agentID, err)
p.host.Log("warn", err.Error(), nil)
p.host.Log("warn", "fabric agent warm failed",
map[string]any{"agent": agentID, "err": err.Error()})
if firstErr == nil {
firstErr = err
}
continue
}
p.sessMu.Lock()
p.sessions[agentID] = sess
p.sessMu.Unlock()
p.host.Log("info", "fabric session warm", map[string]any{
"agent": agentID, "fabric_user": sess.User.Email,
"guilds": len(sess.Guilds),
@@ -223,6 +300,9 @@ func (p *fabricPlugin) CallTool(ctx context.Context, name string, input json.Raw
if args.ChannelName == "" {
return errResult("channel_name required"), nil
}
p.host.Log("info", "fabric send", map[string]any{
"channel_name": args.ChannelName, "len": len(args.Message),
})
// Find the binding for this plexum channel name.
var binding *config.FabricBinding
@@ -274,26 +354,9 @@ func (p *fabricPlugin) CallTool(ctx context.Context, name string, input json.Raw
}
func (p *fabricPlugin) sessionFor(ctx context.Context, agentID string) (*fabric.Session, error) {
p.sessMu.Lock()
sess := p.sessions[agentID]
p.sessMu.Unlock()
if sess != nil {
return sess, nil
}
entry := p.identities.Lookup(agentID)
if entry == nil || !entry.Enabled {
return nil, errors.New("no identity registered (use plexum-fabric-register)")
}
loginCtx, cancel := context.WithTimeout(ctx, 15*time.Second)
defer cancel()
fresh, err := p.client.AgentLogin(loginCtx, entry.FabricAPIKey)
if err != nil {
return nil, err
}
p.sessMu.Lock()
p.sessions[agentID] = fresh
p.sessMu.Unlock()
return fresh, nil
return p.tokens.Get(loginCtx, agentID)
}
func errResult(msg string) plugin.ToolResult {