Files
hzhang 6e3ad669f8 feat(monitor): active push loop replacing standalone monitor
Adds a periodic POST loop to <backend>/monitor/server/heartbeat so
HF plugin can take over the standalone harborforge-monitor daemon's
job — same X-API-Key header, same flat telemetry shape (cpu_pct /
mem_pct / disk_pct / swap_pct / load_avg / uptime_seconds /
plugin_version / agents[]). HF backend stays unchanged.

Config: monitor_push_enabled (default false; opt-in to avoid surprise
heartbeats from existing deployments), monitor_push_interval_seconds
(default 30), reuses apiKey for the X-API-Key header. Lift the
container's HF_MONITER_API_KEY into config.apiKey, flip
monitor_push_enabled true, then docker rm -f the container — DB
last_seen_at keeps advancing under the plugin's loop.

Collector grew swap + cpu sampling (two reads of /proc/stat over a
1-second window when SampleCPU=true). Bridge endpoint stays cheap
(SampleCPU=false on demand); push loop is the only caller paying the
sampling cost.

E2E in sim: monitor_push_enabled=true + apiKey from injected
MonitoredServer row → server_states.last_seen_at advances exactly
every interval_seconds (10s configured, 10s observed). cpu/mem/disk/
swap_pct all populate correctly.
2026-06-03 13:04:51 +01:00

278 lines
8.4 KiB
Go

// plexum-harborforge-plugin — Plexum-side HarborForge plugin.
//
// Mirrors HarborForge.OpenclawPlugin's responsibilities, recast on the
// Plexum SDK:
//
// - eager activation: plugin spawns at host start so the Monitor
// bridge listener and Calendar scheduler are running before any
// turn fires
// - 9 harborforge_* tools backed by tools.Dispatch
// - state-aware wake-agent via HostAPI.WakeAgent for Calendar slots
//
// Config layout: <profile>/plugins/harbor-forge/config.json — see
// internal/config/config.go for the schema.
package main
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"time"
sdkplugin "git.hangman-lab.top/hzhang/Plexum-sdk-go/plugin"
"git.hangman-lab.top/zhi/HarborForge.PlexumPlugin/internal/calendar"
hfcfg "git.hangman-lab.top/zhi/HarborForge.PlexumPlugin/internal/config"
"git.hangman-lab.top/zhi/HarborForge.PlexumPlugin/internal/monitor"
"git.hangman-lab.top/zhi/HarborForge.PlexumPlugin/internal/telemetry"
"git.hangman-lab.top/zhi/HarborForge.PlexumPlugin/internal/tools"
)
// Version is the plugin version string. It is reported in the startup log,
// the telemetry collector options, the calendar plugin-info tag, the tool
// dependencies and the fallback manifest. It is injected via
// -ldflags "-X main.Version=…" at build time; "0.1.0" is the value used when
// no override is supplied.
var Version = "0.1.0"
// harborForgePlugin satisfies sdkplugin.ToolPlugin.
//
// All fields are populated by Init; the zero value is what main hands to
// sdkplugin.Serve.
type harborForgePlugin struct {
// host is the HostAPI handle received in Init; used for logging and for
// reading agent state.
host sdkplugin.HostAPI
// cfg is the resolved harbor-forge configuration (hfcfg.Resolve output).
cfg hfcfg.Resolved
// bridge is the monitor bridge started in Init on cfg.MonitorPort.
bridge *monitor.Bridge
// pusher is always constructed in Init, but its Run loop is only started
// when cfg.MonitorPushEnabled is true.
pusher *monitor.Pusher
// sched is always constructed in Init, but its Run loop is only started
// when cfg.CalendarEnabled is true.
sched *calendar.Scheduler
// deps is the dependency bundle handed to tools.Dispatch by CallTool.
deps tools.Deps
// cancelBg cancels the background context that the bridge, pusher and
// scheduler run under. Nil until Init has created that context.
cancelBg context.CancelFunc
// wg tracks the pusher and scheduler goroutines spawned by Init.
wg sync.WaitGroup
// NOTE(review): agentCache is not referenced anywhere else in the visible
// part of this file — confirm it is still needed.
agentCache sync.Map // sessionID/turnID → agentID stash (best-effort)
}
// Manifest returns the plugin manifest. The value comes from
// manifestFromDisk, which is re-evaluated on every call; no receiver state is
// consulted.
func (p *harborForgePlugin) Manifest() sdkplugin.Manifest {
	manifest := manifestFromDisk()
	return manifest
}
// Init wires the plugin to the host and starts its background services.
//
// Steps, in order:
//
//   - resolve the profile root: $PLEXUM_PROFILE_ROOT, else <home>/.plexum
//   - load and resolve the harbor-forge config (the only fatal failure)
//   - start the monitor bridge on cfg.MonitorPort; a start failure is logged
//     as a warning and Init carries on
//   - build the monitor pusher and, when cfg.MonitorPushEnabled is set, run
//     its loop in a goroutine
//   - build the calendar scheduler and, when cfg.CalendarEnabled is set, run
//     it in a goroutine
//   - assemble the tools.Deps bundle used by CallTool
//
// Background work runs under a context created here (cancelled through
// p.cancelBg) and the spawned goroutines are tracked in p.wg. The ctx
// argument is intentionally not captured: the goroutines and closures
// registered here must keep working after Init has returned.
//
// Returns a wrapped error when the config cannot be loaded, nil otherwise.
func (p *harborForgePlugin) Init(ctx context.Context, host sdkplugin.HostAPI) error {
	p.host = host

	profileRoot := os.Getenv("PLEXUM_PROFILE_ROOT")
	if profileRoot == "" {
		home, err := os.UserHomeDir()
		if err != nil {
			// Keep the historical fallback (a relative ".plexum"), but make
			// the cause visible instead of silently dropping the error.
			host.Log("warn", "cannot determine home directory; profile root falls back to a relative path",
				map[string]any{"err": err.Error()})
		}
		profileRoot = filepath.Join(home, ".plexum")
	}

	raw, err := hfcfg.Load(profileRoot)
	if err != nil {
		return fmt.Errorf("load harbor-forge config: %w", err)
	}
	p.cfg = hfcfg.Resolve(raw)

	host.Log("info", "harbor-forge plugin initialized", map[string]any{
		"version":              Version,
		"backend":              p.cfg.BackendURL,
		"identifier":           p.cfg.Identifier,
		"monitor_port":         p.cfg.MonitorPort,
		"monitor_push_enabled": p.cfg.MonitorPushEnabled,
		"calendar_enabled":     p.cfg.CalendarEnabled,
	})

	bgCtx, cancel := context.WithCancel(context.Background())
	p.cancelBg = cancel

	// Adapter handed to components that log through a plain func value.
	logf := func(level, msg string, attrs map[string]any) { host.Log(level, msg, attrs) }

	// Listers + collectors capture bgCtx (not Init ctx) — Init returns
	// once MCP initialize completes, but the plugin process lives on
	// and so do the goroutines + closures we registered.
	makeCollector := func(sampleCPU bool) func() telemetry.Snapshot {
		return func() telemetry.Snapshot {
			return telemetry.Collect(telemetry.CollectOpts{
				Identifier: p.cfg.Identifier,
				Version:    Version,
				SampleCPU:  sampleCPU,
				AgentLister: func() []telemetry.AgentInfo {
					return p.listAgents(bgCtx, profileRoot)
				},
			})
		}
	}
	// Bridge serves on-demand reads; cheap, no CPU sampling.
	collect := makeCollector(false)
	// Pusher runs the slow push loop; CPU sampling fine here.
	collectForPush := makeCollector(true)

	p.bridge = monitor.New(p.cfg.MonitorPort, collect, logf)
	if err := p.bridge.Start(bgCtx); err != nil {
		// Non-fatal: tools and the calendar still work without the bridge.
		host.Log("warn", "monitor bridge failed to start", map[string]any{"err": err.Error()})
	}

	// Active push loop — replaces the standalone harborforge-monitor
	// container. Off by default; operator opts in via
	// monitor_push_enabled + apiKey.
	p.pusher = monitor.NewPusher(monitor.PusherConfig{
		BackendURL: p.cfg.BackendURL,
		APIKey:     p.cfg.APIKey,
		Interval:   time.Duration(p.cfg.MonitorPushIntervalSeconds) * time.Second,
	}, collectForPush, logf)
	if p.cfg.MonitorPushEnabled {
		p.wg.Add(1)
		go func() {
			defer p.wg.Done()
			if err := p.pusher.Run(bgCtx); err != nil && !errors.Is(err, context.Canceled) {
				host.Log("warn", "monitor pusher exited", map[string]any{"err": err.Error()})
			}
		}()
	}

	// The calendar may talk to a dedicated backend; default to the main one.
	calBackend := p.cfg.CalendarBackendURL
	if calBackend == "" {
		calBackend = p.cfg.BackendURL
	}
	// Named calBridge to keep it apart from p.bridge (the monitor bridge).
	calBridge := calendar.New(calBackend, p.cfg.Identifier)
	p.sched = calendar.NewScheduler(
		calendar.Config{
			HeartbeatInterval: time.Duration(p.cfg.CalendarHeartbeatIntervalSeconds) * time.Second,
		},
		calBridge, host,
		calendar.PluginInfoTag{Name: "harbor-forge", Version: Version, Backend: "plexum"},
		func() []calendar.ReportableAgent {
			return p.listReportableAgents(bgCtx, profileRoot)
		},
	)
	if p.cfg.CalendarEnabled {
		p.wg.Add(1)
		go func() {
			defer p.wg.Done()
			// A cancelled background context is the normal shutdown path,
			// not a failure — mirror the pusher and stay quiet about it.
			if err := p.sched.Run(bgCtx); err != nil && !errors.Is(err, context.Canceled) {
				host.Log("warn", "calendar scheduler exited", map[string]any{"err": err.Error()})
			}
		}()
	} else {
		host.Log("info", "calendar scheduler disabled by config", nil)
	}

	p.deps = tools.Deps{
		Config:    p.cfg,
		Version:   Version,
		Collect:   collect,
		Bridge:    p.bridge,
		Pusher:    p.pusher,
		Scheduler: p.sched,
		Host:      host,
		AgentIDFromCtx: func(ctx context.Context) string {
			// Host attaches the caller agent id via tools/call
			// `_meta.agent_id`; SDK unpacks it into ctx.
			if id := sdkplugin.AgentIDFromContext(ctx); id != "" {
				return id
			}
			// Fallback for host paths that don't carry an agent
			// (channel-driven, CLI plugin-call). When a single
			// calendar slot is active we can deterministically
			// attribute the call to that slot's owner.
			return p.sched.SingleActiveAgentID()
		},
	}
	return nil
}
// CallTool handles a tool invocation from the host by forwarding the tool
// name and raw JSON input, together with the dependency bundle assembled in
// Init, to tools.Dispatch. The dispatcher's result and error are returned
// unchanged.
func (p *harborForgePlugin) CallTool(ctx context.Context, name string, input json.RawMessage) (sdkplugin.ToolResult, error) {
	result, err := tools.Dispatch(ctx, p.deps, name, input)
	return result, err
}
// ---- agent enumeration ----
// listAgents walks <profile>/agents/*/agent.json + state.json so the
// telemetry payload includes every Plexum agent visible on this host.
//
// Every sub-directory of <profileRoot>/agents yields one entry whose ID is
// the directory name. Model is taken from the "model" key of agent.json when
// that file can be read; State comes from HostAPI.ReadAgentState when that
// call succeeds. Either field is left empty on failure.
//
// Best-effort: read failures degrade to an empty list. The result is never
// nil, so a missing agents directory and a directory with zero agents look
// the same to callers (and serialize as a JSON array rather than null).
func (p *harborForgePlugin) listAgents(ctx context.Context, profileRoot string) []telemetry.AgentInfo {
	root := filepath.Join(profileRoot, "agents")
	entries, err := os.ReadDir(root)
	if err != nil {
		return []telemetry.AgentInfo{}
	}
	out := make([]telemetry.AgentInfo, 0, len(entries))
	for _, e := range entries {
		if !e.IsDir() {
			continue
		}
		agentID := e.Name()
		var info telemetry.AgentInfo
		info.ID = agentID
		if raw, err := os.ReadFile(filepath.Join(root, agentID, "agent.json")); err == nil {
			var meta struct {
				Model string `json:"model"`
			}
			// Deliberately ignored: a malformed agent.json only costs us
			// the model name, the agent itself is still reported.
			_ = json.Unmarshal(raw, &meta)
			info.Model = meta.Model
		}
		// State via HostAPI.ReadAgentState — host-side, ground truth.
		if snap, err := p.host.ReadAgentState(ctx, agentID); err == nil {
			info.State = snap.State
		}
		out = append(out, info)
	}
	return out
}
// listReportableAgents adapts the listAgents output for the calendar
// scheduler: one ReportableAgent per telemetry entry, in the same order,
// carrying ID and Model through and translating State with
// mapStateToCalendar. The returned slice is always non-nil.
func (p *harborForgePlugin) listReportableAgents(ctx context.Context, profileRoot string) []calendar.ReportableAgent {
	agents := p.listAgents(ctx, profileRoot)
	reportable := make([]calendar.ReportableAgent, len(agents))
	for i := range agents {
		reportable[i] = calendar.ReportableAgent{
			ID:    agents[i].ID,
			Model: agents[i].Model,
			State: mapStateToCalendar(agents[i].State),
		}
	}
	return reportable
}
// mapStateToCalendar translates an agent state string into the calendar
// status vocabulary. The input is lower-cased with strings.ToLower before
// matching:
//
//	"idle"    → calendar.AgentStatusIdle
//	"working" → calendar.AgentStatusOnCall
//	"busy"    → calendar.AgentStatusBusy
//
// "offline" and every unrecognised value (including the empty string) map to
// calendar.AgentStatusOffline.
func mapStateToCalendar(s string) calendar.AgentStatusValue {
	// Start from the catch-all and override only for the known live states.
	var status calendar.AgentStatusValue = calendar.AgentStatusOffline
	switch normalized := strings.ToLower(s); normalized {
	case "idle":
		status = calendar.AgentStatusIdle
	case "working":
		status = calendar.AgentStatusOnCall
	case "busy":
		status = calendar.AgentStatusBusy
	}
	return status
}
// manifestFromDisk returns the plugin manifest.
//
// The manifest.json bundled next to the executable is the authoritative
// definition; reading it at runtime avoids hand-syncing a second copy in
// code. A minimal in-code manifest is returned instead when the executable
// path cannot be determined, the file cannot be read, it does not parse as
// JSON, or it parses but has an empty Name (development / first build).
func manifestFromDisk() sdkplugin.Manifest {
	fallback := sdkplugin.Manifest{
		Name:       "harbor-forge",
		Version:    Version,
		Activation: sdkplugin.ActivationEager,
		Executable: "plexum-harborforge-plugin",
	}

	exePath, err := os.Executable()
	if err != nil {
		return fallback
	}
	manifestPath := filepath.Join(filepath.Dir(exePath), "manifest.json")
	data, err := os.ReadFile(manifestPath)
	if err != nil {
		return fallback
	}

	var onDisk sdkplugin.Manifest
	if json.Unmarshal(data, &onDisk) != nil || onDisk.Name == "" {
		return fallback
	}
	return onDisk
}
// main serves the plugin over the SDK transport until Serve returns, then
// stops the background services and exits.
//
// Serve runs inside an inner function so that the cleanup (cancel the
// background context, wait for the pusher/scheduler goroutines) is finished
// before the exit status is decided. os.Exit terminates the process without
// running deferred calls, so calling it while the cleanup is still pending in
// a defer of main would skip the cleanup on the error path.
//
// A context.Canceled error from Serve is treated as a clean shutdown; any
// other error is printed to stderr and the process exits with status 1.
func main() {
	p := &harborForgePlugin{}

	err := func() error {
		defer func() {
			// cancelBg is nil when Init never ran or failed before
			// creating the background context.
			if p.cancelBg != nil {
				p.cancelBg()
			}
			p.wg.Wait()
		}()
		return sdkplugin.Serve(p)
	}()

	if err != nil && !errors.Is(err, context.Canceled) {
		fmt.Fprintf(os.Stderr, "plexum-harborforge-plugin: %v\n", err)
		os.Exit(1)
	}
}