feat(monitor): active push loop replacing standalone monitor

Adds a periodic POST loop to <backend>/monitor/server/heartbeat so
HF plugin can take over the standalone harborforge-monitor daemon's
job — same X-API-Key header, same flat telemetry shape (cpu_pct /
mem_pct / disk_pct / swap_pct / load_avg / uptime_seconds /
plugin_version / agents[]). HF backend stays unchanged.

Config: monitor_push_enabled (default false; opt-in to avoid surprise
heartbeats from existing deployments), monitor_push_interval_seconds
(default 30), reuses apiKey for the X-API-Key header. Lift the
container's HF_MONITER_API_KEY into config.apiKey, flip
monitor_push_enabled true, then docker rm -f the container — DB
last_seen_at keeps advancing under the plugin's loop.

Collector grew swap + cpu sampling (two reads of /proc/stat over a
1-second window when SampleCPU=true). Bridge endpoint stays cheap
(SampleCPU=false on demand); push loop is the only caller paying the
sampling cost.

E2E in sim: monitor_push_enabled=true + apiKey from injected
MonitoredServer row → server_states.last_seen_at advances exactly
every interval_seconds (10s configured, 10s observed). cpu/mem/disk/
swap_pct all populate correctly.
This commit is contained in:
h z
2026-06-03 13:04:51 +01:00
parent 472cecd771
commit 6e3ad669f8
7 changed files with 448 additions and 30 deletions

View File

@@ -18,9 +18,19 @@ submodule of the HarborForge umbrella repo.
## What it does
- **Monitor bridge** — HTTP server on `127.0.0.1:<monitor_port>` that
responds to `/telemetry` with a Snapshot the HarborForge.Monitor
binary expects (system metrics + every Plexum agent's sm-state)
- **Monitor push loop** — when `monitor_push_enabled: true`, posts a
flat telemetry payload (cpu/mem/disk/swap/load + per-agent state)
to `<backendUrl>/monitor/server/heartbeat` every
`monitor_push_interval_seconds`. This replaces the standalone
`harborforge-monitor` daemon — the plugin's lifecycle (gateway
start/stop) bounds the loop, so a separate supervisor isn't needed.
Use the same `apiKey` value the standalone monitor's
`HF_MONITER_API_KEY` carried.
- **Monitor bridge** (optional) — HTTP server on
`127.0.0.1:<monitor_port>` that responds to `/telemetry` with a
Snapshot. Useful when the standalone monitor is still present and
you want it to enrich its push payload from the plugin's view of
agents. Disable by setting `monitor_port: 0`.
- **Calendar scheduler** — heartbeats `<backendUrl>/calendar/agent/
heartbeat` every interval, receives any TimeSlots due to fire, and
dispatches them through `HostAPI.WakeAgent` (state-aware queue
@@ -64,15 +74,24 @@ And configure at `~/.plexum/plugins/harbor-forge/config.json`:
```json
{
"backendUrl": "https://monitor.hangman-lab.top",
"backendUrl": "https://hf-api.hangman-lab.top",
"identifier": "server-t3",
"apiKey": "g1_xxx",
"monitor_port": 9100,
"monitor_push_enabled": true,
"monitor_push_interval_seconds": 30,
"monitor_port": 0,
"calendar_enabled": true,
"calendar_heartbeat_interval_seconds": 30
}
```
Replacing the standalone `harborforge-monitor` container: lift the
container's `HF_MONITER_API_KEY` into `apiKey`, set
`monitor_push_enabled: true`, then `docker rm -f harborforge-monitor`
once you've confirmed the plugin's pushes are landing (the backend's
`server_states.last_seen_at` should keep advancing without the
container running).
Restart the host (`systemctl --user restart plexum`) and verify:
```bash

View File

@@ -41,6 +41,7 @@ type harborForgePlugin struct {
host sdkplugin.HostAPI
cfg hfcfg.Resolved
bridge *monitor.Bridge
pusher *monitor.Pusher
sched *calendar.Scheduler
deps tools.Deps
cancelBg context.CancelFunc
@@ -70,6 +71,7 @@ func (p *harborForgePlugin) Init(ctx context.Context, host sdkplugin.HostAPI) er
"backend": p.cfg.BackendURL,
"identifier": p.cfg.Identifier,
"monitor_port": p.cfg.MonitorPort,
"monitor_push_enabled": p.cfg.MonitorPushEnabled,
"calendar_enabled": p.cfg.CalendarEnabled,
})
@@ -79,15 +81,22 @@ func (p *harborForgePlugin) Init(ctx context.Context, host sdkplugin.HostAPI) er
// Listers + collectors capture bgCtx (not Init ctx) — Init returns
// once MCP initialize completes, but the plugin process lives on
// and so do the goroutines + closures we registered.
collect := func() telemetry.Snapshot {
makeCollector := func(sampleCPU bool) func() telemetry.Snapshot {
return func() telemetry.Snapshot {
return telemetry.Collect(telemetry.CollectOpts{
Identifier: p.cfg.Identifier,
Version: Version,
SampleCPU: sampleCPU,
AgentLister: func() []telemetry.AgentInfo {
return p.listAgents(bgCtx, profileRoot)
},
})
}
}
// Bridge serves on-demand reads; cheap, no CPU sampling.
collect := makeCollector(false)
// Pusher runs the slow push loop; CPU sampling fine here.
collectForPush := makeCollector(true)
p.bridge = monitor.New(p.cfg.MonitorPort, collect,
func(level, msg string, attrs map[string]any) { host.Log(level, msg, attrs) })
@@ -96,6 +105,25 @@ func (p *harborForgePlugin) Init(ctx context.Context, host sdkplugin.HostAPI) er
host.Log("warn", "monitor bridge failed to start", map[string]any{"err": err.Error()})
}
// Active push loop — replaces the standalone harborforge-monitor
// container. Off by default; operator opts in via
// monitor_push_enabled + apiKey.
p.pusher = monitor.NewPusher(monitor.PusherConfig{
BackendURL: p.cfg.BackendURL,
APIKey: p.cfg.APIKey,
Interval: time.Duration(p.cfg.MonitorPushIntervalSeconds) * time.Second,
}, collectForPush,
func(level, msg string, attrs map[string]any) { host.Log(level, msg, attrs) })
if p.cfg.MonitorPushEnabled {
p.wg.Add(1)
go func() {
defer p.wg.Done()
if err := p.pusher.Run(bgCtx); err != nil && !errors.Is(err, context.Canceled) {
host.Log("warn", "monitor pusher exited", map[string]any{"err": err.Error()})
}
}()
}
calBackend := p.cfg.CalendarBackendURL
if calBackend == "" {
calBackend = p.cfg.BackendURL
@@ -128,6 +156,7 @@ func (p *harborForgePlugin) Init(ctx context.Context, host sdkplugin.HostAPI) er
Version: Version,
Collect: collect,
Bridge: p.bridge,
Pusher: p.pusher,
Scheduler: p.sched,
Host: host,
AgentIDFromCtx: func(ctx context.Context) string {

View File

@@ -35,6 +35,17 @@ type Config struct {
// server listens on. Zero/missing disables the bridge entirely.
MonitorPort int `json:"monitor_port,omitempty"`
// MonitorPushEnabled toggles the active push loop that uploads
// system telemetry to BackendURL /monitor/server/heartbeat. Lets
// HF plugin replace the standalone harborforge-monitor container.
// nil (unset) defaults to false; operators must opt in explicitly
// since they need to provision APIKey too.
MonitorPushEnabled *bool `json:"monitor_push_enabled,omitempty"`
// MonitorPushIntervalSeconds — defaults to 30s when ≤0. Mirrors
// the standalone monitor's HF_MONITER_REPORT_INTERVAL knob.
MonitorPushIntervalSeconds int `json:"monitor_push_interval_seconds,omitempty"`
// CalendarHeartbeatIntervalSeconds — defaults to 30s when ≤0.
CalendarHeartbeatIntervalSeconds int `json:"calendar_heartbeat_interval_seconds,omitempty"`
@@ -57,6 +68,8 @@ type Resolved struct {
Identifier string
APIKey string
MonitorPort int
MonitorPushEnabled bool
MonitorPushIntervalSeconds int
CalendarEnabled bool
CalendarHeartbeatIntervalSeconds int
CalendarBackendURL string
@@ -104,6 +117,8 @@ func Resolve(c Config) Resolved {
Identifier: c.Identifier,
APIKey: c.APIKey,
MonitorPort: c.MonitorPort,
MonitorPushEnabled: false,
MonitorPushIntervalSeconds: 30,
CalendarEnabled: true,
CalendarHeartbeatIntervalSeconds: 30,
CalendarBackendURL: c.CalendarBackendURL,
@@ -127,6 +142,12 @@ func Resolve(c Config) Resolved {
if c.CalendarHeartbeatIntervalSeconds > 0 {
out.CalendarHeartbeatIntervalSeconds = c.CalendarHeartbeatIntervalSeconds
}
if c.MonitorPushEnabled != nil {
out.MonitorPushEnabled = *c.MonitorPushEnabled
}
if c.MonitorPushIntervalSeconds > 0 {
out.MonitorPushIntervalSeconds = c.MonitorPushIntervalSeconds
}
if c.RestartPollIntervalSeconds > 0 {
out.RestartPollIntervalSeconds = c.RestartPollIntervalSeconds
}

223
internal/monitor/pusher.go Normal file
View File

@@ -0,0 +1,223 @@
// Pusher periodically uploads system telemetry to the HarborForge
// backend's /monitor/server/heartbeat endpoint. Replaces the standalone
// `harborforge-monitor` daemon — the plugin's lifecycle (host gateway
// start/stop) bounds the heartbeat loop, so no separate process need
// supervise it.
//
// Wire shape mirrors HarborForge.Monitor's `telemetry.Payload`
// (flat `cpu_pct/mem_pct/...` fields + `X-API-Key` header). The
// translation from internal `telemetry.Snapshot` to that shape lives
// in buildPayload; HF backend stays unchanged.
package monitor
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"sync"
"time"
"git.hangman-lab.top/zhi/HarborForge.PlexumPlugin/internal/telemetry"
)
// PushPayload is the wire shape POSTed to /monitor/server/heartbeat —
// 1:1 with HarborForge.Monitor's `telemetry.Payload`.
type PushPayload struct {
Identifier string `json:"identifier"`
PluginVersion string `json:"plugin_version,omitempty"`
Agents []any `json:"agents"`
NginxInstalled bool `json:"nginx_installed"`
NginxSites []string `json:"nginx_sites"`
CPUPct float64 `json:"cpu_pct,omitempty"`
MemPct float64 `json:"mem_pct,omitempty"`
DiskPct float64 `json:"disk_pct,omitempty"`
SwapPct float64 `json:"swap_pct,omitempty"`
LoadAvg []float64 `json:"load_avg,omitempty"`
UptimeSeconds uint64 `json:"uptime_seconds,omitempty"`
}
// PusherConfig is the operator-supplied tuning.
type PusherConfig struct {
BackendURL string // e.g. https://hf-api.hangman-lab.top
APIKey string // sent as X-API-Key
Interval time.Duration // default 30s when <=0
}
// Pusher runs the periodic POST loop. One per plugin process.
type Pusher struct {
cfg PusherConfig
collect func() telemetry.Snapshot
log LogFunc
http *http.Client
// stats — for the monitor_telemetry tool / status surfacing.
mu sync.RWMutex
lastSentAt time.Time
lastStatus int
lastErr string
successHits uint64
errHits uint64
}
// NewPusher constructs the loop runner. collect must be a snapshot
// producer (caller usually wires it to telemetry.Collect with
// SampleCPU=true).
func NewPusher(cfg PusherConfig, collect func() telemetry.Snapshot, log LogFunc) *Pusher {
if cfg.Interval <= 0 {
cfg.Interval = 30 * time.Second
}
if log == nil {
log = func(string, string, map[string]any) {}
}
return &Pusher{
cfg: cfg,
collect: collect,
log: log,
http: &http.Client{Timeout: 15 * time.Second},
}
}
// Run drives the push loop until ctx is cancelled. Returns ctx.Err().
// First push happens immediately so the backend sees this claw alive
// without waiting an interval.
func (p *Pusher) Run(ctx context.Context) error {
if p.cfg.BackendURL == "" {
p.log("warn", "monitor push disabled (empty backendURL)", nil)
return nil
}
if p.cfg.APIKey == "" {
p.log("warn", "monitor push disabled (empty apiKey)", nil)
return nil
}
url := strings.TrimRight(p.cfg.BackendURL, "/") + "/monitor/server/heartbeat"
tick := time.NewTicker(p.cfg.Interval)
defer tick.Stop()
p.pushOnce(ctx, url)
for {
select {
case <-ctx.Done():
return ctx.Err()
case <-tick.C:
p.pushOnce(ctx, url)
}
}
}
func (p *Pusher) pushOnce(ctx context.Context, url string) {
snap := p.collect()
body, err := json.Marshal(buildPayload(snap))
if err != nil {
p.recordErr("marshal: " + err.Error())
return
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
if err != nil {
p.recordErr("build req: " + err.Error())
return
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("X-API-Key", p.cfg.APIKey)
res, err := p.http.Do(req)
if err != nil {
p.recordErr("send: " + err.Error())
p.log("warn", "monitor push failed", map[string]any{"err": err.Error()})
return
}
defer res.Body.Close()
raw, _ := io.ReadAll(res.Body)
if res.StatusCode < 200 || res.StatusCode >= 300 {
p.recordErr(fmt.Sprintf("%d: %s", res.StatusCode, truncate(raw, 200)))
p.log("warn", "monitor push non-2xx", map[string]any{
"status": res.StatusCode, "body": truncate(raw, 200),
})
return
}
p.recordOK(res.StatusCode)
}
// Stats exposes a copy of the latest push state for diagnostics
// (harborforge_monitor_telemetry tool surfaces this).
type PushStats struct {
LastSentAt time.Time
LastStatus int
LastErr string
SuccessCount uint64
ErrorCount uint64
}
func (p *Pusher) Stats() PushStats {
p.mu.RLock()
defer p.mu.RUnlock()
return PushStats{
LastSentAt: p.lastSentAt,
LastStatus: p.lastStatus,
LastErr: p.lastErr,
SuccessCount: p.successHits,
ErrorCount: p.errHits,
}
}
func (p *Pusher) recordOK(status int) {
p.mu.Lock()
defer p.mu.Unlock()
p.lastSentAt = time.Now().UTC()
p.lastStatus = status
p.lastErr = ""
p.successHits++
}
func (p *Pusher) recordErr(msg string) {
p.mu.Lock()
defer p.mu.Unlock()
p.lastSentAt = time.Now().UTC()
p.lastErr = msg
p.errHits++
}
// buildPayload translates the internal Snapshot into the flat
// PushPayload shape the backend expects. agents is passed through as
// []any (one entry per agent — id/model/state preserved).
func buildPayload(snap telemetry.Snapshot) PushPayload {
agents := make([]any, 0, len(snap.Agents))
for _, a := range snap.Agents {
agents = append(agents, map[string]any{
"id": a.ID,
"model": a.Model,
"state": a.State,
})
}
return PushPayload{
Identifier: snap.Identifier,
PluginVersion: snap.PluginInfo.Version,
Agents: agents,
// nginx detection is independent monitor's responsibility today;
// HF plugin leaves it blank rather than rediscovering nginx
// state. Operators that need it can keep the standalone monitor
// alongside or wait for a follow-up commit.
NginxInstalled: false,
NginxSites: []string{},
CPUPct: round1(snap.CPU.UsedPercent),
MemPct: round1(snap.Memory.UsedPercent),
DiskPct: round1(snap.Disk.UsedPercent),
SwapPct: round1(snap.Swap.UsedPercent),
LoadAvg: []float64{round2(snap.Load.One), round2(snap.Load.Five), round2(snap.Load.Fifteen)},
UptimeSeconds: snap.UptimeSecs,
}
}
func round1(v float64) float64 { return float64(int64(v*10+0.5)) / 10 }
func round2(v float64) float64 { return float64(int64(v*100+0.5)) / 100 }
func truncate(b []byte, n int) string {
if len(b) <= n {
return string(b)
}
return string(b[:n]) + "…"
}

View File

@@ -29,14 +29,30 @@ type Snapshot struct {
Hostname string `json:"hostname"`
UptimeSecs uint64 `json:"uptime"`
Memory MemoryInfo `json:"memory"`
Swap SwapInfo `json:"swap"`
Load LoadInfo `json:"load"`
Disk DiskInfo `json:"disk"`
CPU CPUInfo `json:"cpu"`
Agents []AgentInfo `json:"agents"`
PluginInfo PluginInfo `json:"plugin"`
CapturedAt time.Time `json:"captured_at"`
HostMetadata map[string]string `json:"host_metadata,omitempty"`
}
// SwapInfo is the system swap usage. Zeroes when swap isn't configured.
type SwapInfo struct {
Total uint64 `json:"total"`
Free uint64 `json:"free"`
Used uint64 `json:"used"`
UsedPercent float64 `json:"used_percent"`
}
// CPUInfo holds the most recent CPU usage estimate. UsedPercent is
// computed across one sample interval (see Collect's cpu helper).
type CPUInfo struct {
UsedPercent float64 `json:"used_percent"`
}
// MemoryInfo mirrors OpenclawPlugin's memory shape.
type MemoryInfo struct {
Total uint64 `json:"total"` // bytes
@@ -84,15 +100,27 @@ type CollectOpts struct {
Identifier string
Version string
AgentLister func() []AgentInfo // resolved by the caller (plugin uses HostAPI to walk agents)
// SampleCPU asks Collect to take a 1-second CPU sample. Off-path
// (status endpoint, bridge serve) leave false to keep calls cheap;
// the slow push loop sets it true.
SampleCPU bool
}
// Collect produces a fresh snapshot from /proc + the supplied AgentLister.
// SampleCPU=true takes a 1-second CPU sample (two reads of /proc/stat
// with a sleep between); otherwise CPU usage stays zero. Set true on
// the slow push loop, false on the cheap on-demand status endpoint.
func Collect(opts CollectOpts) Snapshot {
now := time.Now().UTC()
host, _ := os.Hostname()
mem := readMemInfo()
mem, swap := readMemAndSwap()
load := readLoadAvg()
disk := readDiskRoot()
cpu := CPUInfo{}
if opts.SampleCPU {
cpu.UsedPercent = sampleCPUPercent(time.Second)
}
var agents []AgentInfo
if opts.AgentLister != nil {
agents = opts.AgentLister()
@@ -103,8 +131,10 @@ func Collect(opts CollectOpts) Snapshot {
Hostname: host,
UptimeSecs: readUptime(),
Memory: mem,
Swap: swap,
Load: load,
Disk: disk,
CPU: cpu,
Agents: agents,
PluginInfo: PluginInfo{
Name: "harbor-forge",
@@ -117,10 +147,10 @@ func Collect(opts CollectOpts) Snapshot {
// ---- /proc helpers ----
func readMemInfo() MemoryInfo {
func readMemAndSwap() (MemoryInfo, SwapInfo) {
f, err := os.Open("/proc/meminfo")
if err != nil {
return MemoryInfo{}
return MemoryInfo{}, SwapInfo{}
}
defer f.Close()
fields := map[string]uint64{}
@@ -145,6 +175,12 @@ func readMemInfo() MemoryInfo {
// All MemInfo values are in KB; convert to bytes.
fields[key] = v * 1024
}
mem := buildMemInfo(fields)
swap := buildSwapInfo(fields)
return mem, swap
}
func buildMemInfo(fields map[string]uint64) MemoryInfo {
total := fields["MemTotal"]
free := fields["MemAvailable"]
if free == 0 {
@@ -158,6 +194,67 @@ func readMemInfo() MemoryInfo {
return MemoryInfo{Total: total, Free: free, Used: used, UsedPercent: pct}
}
func buildSwapInfo(fields map[string]uint64) SwapInfo {
total := fields["SwapTotal"]
free := fields["SwapFree"]
if total == 0 {
return SwapInfo{}
}
used := total - free
pct := float64(used) / float64(total) * 100
return SwapInfo{Total: total, Free: free, Used: used, UsedPercent: pct}
}
// sampleCPUPercent computes overall CPU usage across one sample
// interval. Two reads of /proc/stat's aggregate "cpu" line, derive
// busy-time delta as (1 - idle/total). Returns 0 on read failure.
func sampleCPUPercent(interval time.Duration) float64 {
total1, idle1, ok := readCPUStat()
if !ok {
return 0
}
time.Sleep(interval)
total2, idle2, ok := readCPUStat()
if !ok || total2 <= total1 {
return 0
}
totalDelta := total2 - total1
idleDelta := idle2 - idle1
if idleDelta > totalDelta {
return 0
}
return float64(totalDelta-idleDelta) / float64(totalDelta) * 100
}
func readCPUStat() (total, idle uint64, ok bool) {
f, err := os.Open("/proc/stat")
if err != nil {
return 0, 0, false
}
defer f.Close()
sc := bufio.NewScanner(f)
if !sc.Scan() {
return 0, 0, false
}
parts := strings.Fields(sc.Text())
if len(parts) < 5 || parts[0] != "cpu" {
return 0, 0, false
}
for i := 1; i < len(parts); i++ {
v, err := strconv.ParseUint(parts[i], 10, 64)
if err != nil {
return 0, 0, false
}
total += v
// idle is the 4th column (parts[4]); iowait (parts[5]) is also
// idle-ish but we count it as busy to match gopsutil's default.
if i == 4 {
idle = v
}
}
return total, idle, true
}
func readLoadAvg() LoadInfo {
raw, err := os.ReadFile("/proc/loadavg")
if err != nil {

View File

@@ -27,6 +27,7 @@ type Deps struct {
Version string
Collect func() telemetry.Snapshot
Bridge *monitor.Bridge
Pusher *monitor.Pusher
Scheduler *calendar.Scheduler
Host sdkplugin.HostAPI
@@ -89,11 +90,32 @@ func toolStatus(deps Deps) (sdkplugin.ToolResult, error) {
"queries": bs.Queries,
"last_query": bs.LastQuery,
},
"monitor_push": monitorPushSummary(deps),
"calendar": sch,
}
return jsonResult(out)
}
// monitorPushSummary returns the pusher's last-known state in the same
// JSON layout the status/monitor_telemetry tools surface. Nil-safe: if
// no pusher is wired (testing, push disabled), reports enabled=false.
func monitorPushSummary(deps Deps) map[string]any {
out := map[string]any{
"enabled": deps.Config.MonitorPushEnabled,
"interval_seconds": deps.Config.MonitorPushIntervalSeconds,
"endpoint": deps.Config.BackendURL + "/monitor/server/heartbeat",
}
if deps.Pusher != nil {
st := deps.Pusher.Stats()
out["last_sent_at"] = st.LastSentAt
out["last_status"] = st.LastStatus
out["last_err"] = st.LastErr
out["success_count"] = st.SuccessCount
out["error_count"] = st.ErrorCount
}
return out
}
func toolTelemetry(deps Deps) (sdkplugin.ToolResult, error) {
return jsonResult(deps.Collect())
}
@@ -101,11 +123,14 @@ func toolTelemetry(deps Deps) (sdkplugin.ToolResult, error) {
func toolMonitorTelemetry(deps Deps) (sdkplugin.ToolResult, error) {
bs := deps.Bridge.Stats()
return jsonResult(map[string]any{
"bridge": map[string]any{
"port": bs.Port,
"listening": bs.Listening,
"queries": bs.Queries,
"last_query": bs.LastQuery,
"last_snapshot": bs.LastSnap,
},
"push": monitorPushSummary(deps),
})
}

View File

@@ -37,12 +37,16 @@ Next steps:
"harbor-forge"
2. Write ${PLUGIN_DIR}/config.json — sample:
{
"backendUrl": "https://monitor.hangman-lab.top",
"backendUrl": "https://hf-api.hangman-lab.top",
"identifier": "server-t3",
"apiKey": "g1_xxx",
"monitor_port": 9100,
"apiKey": "<copy from HF_MONITER_API_KEY>",
"monitor_push_enabled": true,
"monitor_push_interval_seconds": 30,
"monitor_port": 0,
"calendar_enabled": true,
"calendar_heartbeat_interval_seconds": 30
}
3. Restart the host: systemctl --user restart plexum
4. Verify push is landing (DB last_seen_at advancing) and then
remove the standalone harborforge-monitor container.
EOF