feat(monitor): active push loop replacing standalone monitor

Adds a periodic POST loop to <backend>/monitor/server/heartbeat so
the HF plugin can take over the standalone harborforge-monitor daemon's
job — same X-API-Key header, same flat telemetry shape (cpu_pct /
mem_pct / disk_pct / swap_pct / load_avg / uptime_seconds /
plugin_version / agents[]). HF backend stays unchanged.

Config: monitor_push_enabled (default false; opt-in to avoid surprise
heartbeats from existing deployments), monitor_push_interval_seconds
(default 30); the existing apiKey is reused for the X-API-Key header.
To migrate: copy the container's HF_MONITER_API_KEY into config.apiKey,
set monitor_push_enabled to true, then docker rm -f the container — the
DB's last_seen_at keeps advancing under the plugin's loop.

Collector grew swap + cpu sampling (two reads of /proc/stat over a
1-second window when SampleCPU=true). Bridge endpoint stays cheap
(SampleCPU=false on demand); push loop is the only caller paying the
sampling cost.

E2E in sim: monitor_push_enabled=true + apiKey from injected
MonitoredServer row → server_states.last_seen_at advances exactly
every interval_seconds (10s configured, 10s observed). cpu/mem/disk/
swap_pct all populate correctly.
This commit is contained in:
h z
2026-06-03 13:04:51 +01:00
parent 472cecd771
commit 6e3ad669f8
7 changed files with 448 additions and 30 deletions

View File

@@ -29,14 +29,30 @@ type Snapshot struct {
Hostname string `json:"hostname"`
UptimeSecs uint64 `json:"uptime"`
Memory MemoryInfo `json:"memory"`
Swap SwapInfo `json:"swap"`
Load LoadInfo `json:"load"`
Disk DiskInfo `json:"disk"`
CPU CPUInfo `json:"cpu"`
Agents []AgentInfo `json:"agents"`
PluginInfo PluginInfo `json:"plugin"`
CapturedAt time.Time `json:"captured_at"`
HostMetadata map[string]string `json:"host_metadata,omitempty"`
}
// SwapInfo is the system swap usage, derived from the SwapTotal and
// SwapFree entries of /proc/meminfo. All fields are zero when swap
// isn't configured (SwapTotal is 0).
type SwapInfo struct {
Total uint64 `json:"total"` // bytes
Free uint64 `json:"free"` // bytes
Used uint64 `json:"used"` // bytes; Total minus Free
UsedPercent float64 `json:"used_percent"` // Used / Total * 100
}
// CPUInfo holds the most recent CPU usage estimate. UsedPercent is
// computed across one sample interval by sampleCPUPercent, which
// Collect only calls when CollectOpts.SampleCPU is true; otherwise the
// field stays zero.
type CPUInfo struct {
UsedPercent float64 `json:"used_percent"` // busy share of the sample interval, in percent; 0 when unsampled or when /proc/stat could not be read
}
// MemoryInfo mirrors OpenclawPlugin's memory shape.
type MemoryInfo struct {
Total uint64 `json:"total"` // bytes
@@ -84,15 +100,27 @@ type CollectOpts struct {
Identifier string
Version string
AgentLister func() []AgentInfo // resolved by the caller (plugin uses HostAPI to walk agents)
// SampleCPU asks Collect to take a 1-second CPU sample. Callers off
// the push path (status endpoint, bridge serve) leave it false to keep
// calls cheap; the slow push loop sets it true.
SampleCPU bool
}
// Collect produces a fresh snapshot from /proc + the supplied AgentLister.
// SampleCPU=true takes a 1-second CPU sample (two reads of /proc/stat
// with a sleep between); otherwise CPU usage stays zero. Set true on
// the slow push loop, false on the cheap on-demand status endpoint.
func Collect(opts CollectOpts) Snapshot {
now := time.Now().UTC()
host, _ := os.Hostname()
mem := readMemInfo()
mem, swap := readMemAndSwap()
load := readLoadAvg()
disk := readDiskRoot()
cpu := CPUInfo{}
if opts.SampleCPU {
cpu.UsedPercent = sampleCPUPercent(time.Second)
}
var agents []AgentInfo
if opts.AgentLister != nil {
agents = opts.AgentLister()
@@ -103,8 +131,10 @@ func Collect(opts CollectOpts) Snapshot {
Hostname: host,
UptimeSecs: readUptime(),
Memory: mem,
Swap: swap,
Load: load,
Disk: disk,
CPU: cpu,
Agents: agents,
PluginInfo: PluginInfo{
Name: "harbor-forge",
@@ -117,10 +147,10 @@ func Collect(opts CollectOpts) Snapshot {
// ---- /proc helpers ----
func readMemInfo() MemoryInfo {
func readMemAndSwap() (MemoryInfo, SwapInfo) {
f, err := os.Open("/proc/meminfo")
if err != nil {
return MemoryInfo{}
return MemoryInfo{}, SwapInfo{}
}
defer f.Close()
fields := map[string]uint64{}
@@ -145,6 +175,12 @@ func readMemInfo() MemoryInfo {
// All MemInfo values are in KB; convert to bytes.
fields[key] = v * 1024
}
mem := buildMemInfo(fields)
swap := buildSwapInfo(fields)
return mem, swap
}
func buildMemInfo(fields map[string]uint64) MemoryInfo {
total := fields["MemTotal"]
free := fields["MemAvailable"]
if free == 0 {
@@ -158,6 +194,67 @@ func readMemInfo() MemoryInfo {
return MemoryInfo{Total: total, Free: free, Used: used, UsedPercent: pct}
}
// buildSwapInfo derives swap usage from parsed /proc/meminfo fields.
//
// fields is keyed by meminfo entry name; only "SwapTotal" and "SwapFree"
// are read, and a missing key reads as 0. Values are passed through in
// whatever unit the caller stored them.
//
// Returns the zero SwapInfo when SwapTotal is 0 (no swap configured),
// which also avoids a division by zero. Otherwise Used is Total minus
// Free and UsedPercent is Used / Total * 100.
func buildSwapInfo(fields map[string]uint64) SwapInfo {
	total := fields["SwapTotal"]
	if total == 0 {
		return SwapInfo{}
	}
	free := fields["SwapFree"]

	// Saturate at zero instead of letting the unsigned subtraction wrap
	// around: an inconsistent meminfo (SwapFree > SwapTotal) would
	// otherwise report an enormous Used and a UsedPercent far above 100.
	var used uint64
	if free < total {
		used = total - free
	}

	pct := float64(used) / float64(total) * 100
	return SwapInfo{Total: total, Free: free, Used: used, UsedPercent: pct}
}
// sampleCPUPercent estimates overall CPU usage, in percent, over a
// single sampling window of length interval.
//
// It takes one readCPUStat reading, sleeps for interval (blocking the
// calling goroutine), takes a second reading, and reports the non-idle
// share of the ticks that elapsed between the two: (1 - idle/total) * 100.
//
// Returns 0 when either reading fails, when the total counter did not
// advance, or when the idle delta exceeds the total delta.
func sampleCPUPercent(interval time.Duration) float64 {
	startTotal, startIdle, ok := readCPUStat()
	if !ok {
		return 0
	}

	time.Sleep(interval)

	endTotal, endIdle, ok := readCPUStat()
	switch {
	case !ok:
		return 0
	case endTotal <= startTotal:
		// No ticks elapsed (or the counter went backwards): nothing to divide by.
		return 0
	}

	elapsed := endTotal - startTotal
	// Unsigned subtraction: if the idle counter moved backwards this
	// wraps to a huge value, which the guard below rejects.
	idled := endIdle - startIdle
	if idled > elapsed {
		return 0
	}

	busy := elapsed - idled
	return float64(busy) / float64(elapsed) * 100
}
// readCPUStat reads the aggregate "cpu" line (the first line) of
// /proc/stat and returns the cumulative tick counters needed to compute
// CPU usage between two readings.
//
// total is the sum of the user, nice, system, idle, iowait, irq, softirq
// and steal columns (as many of them as the line provides). The trailing
// guest and guest_nice columns are deliberately left out: Linux already
// accounts guest time inside user and nice, so adding them would count
// that time twice and distort the busy/total ratio on hosts running VMs.
//
// idle is the 4th column only. iowait is part of total but not of idle,
// i.e. it is treated as busy time.
//
// ok is false (with total and idle both 0) when the file cannot be
// opened or read, the first line is not the aggregate "cpu" line, it has
// fewer than four value columns, or a counted column is not an unsigned
// integer.
func readCPUStat() (total, idle uint64, ok bool) {
	f, err := os.Open("/proc/stat")
	if err != nil {
		return 0, 0, false
	}
	defer f.Close()

	sc := bufio.NewScanner(f)
	if !sc.Scan() {
		return 0, 0, false
	}
	parts := strings.Fields(sc.Text())
	if len(parts) < 5 || parts[0] != "cpu" {
		return 0, 0, false
	}

	// Column layout after the "cpu" label:
	//   1 user  2 nice  3 system  4 idle  5 iowait  6 irq  7 softirq
	//   8 steal  9 guest  10 guest_nice
	const (
		idleCol        = 4
		lastCountedCol = 8 // steal; guest columns are already inside user/nice
	)
	last := len(parts) - 1
	if last > lastCountedCol {
		last = lastCountedCol
	}

	for i := 1; i <= last; i++ {
		v, err := strconv.ParseUint(parts[i], 10, 64)
		if err != nil {
			return 0, 0, false
		}
		total += v
		if i == idleCol {
			idle = v
		}
	}
	return total, idle, true
}
func readLoadAvg() LoadInfo {
raw, err := os.ReadFile("/proc/loadavg")
if err != nil {