feat(monitor): active push loop replacing standalone monitor
Adds a periodic POST loop to <backend>/monitor/server/heartbeat so the HF plugin can take over the standalone harborforge-monitor daemon's job — same X-API-Key header, same flat telemetry shape (cpu_pct / mem_pct / disk_pct / swap_pct / load_avg / uptime_seconds / plugin_version / agents[]). HF backend stays unchanged. Config: monitor_push_enabled (default false; opt-in to avoid surprise heartbeats from existing deployments), monitor_push_interval_seconds (default 30), reuses apiKey for the X-API-Key header. Lift the container's HF_MONITER_API_KEY into config.apiKey, flip monitor_push_enabled true, then docker rm -f the container — DB last_seen_at keeps advancing under the plugin's loop. Collector grew swap + cpu sampling (two reads of /proc/stat over a 1-second window when SampleCPU=true). Bridge endpoint stays cheap (SampleCPU=false on demand); push loop is the only caller paying the sampling cost. E2E in sim: monitor_push_enabled=true + apiKey from injected MonitoredServer row → server_states.last_seen_at advances exactly every interval_seconds (10s configured, 10s observed). cpu/mem/disk/swap_pct all populate correctly.
This commit is contained in:
@@ -29,14 +29,30 @@ type Snapshot struct {
|
||||
Hostname string `json:"hostname"`
|
||||
UptimeSecs uint64 `json:"uptime"`
|
||||
Memory MemoryInfo `json:"memory"`
|
||||
Swap SwapInfo `json:"swap"`
|
||||
Load LoadInfo `json:"load"`
|
||||
Disk DiskInfo `json:"disk"`
|
||||
CPU CPUInfo `json:"cpu"`
|
||||
Agents []AgentInfo `json:"agents"`
|
||||
PluginInfo PluginInfo `json:"plugin"`
|
||||
CapturedAt time.Time `json:"captured_at"`
|
||||
HostMetadata map[string]string `json:"host_metadata,omitempty"`
|
||||
}
|
||||
|
||||
// SwapInfo describes system swap usage. All fields are zero when the
// host has no swap configured (SwapTotal is reported as 0).
type SwapInfo struct {
	// Total is the configured amount of swap.
	Total uint64 `json:"total"`
	// Free is the amount of swap currently unused.
	Free uint64 `json:"free"`
	// Used is Total minus Free.
	Used uint64 `json:"used"`
	// UsedPercent is Used as a percentage of Total (0-100).
	UsedPercent float64 `json:"used_percent"`
}
|
||||
|
||||
// CPUInfo carries the latest CPU utilisation estimate.
type CPUInfo struct {
	// UsedPercent is the busy share of CPU time measured over a single
	// sample interval, expressed as a percentage. It stays at zero when
	// no sample was taken.
	UsedPercent float64 `json:"used_percent"`
}
|
||||
|
||||
// MemoryInfo mirrors OpenclawPlugin's memory shape.
|
||||
type MemoryInfo struct {
|
||||
Total uint64 `json:"total"` // bytes
|
||||
@@ -84,15 +100,27 @@ type CollectOpts struct {
|
||||
Identifier string
|
||||
Version string
|
||||
AgentLister func() []AgentInfo // resolved by the caller (plugin uses HostAPI to walk agents)
|
||||
|
||||
// SampleCPU asks Collect to take a 1-second CPU sample. Off-path
|
||||
// (status endpoint, bridge serve) leave false to keep calls cheap;
|
||||
// the slow push loop sets it true.
|
||||
SampleCPU bool
|
||||
}
|
||||
|
||||
// Collect produces a fresh snapshot from /proc + the supplied AgentLister.
|
||||
// SampleCPU=true takes a 1-second CPU sample (two reads of /proc/stat
|
||||
// with a sleep between); otherwise CPU usage stays zero. Set true on
|
||||
// the slow push loop, false on the cheap on-demand status endpoint.
|
||||
func Collect(opts CollectOpts) Snapshot {
|
||||
now := time.Now().UTC()
|
||||
host, _ := os.Hostname()
|
||||
mem := readMemInfo()
|
||||
mem, swap := readMemAndSwap()
|
||||
load := readLoadAvg()
|
||||
disk := readDiskRoot()
|
||||
cpu := CPUInfo{}
|
||||
if opts.SampleCPU {
|
||||
cpu.UsedPercent = sampleCPUPercent(time.Second)
|
||||
}
|
||||
var agents []AgentInfo
|
||||
if opts.AgentLister != nil {
|
||||
agents = opts.AgentLister()
|
||||
@@ -103,8 +131,10 @@ func Collect(opts CollectOpts) Snapshot {
|
||||
Hostname: host,
|
||||
UptimeSecs: readUptime(),
|
||||
Memory: mem,
|
||||
Swap: swap,
|
||||
Load: load,
|
||||
Disk: disk,
|
||||
CPU: cpu,
|
||||
Agents: agents,
|
||||
PluginInfo: PluginInfo{
|
||||
Name: "harbor-forge",
|
||||
@@ -117,10 +147,10 @@ func Collect(opts CollectOpts) Snapshot {
|
||||
|
||||
// ---- /proc helpers ----
|
||||
|
||||
func readMemInfo() MemoryInfo {
|
||||
func readMemAndSwap() (MemoryInfo, SwapInfo) {
|
||||
f, err := os.Open("/proc/meminfo")
|
||||
if err != nil {
|
||||
return MemoryInfo{}
|
||||
return MemoryInfo{}, SwapInfo{}
|
||||
}
|
||||
defer f.Close()
|
||||
fields := map[string]uint64{}
|
||||
@@ -145,6 +175,12 @@ func readMemInfo() MemoryInfo {
|
||||
// All MemInfo values are in KB; convert to bytes.
|
||||
fields[key] = v * 1024
|
||||
}
|
||||
mem := buildMemInfo(fields)
|
||||
swap := buildSwapInfo(fields)
|
||||
return mem, swap
|
||||
}
|
||||
|
||||
func buildMemInfo(fields map[string]uint64) MemoryInfo {
|
||||
total := fields["MemTotal"]
|
||||
free := fields["MemAvailable"]
|
||||
if free == 0 {
|
||||
@@ -158,6 +194,67 @@ func readMemInfo() MemoryInfo {
|
||||
return MemoryInfo{Total: total, Free: free, Used: used, UsedPercent: pct}
|
||||
}
|
||||
|
||||
func buildSwapInfo(fields map[string]uint64) SwapInfo {
|
||||
total := fields["SwapTotal"]
|
||||
free := fields["SwapFree"]
|
||||
if total == 0 {
|
||||
return SwapInfo{}
|
||||
}
|
||||
used := total - free
|
||||
pct := float64(used) / float64(total) * 100
|
||||
return SwapInfo{Total: total, Free: free, Used: used, UsedPercent: pct}
|
||||
}
|
||||
|
||||
// sampleCPUPercent computes overall CPU usage across one sample
|
||||
// interval. Two reads of /proc/stat's aggregate "cpu" line, derive
|
||||
// busy-time delta as (1 - idle/total). Returns 0 on read failure.
|
||||
func sampleCPUPercent(interval time.Duration) float64 {
|
||||
total1, idle1, ok := readCPUStat()
|
||||
if !ok {
|
||||
return 0
|
||||
}
|
||||
time.Sleep(interval)
|
||||
total2, idle2, ok := readCPUStat()
|
||||
if !ok || total2 <= total1 {
|
||||
return 0
|
||||
}
|
||||
totalDelta := total2 - total1
|
||||
idleDelta := idle2 - idle1
|
||||
if idleDelta > totalDelta {
|
||||
return 0
|
||||
}
|
||||
return float64(totalDelta-idleDelta) / float64(totalDelta) * 100
|
||||
}
|
||||
|
||||
// readCPUStat reads the aggregate "cpu" line (the first line) of
// /proc/stat and returns the summed CPU time counters and the idle
// counter. ok is false, with both counters zero, when the file cannot
// be opened or read, or when the first line is not a well-formed
// aggregate "cpu" line.
func readCPUStat() (total, idle uint64, ok bool) {
	f, err := os.Open("/proc/stat")
	if err != nil {
		return 0, 0, false
	}
	defer f.Close()

	sc := bufio.NewScanner(f)
	if !sc.Scan() {
		return 0, 0, false
	}
	return parseCPUStatLine(sc.Text())
}

// parseCPUStatLine parses one aggregate "cpu" line from /proc/stat.
//
// Column layout after the "cpu" label:
//
//	user nice system idle iowait irq softirq steal guest guest_nice
//
// total sums user through steal only. guest and guest_nice are left out
// because Linux already accounts guest time inside user and nice; adding
// them again would double-count that time and distort the busy ratio on
// hosts running virtual machines. idle is the 4th column; iowait is
// deliberately counted as busy rather than idle (the original author's
// intent was to match gopsutil's default).
//
// ok is false when the label is not exactly "cpu", fewer than four
// counters are present, or a counted column is not an unsigned integer.
func parseCPUStatLine(line string) (total, idle uint64, ok bool) {
	const (
		idleCol       = 4 // parts index of the idle counter
		lastCountedCo = 8 // parts index of steal, the last column summed
	)

	parts := strings.Fields(line)
	if len(parts) < 5 || parts[0] != "cpu" {
		return 0, 0, false
	}

	last := len(parts) - 1
	if last > lastCountedCo {
		last = lastCountedCo
	}

	for i := 1; i <= last; i++ {
		v, err := strconv.ParseUint(parts[i], 10, 64)
		if err != nil {
			return 0, 0, false
		}
		total += v
		if i == idleCol {
			idle = v
		}
	}
	return total, idle, true
}
|
||||
|
||||
func readLoadAvg() LoadInfo {
|
||||
raw, err := os.ReadFile("/proc/loadavg")
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user