feat: implement HarborForge Monitor OpenClaw Plugin

Architecture: - openclaw.plugin.json: Plugin manifest with config schema - index.mjs: Plugin entry, lifecycle hooks (gateway:start/stop) - sidecar/server.mjs: Independent Node process for telemetry Features: - Collects system metrics (CPU, memory, disk, load, uptime) - Collects OpenClaw status (version, agents) - HTTP heartbeat to HarborForge Monitor - Config via ~/.openclaw/openclaw.json - Sidecar auto-starts/stops with Gateway Config options: - enabled, backendUrl, identifier - challengeUuid (required, from Monitor registration) - reportIntervalSec, httpFallbackIntervalSec - logLevel Provides tool: harborforge_monitor_status
2026-03-19 13:37:11 +00:00
parent c8681af0ac
commit a148c11e50
5 changed files with 698 additions and 0 deletions
--- a/sidecar/server.mjs
+++ b/sidecar/server.mjs
@@ -0,0 +1,302 @@
+/**
+ * HarborForge Monitor Sidecar Server
+ * 
+ * Runs as separate process from Gateway.
+ * Collects system metrics and OpenClaw status, sends to Monitor.
+ */
+import { createServer } from 'http';
+import { readFile, access } from 'fs/promises';
+import { constants } from 'fs';
+import { exec } from 'child_process';
+import { promisify } from 'util';
+import { platform, hostname, freemem, totalmem, uptime } from 'os';
+
+const execAsync = promisify(exec);
+
+// Config from environment (set by plugin)
+const CONFIG = {
+  backendUrl: process.env.HF_MONITOR_BACKEND_URL || 'https://monitor.hangman-lab.top',
+  identifier: process.env.HF_MONITOR_IDENTIFIER || hostname(),
+  challengeUuid: process.env.HF_MONITOR_CHALLENGE_UUID,
+  reportIntervalSec: parseInt(process.env.HF_MONITOR_REPORT_INTERVAL || '30', 10),
+  httpFallbackIntervalSec: parseInt(process.env.HF_MONITOR_HTTP_FALLBACK_INTERVAL || '60', 10),
+  logLevel: process.env.HF_MONITOR_LOG_LEVEL || 'info',
+  openclawPath: process.env.OPENCLAW_PATH || `${process.env.HOME}/.openclaw`,
+  openclawVersion: process.env.OPENCLAW_VERSION || 'unknown',
+};
+
+// Logging
+const log = {
+  debug: (...args) => CONFIG.logLevel === 'debug' && console.log('[DEBUG]', ...args),
+  info: (...args) => ['debug', 'info'].includes(CONFIG.logLevel) && console.log('[INFO]', ...args),
+  warn: (...args) => console.log('[WARN]', ...args),
+  error: (...args) => console.error('[ERROR]', ...args),
+};
+
+// State
+let wsConnection = null;
+let lastSuccessfulSend = null;
+let consecutiveFailures = 0;
+let isShuttingDown = false;
+
+/**
+ * Collect system metrics
+ */
+async function collectSystemMetrics() {
+  try {
+    // CPU usage (average over 1 second)
+    const cpuUsage = await getCpuUsage();
+    
+    // Memory
+    const memTotal = totalmem();
+    const memFree = freemem();
+    const memUsed = memTotal - memFree;
+    
+    // Disk usage
+    const diskInfo = await getDiskUsage();
+    
+    // Load average
+    const loadAvg = platform() !== 'win32' ? require('os').loadavg() : [0, 0, 0];
+    
+    return {
+      cpu_pct: cpuUsage,
+      mem_pct: Math.round((memUsed / memTotal) * 100 * 10) / 10,
+      mem_used_mb: Math.round(memUsed / 1024 / 1024),
+      mem_total_mb: Math.round(memTotal / 1024 / 1024),
+      disk_pct: diskInfo.usedPct,
+      disk_used_gb: Math.round(diskInfo.usedGB * 10) / 10,
+      disk_total_gb: Math.round(diskInfo.totalGB * 10) / 10,
+      swap_pct: diskInfo.swapUsedPct || 0,
+      uptime_sec: Math.floor(uptime()),
+      load_avg_1m: Math.round(loadAvg[0] * 100) / 100,
+      platform: platform(),
+      hostname: hostname(),
+    };
+  } catch (err) {
+    log.error('Failed to collect system metrics:', err.message);
+    return {};
+  }
+}
+
+/**
+ * Get CPU usage percentage
+ */
+async function getCpuUsage() {
+  try {
+    if (platform() === 'linux') {
+      const { stdout } = await execAsync("top -bn1 | grep 'Cpu(s)' | awk '{print $2}' | cut -d'%' -f1");
+      const usage = parseFloat(stdout.trim());
+      return isNaN(usage) ? 0 : Math.round(usage * 10) / 10;
+    } else if (platform() === 'darwin') {
+      const { stdout } = await execAsync("top -l 1 | grep 'CPU usage' | awk '{print $3}' | cut -d'%' -f1");
+      const usage = parseFloat(stdout.trim());
+      return isNaN(usage) ? 0 : Math.round(usage * 10) / 10;
+    }
+  } catch {
+    // Fallback: calculate from /proc/stat on Linux
+    try {
+      const stat = await readFile('/proc/stat', 'utf8');
+      const cpuLine = stat.split('\n')[0];
+      const parts = cpuLine.split(/\s+/).slice(1).map(Number);
+      const idle = parts[3];
+      const total = parts.reduce((a, b) => a + b, 0);
+      const usage = ((total - idle) / total) * 100;
+      return Math.round(usage * 10) / 10;
+    } catch {
+      return 0;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Get disk usage
+ */
+async function getDiskUsage() {
+  try {
+    if (platform() === 'linux' || platform() === 'darwin') {
+      const { stdout } = await execAsync("df -h / | tail -1 | awk '{print $2,$3,$5}'");
+      const [total, used, pct] = stdout.trim().split(/\s+/);
+      return {
+        totalGB: parseSizeToGB(total),
+        usedGB: parseSizeToGB(used),
+        usedPct: parseInt(pct.replace('%', ''), 10),
+      };
+    }
+  } catch (err) {
+    log.debug('Failed to get disk usage:', err.message);
+  }
+  return { totalGB: 0, usedGB: 0, usedPct: 0 };
+}
+
+/**
+ * Parse size string (like '50G' or '100M') to GB
+ */
+function parseSizeToGB(size) {
+  const num = parseFloat(size);
+  if (size.includes('T')) return num * 1024;
+  if (size.includes('G')) return num;
+  if (size.includes('M')) return num / 1024;
+  if (size.includes('K')) return num / 1024 / 1024;
+  return num;
+}
+
+/**
+ * Collect OpenClaw status
+ */
+async function collectOpenclawStatus() {
+  try {
+    const agents = await getOpenclawAgents();
+    
+    return {
+      version: CONFIG.openclawVersion,
+      agent_count: agents.length,
+      agents: agents.map(a => ({
+        id: a.id,
+        name: a.name,
+        status: a.status,
+      })),
+    };
+  } catch (err) {
+    log.debug('Failed to collect OpenClaw status:', err.message);
+    return { version: CONFIG.openclawVersion, agent_count: 0, agents: [] };
+  }
+}
+
+/**
+ * Get list of OpenClaw agents from local state
+ */
+async function getOpenclawAgents() {
+  try {
+    // Try to read agent config/state from OpenClaw directory
+    const agentConfigPath = `${CONFIG.openclawPath}/agents.json`;
+    try {
+      await access(agentConfigPath, constants.R_OK);
+      const data = JSON.parse(await readFile(agentConfigPath, 'utf8'));
+      return data.agents || [];
+    } catch {
+      // Fallback: return empty list
+      return [];
+    }
+  } catch {
+    return [];
+  }
+}
+
+/**
+ * Build telemetry payload
+ */
+async function buildPayload() {
+  const system = await collectSystemMetrics();
+  const openclaw = await collectOpenclawStatus();
+  
+  return {
+    identifier: CONFIG.identifier,
+    challenge_uuid: CONFIG.challengeUuid,
+    timestamp: new Date().toISOString(),
+    ...system,
+    openclaw_version: openclaw.version,
+    openclaw_agents: openclaw.agents,
+    openclaw_agent_count: openclaw.agent_count,
+  };
+}
+
+/**
+ * Send telemetry via HTTP
+ */
+async function sendHttpHeartbeat() {
+  try {
+    const payload = await buildPayload();
+    
+    log.debug('Sending HTTP heartbeat...');
+    
+    const response = await fetch(`${CONFIG.backendUrl}/monitor/server/heartbeat`, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        'X-Server-Identifier': CONFIG.identifier,
+        'X-Challenge-UUID': CONFIG.challengeUuid,
+      },
+      body: JSON.stringify(payload),
+    });
+    
+    if (response.ok) {
+      log.debug('HTTP heartbeat sent successfully');
+      lastSuccessfulSend = Date.now();
+      consecutiveFailures = 0;
+      return true;
+    } else {
+      throw new Error(`HTTP ${response.status}: ${await response.text()}`);
+    }
+  } catch (err) {
+    log.error('HTTP heartbeat failed:', err.message);
+    consecutiveFailures++;
+    return false;
+  }
+}
+
+/**
+ * Main reporting loop
+ */
+async function reportingLoop() {
+  while (!isShuttingDown) {
+    try {
+      // Try HTTP (WebSocket can be added later)
+      const success = await sendHttpHeartbeat();
+      
+      // Calculate next interval with backoff on failure
+      let interval = CONFIG.reportIntervalSec * 1000;
+      if (!success) {
+        // Exponential backoff: max 5 minutes
+        const backoff = Math.min(consecutiveFailures * 10000, 300000);
+        interval = Math.max(interval, backoff);
+        log.info(`Retry in ${interval}ms (backoff)`);
+      }
+      
+      // Sleep until next report
+      await new Promise(resolve => setTimeout(resolve, interval));
+      
+    } catch (err) {
+      log.error('Reporting loop error:', err.message);
+      await new Promise(resolve => setTimeout(resolve, 30000));
+    }
+  }
+}
+
+/**
+ * Graceful shutdown
+ */
+function shutdown() {
+  log.info('Shutting down sidecar...');
+  isShuttingDown = true;
+  
+  if (wsConnection) {
+    wsConnection.close();
+  }
+  
+  // Send final heartbeat
+  sendHttpHeartbeat().finally(() => {
+    process.exit(0);
+  });
+}
+
+// Handle signals
+process.on('SIGTERM', shutdown);
+process.on('SIGINT', shutdown);
+
+// Start
+log.info('HarborForge Monitor Sidecar starting...');
+log.info('Config:', {
+  identifier: CONFIG.identifier,
+  backendUrl: CONFIG.backendUrl,
+  reportIntervalSec: CONFIG.reportIntervalSec,
+});
+
+// Validate config
+if (!CONFIG.challengeUuid) {
+  log.error('Missing HF_MONITOR_CHALLENGE_UUID environment variable');
+  process.exit(1);
+}
+
+// Start reporting loop
+reportingLoop();