- Update openclaw.plugin.json: replace challengeUuid with apiKey (optional)
- Fix tsconfig: use CommonJS module to avoid import.meta.url issues
- Fix plugin/index.ts: remove ESM-specific code, use __dirname
- Fix telemetry.mjs:
  - Add loadavg to os imports, remove require() call
  - Replace challengeUuid with apiKey in config
  - Update endpoint to heartbeat-v2
  - Add X-API-Key header when apiKey is configured
  - Fix payload field names: agents, load_avg (array), uptime_seconds
  - Change missing apiKey from error to warning
289 lines
8.0 KiB
JavaScript
289 lines
8.0 KiB
JavaScript
/**
 * HarborForge Monitor Telemetry Server
 *
 * Runs as a separate process from the Gateway.
 * Collects system metrics and OpenClaw status, and sends them to the Monitor.
 */
|
|
import { readFile, access } from 'fs/promises';
|
|
import { constants } from 'fs';
|
|
import { exec } from 'child_process';
|
|
import { promisify } from 'util';
|
|
import { platform, hostname, freemem, totalmem, uptime, loadavg } from 'os';
|
|
|
|
// Promise-returning wrapper around child_process.exec, used for shell probes.
const execAsync = promisify(exec);

// Largest whole number of seconds that still fits setTimeout()'s 32-bit
// millisecond delay once multiplied by 1000.
const MAX_TIMER_SEC = Math.floor(2147483647 / 1000);

/**
 * Read an integer setting from the environment.
 *
 * Falls back to `fallback` when the variable is unset, empty, not a number,
 * or outside [min, max]. Without this guard a value such as
 * HF_MONITOR_REPORT_INTERVAL=abc (NaN) or =0 would reach setTimeout(), which
 * coerces such delays to ~1 ms and would turn the reporting loop into a busy
 * loop against the backend.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is missing or invalid.
 * @param {{min?: number, max?: number}} [range] - Accepted range (inclusive).
 * @returns {number} The parsed integer, or `fallback`.
 */
function readIntEnv(name, fallback, { min = 1, max = Number.MAX_SAFE_INTEGER } = {}) {
  const parsed = Number.parseInt(process.env[name], 10);
  const isValid = Number.isInteger(parsed) && parsed >= min && parsed <= max;
  return isValid ? parsed : fallback;
}

// Config from environment (set by plugin)
const openclawPath = process.env.OPENCLAW_PATH || `${process.env.HOME}/.openclaw`;
const CONFIG = {
  backendUrl: process.env.HF_MONITOR_BACKEND_URL || 'https://monitor.hangman-lab.top',
  identifier: process.env.HF_MONITOR_IDENTIFIER || hostname(),
  // Optional; when absent no X-API-Key header is sent.
  apiKey: process.env.HF_MONITOR_API_KEY,
  reportIntervalSec: readIntEnv('HF_MONITOR_REPORT_INTERVAL', 30, { max: MAX_TIMER_SEC }),
  httpFallbackIntervalSec: readIntEnv('HF_MONITOR_HTTP_FALLBACK_INTERVAL', 60, { max: MAX_TIMER_SEC }),
  logLevel: process.env.HF_MONITOR_LOG_LEVEL || 'info',
  openclawPath,
  openclawVersion: process.env.OPENCLAW_VERSION || 'unknown',
  cachePath: process.env.HF_MONITOR_CACHE_PATH || `${openclawPath}/telemetry_cache.json`,
  // 0 is accepted here; a negative or non-numeric value falls back to 100.
  maxCacheSize: readIntEnv('HF_MONITOR_MAX_CACHE_SIZE', 100, { min: 0 }),
};

// Logging
// debug: only at log level 'debug'; info: at 'debug' or 'info';
// warn: always, written to stdout; error: always, written to stderr.
const log = {
  debug: (...args) => CONFIG.logLevel === 'debug' && console.log('[DEBUG]', ...args),
  info: (...args) => ['debug', 'info'].includes(CONFIG.logLevel) && console.log('[INFO]', ...args),
  warn: (...args) => console.log('[WARN]', ...args),
  error: (...args) => console.error('[ERROR]', ...args),
};

// State
// Closed on shutdown when set; nothing in the visible code assigns it.
let wsConnection = null;
// Date.now() of the last heartbeat the backend accepted, or null if none yet.
let lastSuccessfulSend = null;
// Failed heartbeats since the last success; drives the retry backoff.
let consecutiveFailures = 0;
// Set by shutdown() to stop the reporting loop.
let isShuttingDown = false;
|
|
|
|
/**
 * Gather a snapshot of host metrics: CPU, memory, root-disk usage, uptime,
 * load averages, platform and hostname.
 *
 * @returns {Promise<object>} Flat metrics object, or an empty object when any
 *   step throws (the failure is logged at error level).
 */
async function collectSystemMetrics() {
  // Round `value` to the precision implied by `factor` (10 -> 1 decimal).
  const roundTo = (value, factor) => Math.round(value * factor) / factor;
  const bytesToMb = (bytes) => Math.round(bytes / 1024 / 1024);

  try {
    const cpuPct = await getCpuUsage();
    const totalBytes = totalmem();
    const freeBytes = freemem();
    const usedBytes = totalBytes - freeBytes;
    const disk = await getDiskUsage();

    // Load averages are reported as zeros on Windows.
    let loads = [0, 0, 0];
    if (platform() !== 'win32') {
      loads = loadavg();
    }

    return {
      cpu_pct: cpuPct,
      mem_pct: roundTo((usedBytes / totalBytes) * 100, 10),
      mem_used_mb: bytesToMb(usedBytes),
      mem_total_mb: bytesToMb(totalBytes),
      disk_pct: disk.usedPct,
      disk_used_gb: roundTo(disk.usedGB, 10),
      disk_total_gb: roundTo(disk.totalGB, 10),
      swap_pct: disk.swapUsedPct || 0,
      uptime_seconds: Math.floor(uptime()),
      load_avg: [0, 1, 2].map((slot) => roundTo(loads[slot], 100)),
      platform: platform(),
      hostname: hostname(),
    };
  } catch (err) {
    log.error('Failed to collect system metrics:', err.message);
    return {};
  }
}
|
|
|
|
/**
 * Get CPU usage percentage.
 *
 * On Linux and macOS the value is parsed from a single `top` sample. If that
 * command fails, or its output is not a number, the first line of /proc/stat
 * is used instead. Those counters are cumulative, so the fallback is an
 * average over the counters' lifetime rather than an instantaneous reading.
 * On other platforms no probe is attempted.
 *
 * @returns {Promise<number>} Usage percent rounded to one decimal place, or 0
 *   when it cannot be determined.
 */
async function getCpuUsage() {
  const currentPlatform = platform();

  let command = null;
  if (currentPlatform === 'linux') {
    command = "top -bn1 | grep 'Cpu(s)' | awk '{print $2}' | cut -d'%' -f1";
  } else if (currentPlatform === 'darwin') {
    command = "top -l 1 | grep 'CPU usage' | awk '{print $3}' | cut -d'%' -f1";
  }

  // No sampling command for this platform (e.g. win32).
  if (command === null) {
    return 0;
  }

  try {
    const { stdout } = await execAsync(command);
    const usage = Number.parseFloat(stdout.trim());
    if (Number.isFinite(usage)) {
      return Math.round(usage * 10) / 10;
    }
    // The pipeline ends in `cut`, which succeeds even on empty input, so an
    // unusable result normally shows up here rather than as a rejection.
    log.debug('Unparseable CPU usage from top, falling back to /proc/stat');
  } catch (err) {
    log.debug('top failed, falling back to /proc/stat:', err.message);
  }

  try {
    const stat = await readFile('/proc/stat', 'utf8');
    const [cpuLine] = stat.split('\n');
    // Drop the leading "cpu" label; the 4th counter is idle time.
    const ticks = cpuLine.trim().split(/\s+/).slice(1).map(Number);
    const idle = ticks[3];
    const total = ticks.reduce((sum, tick) => sum + tick, 0);
    const usage = ((total - idle) / total) * 100;
    // total === 0 or a malformed line would otherwise leak NaN into the payload.
    return Number.isFinite(usage) ? Math.round(usage * 10) / 10 : 0;
  } catch {
    return 0;
  }
}
|
|
|
|
/**
 * Get usage of the root filesystem.
 *
 * Runs `df -h /` on Linux and macOS and parses the size, used and use%
 * columns of its last output line.
 *
 * @returns {Promise<{totalGB: number, usedGB: number, usedPct: number}>}
 *   Parsed usage, or all zeros on other platforms or when the probe fails.
 */
async function getDiskUsage() {
  const isSupported = ['linux', 'darwin'].includes(platform());

  if (isSupported) {
    try {
      const { stdout } = await execAsync("df -h / | tail -1 | awk '{print $2,$3,$5}'");
      const columns = stdout.trim().split(/\s+/);
      const totalGB = parseSizeToGB(columns[0]);
      const usedGB = parseSizeToGB(columns[1]);
      const usedPct = parseInt(columns[2].replace('%', ''), 10);
      return { totalGB, usedGB, usedPct };
    } catch (err) {
      log.debug('Failed to get disk usage:', err.message);
    }
  }

  return { totalGB: 0, usedGB: 0, usedPct: 0 };
}
|
|
|
|
/**
 * Convert a human-readable size as printed by `df -h` (e.g. "50G", "1.5T",
 * "460Gi", "512M", "0B") into gigabytes.
 *
 * The unit is the first letter after the number, matched case-insensitively:
 * B, K, M, G, T or P, using powers of 1024. A value without any unit is
 * treated as bytes, which is how `df -h` prints sizes below 1K. A value with
 * an unrecognised unit letter is returned unchanged.
 *
 * @param {string} size - Size text such as "50G".
 * @returns {number} Size in GB; 0 when `size` is missing or not numeric.
 */
function parseSizeToGB(size) {
  if (size == null) {
    return 0;
  }

  const text = String(size).trim();
  const value = Number.parseFloat(text);
  if (!Number.isFinite(value)) {
    return 0;
  }

  const gbPerUnit = new Map([
    ['B', 1 / 1024 ** 3],
    ['K', 1 / 1024 ** 2],
    ['M', 1 / 1024],
    ['G', 1],
    ['T', 1024],
    ['P', 1024 ** 2],
  ]);

  const unitMatch = /[A-Za-z]/.exec(text);
  const unit = unitMatch ? unitMatch[0].toUpperCase() : 'B';
  const factor = gbPerUnit.get(unit);

  return factor === undefined ? value : value * factor;
}
|
|
|
|
/**
 * Summarise the local OpenClaw installation: configured version plus the
 * id, name and status of every known agent.
 *
 * @returns {Promise<{version: string, agent_count: number, agents: Array<object>}>}
 *   Status summary; an empty agent list when collection fails.
 */
async function collectOpenclawStatus() {
  const version = CONFIG.openclawVersion;

  try {
    const agentList = await getOpenclawAgents();
    const agentCount = agentList.length;
    // Forward only the three fields the payload needs.
    const summaries = agentList.map((agent) => {
      return { id: agent.id, name: agent.name, status: agent.status };
    });

    return { version, agent_count: agentCount, agents: summaries };
  } catch (err) {
    log.debug('Failed to collect OpenClaw status:', err.message);
    return { version, agent_count: 0, agents: [] };
  }
}
|
|
|
|
/**
 * Get the list of OpenClaw agents from `<openclawPath>/agents.json`.
 *
 * The file is read directly instead of being probed with access() first:
 * readFile reports a missing or unreadable file itself, and the separate
 * check only added a window in which the file could change between calls.
 *
 * @returns {Promise<Array<object>>} The `agents` array from the file, or an
 *   empty array when the file is missing, unreadable, not valid JSON, or has
 *   no array under `agents`.
 */
async function getOpenclawAgents() {
  const agentConfigPath = `${CONFIG.openclawPath}/agents.json`;

  try {
    const data = JSON.parse(await readFile(agentConfigPath, 'utf8'));
    const agents = data && data.agents;
    // Callers use .length and .map(), so never hand back a non-array value.
    return Array.isArray(agents) ? agents : [];
  } catch (err) {
    // A missing file is the normal "no agents" case; anything else (bad JSON,
    // permissions) is worth a trace when debugging.
    if (err.code !== 'ENOENT') {
      log.debug(`Failed to read ${agentConfigPath}:`, err.message);
    }
    return [];
  }
}
|
|
|
|
/**
 * Assemble the heartbeat body: server identifier, ISO timestamp, the flat
 * system metrics, and the OpenClaw version and agent list.
 *
 * @returns {Promise<object>} Payload ready to be serialised as JSON.
 */
async function buildPayload() {
  const systemMetrics = await collectSystemMetrics();
  const { version, agents } = await collectOpenclawStatus();

  const payload = {
    identifier: CONFIG.identifier,
    timestamp: new Date().toISOString(),
  };
  // Metrics are merged at the top level of the payload, not nested.
  Object.assign(payload, systemMetrics);
  payload.openclaw_version = version;
  payload.agents = agents;

  return payload;
}
|
|
|
|
/**
 * Send one telemetry heartbeat via HTTP POST to
 * `<backendUrl>/monitor/server/heartbeat-v2`.
 *
 * Updates `lastSuccessfulSend` and resets `consecutiveFailures` on a 2xx
 * response; increments `consecutiveFailures` on any failure. Never throws.
 *
 * @returns {Promise<boolean>} true when the backend answered with a 2xx
 *   status, false otherwise (the failure is logged at error level).
 */
async function sendHttpHeartbeat() {
  // Upper bound for one request. Without it a stalled connection would block
  // the reporting loop, and the final heartbeat on shutdown, indefinitely.
  const requestTimeoutMs = 10000;

  try {
    const payload = await buildPayload();
    log.debug('Sending HTTP heartbeat...');

    const headers = {
      'Content-Type': 'application/json',
      'X-Server-Identifier': CONFIG.identifier,
    };

    if (CONFIG.apiKey) {
      headers['X-API-Key'] = CONFIG.apiKey;
    }

    // Tolerate a configured base URL that ends in "/" so the path does not
    // start with "//".
    const baseUrl = CONFIG.backendUrl.replace(/\/+$/, '');

    const response = await fetch(`${baseUrl}/monitor/server/heartbeat-v2`, {
      method: 'POST',
      headers,
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(requestTimeoutMs),
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${await response.text()}`);
    }

    log.debug('HTTP heartbeat sent successfully');
    lastSuccessfulSend = Date.now();
    consecutiveFailures = 0;
    return true;
  } catch (err) {
    log.error('HTTP heartbeat failed:', err.message);
    consecutiveFailures++;
    return false;
  }
}
|
|
|
|
/**
 * Main reporting loop: send a heartbeat, wait, repeat until shutdown.
 *
 * After a success the wait is the configured report interval. After a failure
 * it is the larger of that interval and a linear backoff of 10 s per
 * consecutive failure, capped at 5 minutes.
 *
 * @returns {Promise<void>} Resolves once `isShuttingDown` is set.
 */
async function reportingLoop() {
  const sleep = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
  });

  while (!isShuttingDown) {
    try {
      const delivered = await sendHttpHeartbeat();
      const regularDelay = CONFIG.reportIntervalSec * 1000;

      let delay = regularDelay;
      if (!delivered) {
        const backoffDelay = Math.min(consecutiveFailures * 10000, 300000);
        delay = Math.max(regularDelay, backoffDelay);
        log.info(`Retry in ${delay}ms (backoff)`);
      }

      await sleep(delay);
    } catch (err) {
      // Unexpected error inside the loop body: log it and retry after 30 s.
      log.error('Reporting loop error:', err.message);
      await sleep(30000);
    }
  }
}
|
|
|
|
/**
 * Graceful shutdown: stop the reporting loop, close the WebSocket connection
 * if one exists, send one final heartbeat and exit with code 0.
 *
 * Registered for both SIGTERM and SIGINT, so it may be invoked more than
 * once; only the first call has any effect.
 */
function shutdown() {
  // A repeated signal must not start a second final heartbeat.
  if (isShuttingDown) {
    return;
  }

  log.info('Shutting down telemetry server...');
  isShuttingDown = true;

  if (wsConnection) {
    wsConnection.close();
  }

  // Safety net: exit even if the final heartbeat never settles, so the
  // process cannot hang on an unresponsive backend.
  const forcedExitDelayMs = 5000;
  const forcedExitTimer = setTimeout(() => process.exit(0), forcedExitDelayMs);

  sendHttpHeartbeat().finally(() => {
    clearTimeout(forcedExitTimer);
    process.exit(0);
  });
}
|
|
|
|
// Handle signals: both termination signals trigger the same graceful shutdown.
for (const signalName of ['SIGTERM', 'SIGINT']) {
  process.on(signalName, shutdown);
}

// Start
log.info('HarborForge Monitor Telemetry Server starting...');

// Only report whether an API key is present; never log the key itself.
const startupSummary = {
  identifier: CONFIG.identifier,
  backendUrl: CONFIG.backendUrl,
  reportIntervalSec: CONFIG.reportIntervalSec,
  hasApiKey: Boolean(CONFIG.apiKey),
};
log.info('Config:', startupSummary);

if (startupSummary.hasApiKey === false) {
  log.warn('Missing HF_MONITOR_API_KEY environment variable - API authentication will fail');
}

// Not awaited on purpose: the loop handles its own errors and runs until shutdown.
void reportingLoop();
|