From c70f90cb52fb56762afc770eb235189259462163 Mon Sep 17 00:00:00 2001 From: zhi Date: Thu, 19 Mar 2026 18:17:50 +0000 Subject: [PATCH 1/5] feat(monitor): add API Key authentication for server heartbeat - Add api_key field to MonitoredServer model with unique index - Add migration to create api_key column - Add POST /admin/servers/{id}/api-key for key generation - Add DELETE /admin/servers/{id}/api-key for key revocation - Add POST /server/heartbeat-v2 with X-API-Key header auth - TelemetryPayload includes load_avg and uptime_seconds --- app/api/routers/monitor.py | 67 +++++++++++++++++++++++++++++++++++++- app/main.py | 5 +++ app/models/monitor.py | 1 + 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/app/api/routers/monitor.py b/app/api/routers/monitor.py index b9d7be1..a8d2ac3 100644 --- a/app/api/routers/monitor.py +++ b/app/api/routers/monitor.py @@ -1,9 +1,10 @@ from datetime import datetime, timedelta, timezone import json +import secrets import uuid from typing import List, Dict -from fastapi import APIRouter, Depends, HTTPException, status, WebSocket, WebSocketDisconnect +from fastapi import APIRouter, Depends, Header, HTTPException, status, WebSocket, WebSocketDisconnect from pydantic import BaseModel from sqlalchemy.orm import Session @@ -171,6 +172,29 @@ def delete_server(server_id: int, db: Session = Depends(get_db), _: models.User return None +@router.post('/admin/servers/{server_id}/api-key') +def generate_api_key(server_id: int, db: Session = Depends(get_db), _: models.User = Depends(require_admin)): + """Generate or regenerate API Key for a server (heartbeat v2)""" + server = db.query(MonitoredServer).filter(MonitoredServer.id == server_id).first() + if not server: + raise HTTPException(status_code=404, detail='Server not found') + api_key = secrets.token_urlsafe(32) + server.api_key = api_key + db.commit() + return {'server_id': server.id, 'api_key': api_key, 'message': 'Store this key securely - it will not be shown again'} + + 
+@router.delete('/admin/servers/{server_id}/api-key', status_code=status.HTTP_204_NO_CONTENT) +def revoke_api_key(server_id: int, db: Session = Depends(get_db), _: models.User = Depends(require_admin)): + """Revoke API Key for a server""" + server = db.query(MonitoredServer).filter(MonitoredServer.id == server_id).first() + if not server: + raise HTTPException(status_code=404, detail='Server not found') + server.api_key = None + db.commit() + return None + + class ServerHeartbeat(BaseModel): identifier: str openclaw_version: str | None = None @@ -201,6 +225,47 @@ def server_heartbeat(payload: ServerHeartbeat, db: Session = Depends(get_db)): return {'ok': True, 'server_id': server.id, 'last_seen_at': st.last_seen_at} +# Heartbeat v2 with API Key authentication +class TelemetryPayload(BaseModel): + identifier: str + openclaw_version: str | None = None + agents: List[dict] = [] + cpu_pct: float | None = None + mem_pct: float | None = None + disk_pct: float | None = None + swap_pct: float | None = None + load_avg: list[float] | None = None + uptime_seconds: int | None = None + + +@router.post('/server/heartbeat-v2') +def server_heartbeat_v2( + payload: TelemetryPayload, + x_api_key: str = Header(..., alias='X-API-Key', description='API Key from /admin/servers/{id}/api-key'), + db: Session = Depends(get_db) +): + """Server heartbeat using API Key authentication (no challenge_uuid required)""" + server = db.query(MonitoredServer).filter( + MonitoredServer.api_key == x_api_key, + MonitoredServer.is_enabled == True + ).first() + if not server: + raise HTTPException(status_code=401, detail='Invalid or missing API Key') + st = db.query(ServerState).filter(ServerState.server_id == server.id).first() + if not st: + st = ServerState(server_id=server.id) + db.add(st) + st.openclaw_version = payload.openclaw_version + st.agents_json = json.dumps(payload.agents, ensure_ascii=False) + st.cpu_pct = payload.cpu_pct + st.mem_pct = payload.mem_pct + st.disk_pct = payload.disk_pct + 
st.swap_pct = payload.swap_pct + st.last_seen_at = datetime.now(timezone.utc) + db.commit() + return {'ok': True, 'server_id': server.id, 'identifier': server.identifier, 'last_seen_at': st.last_seen_at} + + @router.websocket('/server/ws') async def server_ws(websocket: WebSocket): await websocket.accept() diff --git a/app/main.py b/app/main.py index 594531f..eaf1e9b 100644 --- a/app/main.py +++ b/app/main.py @@ -215,6 +215,11 @@ def _migrate_schema(): "DEFAULT 'open'" )) + # --- monitored_servers.api_key for heartbeat v2 --- + if _has_table(db, "monitored_servers") and not _has_column(db, "monitored_servers", "api_key"): + db.execute(text("ALTER TABLE monitored_servers ADD COLUMN api_key VARCHAR(64) NULL")) + db.execute(text("CREATE UNIQUE INDEX idx_monitored_servers_api_key ON monitored_servers (api_key)")) + db.commit() except Exception as e: db.rollback() diff --git a/app/models/monitor.py b/app/models/monitor.py index 533dbcb..21d0fc1 100644 --- a/app/models/monitor.py +++ b/app/models/monitor.py @@ -39,6 +39,7 @@ class MonitoredServer(Base): identifier = Column(String(128), nullable=False, unique=True) display_name = Column(String(128), nullable=True) is_enabled = Column(Boolean, default=True) + api_key = Column(String(64), nullable=True, unique=True, index=True) # API Key for server heartbeat v2 created_by = Column(Integer, nullable=True) created_at = Column(DateTime(timezone=True), server_default=func.now()) From a0d0c7b3a149e796b5a2bdedf68f9ea88d2fd57b Mon Sep 17 00:00:00 2001 From: zhi Date: Thu, 19 Mar 2026 20:57:50 +0000 Subject: [PATCH 2/5] fix(monitoring): handle timezone-naive datetimes in get_server_states_view Fixes datetime comparison error when last_seen_at from database is offset-naive (no timezone info) while 'now' is offset-aware (UTC). This resolves the TypeError: can't subtract offset-naive and offset-aware datetimes issue in integration tests. 
--- app/services/monitoring.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/services/monitoring.py b/app/services/monitoring.py index 9de347b..81b1e1c 100644 --- a/app/services/monitoring.py +++ b/app/services/monitoring.py @@ -288,6 +288,9 @@ def get_server_states_view(db: Session, offline_after_minutes: int = 7): for s in servers: st = db.query(ServerState).filter(ServerState.server_id == s.id).first() last_seen = st.last_seen_at if st else None + # Handle timezone-naive datetimes from database + if last_seen and last_seen.tzinfo is None: + last_seen = last_seen.replace(tzinfo=timezone.utc) online = bool(last_seen and (now - last_seen).total_seconds() <= offline_after_minutes * 60) out.append({ 'server_id': s.id, From 97f12cac7a45a22c1825840468686ee4f613fefe Mon Sep 17 00:00:00 2001 From: zhi Date: Fri, 20 Mar 2026 07:23:18 +0000 Subject: [PATCH 3/5] feat(monitor): store plugin version separately from openclaw version - Add server_states.plugin_version column - Keep openclaw_version for remote OpenClaw runtime version - Expose plugin_version in monitor server view - Accept and persist plugin_version in heartbeat payloads --- app/api/routers/monitor.py | 9 +++++++++ app/main.py | 4 ++++ app/models/monitor.py | 1 + app/services/monitoring.py | 1 + 4 files changed, 15 insertions(+) diff --git a/app/api/routers/monitor.py b/app/api/routers/monitor.py index a8d2ac3..d396a2b 100644 --- a/app/api/routers/monitor.py +++ b/app/api/routers/monitor.py @@ -198,6 +198,7 @@ def revoke_api_key(server_id: int, db: Session = Depends(get_db), _: models.User class ServerHeartbeat(BaseModel): identifier: str openclaw_version: str | None = None + plugin_version: str | None = None agents: List[dict] = [] cpu_pct: float | None = None mem_pct: float | None = None @@ -215,6 +216,7 @@ def server_heartbeat(payload: ServerHeartbeat, db: Session = Depends(get_db)): st = ServerState(server_id=server.id) db.add(st) st.openclaw_version = payload.openclaw_version + st.plugin_version 
= payload.plugin_version st.agents_json = json.dumps(payload.agents, ensure_ascii=False) st.cpu_pct = payload.cpu_pct st.mem_pct = payload.mem_pct @@ -229,6 +231,7 @@ def server_heartbeat(payload: ServerHeartbeat, db: Session = Depends(get_db)): class TelemetryPayload(BaseModel): identifier: str openclaw_version: str | None = None + plugin_version: str | None = None agents: List[dict] = [] cpu_pct: float | None = None mem_pct: float | None = None @@ -256,6 +259,7 @@ def server_heartbeat_v2( st = ServerState(server_id=server.id) db.add(st) st.openclaw_version = payload.openclaw_version + st.plugin_version = payload.plugin_version st.agents_json = json.dumps(payload.agents, ensure_ascii=False) st.cpu_pct = payload.cpu_pct st.mem_pct = payload.mem_pct @@ -328,12 +332,17 @@ async def server_ws(websocket: WebSocket): if event == 'server.hello': st.openclaw_version = payload.get('openclaw_version') + st.plugin_version = payload.get('plugin_version') st.agents_json = json.dumps(payload.get('agents') or [], ensure_ascii=False) elif event in {'server.metrics', 'agent.status_changed'}: st.cpu_pct = payload.get('cpu_pct', st.cpu_pct) st.mem_pct = payload.get('mem_pct', st.mem_pct) st.disk_pct = payload.get('disk_pct', st.disk_pct) st.swap_pct = payload.get('swap_pct', st.swap_pct) + if 'openclaw_version' in payload: + st.openclaw_version = payload.get('openclaw_version') + if 'plugin_version' in payload: + st.plugin_version = payload.get('plugin_version') if 'agents' in payload: st.agents_json = json.dumps(payload.get('agents') or [], ensure_ascii=False) diff --git a/app/main.py b/app/main.py index eaf1e9b..7d85ef7 100644 --- a/app/main.py +++ b/app/main.py @@ -220,6 +220,10 @@ def _migrate_schema(): db.execute(text("ALTER TABLE monitored_servers ADD COLUMN api_key VARCHAR(64) NULL")) db.execute(text("CREATE UNIQUE INDEX idx_monitored_servers_api_key ON monitored_servers (api_key)")) + # --- server_states.plugin_version for monitor plugin telemetry --- + if _has_table(db, 
"server_states") and not _has_column(db, "server_states", "plugin_version"): + db.execute(text("ALTER TABLE server_states ADD COLUMN plugin_version VARCHAR(64) NULL")) + db.commit() except Exception as e: db.rollback() diff --git a/app/models/monitor.py b/app/models/monitor.py index 21d0fc1..13cad5a 100644 --- a/app/models/monitor.py +++ b/app/models/monitor.py @@ -50,6 +50,7 @@ class ServerState(Base): id = Column(Integer, primary_key=True, index=True) server_id = Column(Integer, ForeignKey('monitored_servers.id'), nullable=False, unique=True) openclaw_version = Column(String(64), nullable=True) + plugin_version = Column(String(64), nullable=True) agents_json = Column(Text, nullable=True) # json list cpu_pct = Column(Float, nullable=True) mem_pct = Column(Float, nullable=True) diff --git a/app/services/monitoring.py b/app/services/monitoring.py index 81b1e1c..0f10811 100644 --- a/app/services/monitoring.py +++ b/app/services/monitoring.py @@ -298,6 +298,7 @@ def get_server_states_view(db: Session, offline_after_minutes: int = 7): 'display_name': s.display_name or s.identifier, 'online': online, 'openclaw_version': st.openclaw_version if st else None, + 'plugin_version': st.plugin_version if st else None, 'cpu_pct': st.cpu_pct if st else None, 'mem_pct': st.mem_pct if st else None, 'disk_pct': st.disk_pct if st else None, From 8e0f158266d7b51e38b2f4753b8d7d84ff74bcda Mon Sep 17 00:00:00 2001 From: zhi Date: Fri, 20 Mar 2026 07:42:43 +0000 Subject: [PATCH 4/5] refactor(monitor): remove deprecated challenge flow - Remove challenge issuance endpoint - Remove monitor websocket challenge handshake flow - Remove challenge/nonce runtime models - Keep API key as the only server auth path --- app/api/routers/monitor.py | 130 ++----------------------------------- app/models/monitor.py | 19 ------ 2 files changed, 4 insertions(+), 145 deletions(-) diff --git a/app/api/routers/monitor.py b/app/api/routers/monitor.py index d396a2b..c17b76f 100644 --- 
a/app/api/routers/monitor.py +++ b/app/api/routers/monitor.py @@ -1,22 +1,19 @@ -from datetime import datetime, timedelta, timezone +from datetime import datetime, timezone import json import secrets -import uuid -from typing import List, Dict +from typing import List -from fastapi import APIRouter, Depends, Header, HTTPException, status, WebSocket, WebSocketDisconnect +from fastapi import APIRouter, Depends, Header, HTTPException, status from pydantic import BaseModel from sqlalchemy.orm import Session -from app.core.config import get_db, SessionLocal +from app.core.config import get_db from app.api.deps import get_current_user_or_apikey from app.models import models from app.models.monitor import ( ProviderAccount, MonitoredServer, ServerState, - ServerChallenge, - ServerHandshakeNonce, ) from app.services.monitoring import ( get_task_stats_cached, @@ -24,11 +21,8 @@ from app.services.monitoring import ( get_server_states_view, test_provider_connection, ) -from app.services.crypto_box import get_public_key_info, decrypt_payload_b64, ts_within - router = APIRouter(prefix='/monitor', tags=['Monitor']) SUPPORTED_PROVIDERS = {'anthropic', 'openai', 'minimax', 'kimi', 'qwen'} -ACTIVE_WS: Dict[int, WebSocket] = {} class ProviderAccountCreate(BaseModel): @@ -47,23 +41,12 @@ class MonitoredServerCreate(BaseModel): display_name: str | None = None -class ChallengeResponse(BaseModel): - identifier: str - challenge_uuid: str - expires_at: str - - def require_admin(current_user: models.User = Depends(get_current_user_or_apikey)): if not current_user.is_admin: raise HTTPException(status_code=403, detail='Admin required') return current_user -@router.get('/public/server-public-key') -def monitor_public_key(): - return get_public_key_info() - - @router.get('/public/overview') def public_overview(db: Session = Depends(get_db)): return { @@ -144,19 +127,6 @@ def add_server(payload: MonitoredServerCreate, db: Session = Depends(get_db), us return {'id': obj.id, 'identifier': 
obj.identifier, 'display_name': obj.display_name, 'is_enabled': obj.is_enabled} -@router.post('/admin/servers/{server_id}/challenge', response_model=ChallengeResponse) -def issue_server_challenge(server_id: int, db: Session = Depends(get_db), _: models.User = Depends(require_admin)): - server = db.query(MonitoredServer).filter(MonitoredServer.id == server_id).first() - if not server: - raise HTTPException(status_code=404, detail='Server not found') - challenge_uuid = str(uuid.uuid4()) - expires_at = datetime.now(timezone.utc) + timedelta(minutes=10) - ch = ServerChallenge(server_id=server_id, challenge_uuid=challenge_uuid, expires_at=expires_at) - db.add(ch) - db.commit() - return ChallengeResponse(identifier=server.identifier, challenge_uuid=challenge_uuid, expires_at=expires_at.isoformat()) - - @router.delete('/admin/servers/{server_id}', status_code=status.HTTP_204_NO_CONTENT) def delete_server(server_id: int, db: Session = Depends(get_db), _: models.User = Depends(require_admin)): obj = db.query(MonitoredServer).filter(MonitoredServer.id == server_id).first() @@ -165,8 +135,6 @@ def delete_server(server_id: int, db: Session = Depends(get_db), _: models.User state = db.query(ServerState).filter(ServerState.server_id == server_id).first() if state: db.delete(state) - db.query(ServerChallenge).filter(ServerChallenge.server_id == server_id).delete() - db.query(ServerHandshakeNonce).filter(ServerHandshakeNonce.server_id == server_id).delete() db.delete(obj) db.commit() return None @@ -269,93 +237,3 @@ def server_heartbeat_v2( db.commit() return {'ok': True, 'server_id': server.id, 'identifier': server.identifier, 'last_seen_at': st.last_seen_at} - -@router.websocket('/server/ws') -async def server_ws(websocket: WebSocket): - await websocket.accept() - db = SessionLocal() - server_id = None - try: - hello = await websocket.receive_json() - - encrypted_payload = (hello.get('encrypted_payload') or '').strip() - if encrypted_payload: - data = 
decrypt_payload_b64(encrypted_payload) - identifier = (data.get('identifier') or '').strip() - challenge_uuid = (data.get('challenge_uuid') or '').strip() - nonce = (data.get('nonce') or '').strip() - ts = data.get('ts') - if not ts_within(ts, max_minutes=10): - await websocket.close(code=4401) - return - else: - # backward compatible mode - identifier = (hello.get('identifier') or '').strip() - challenge_uuid = (hello.get('challenge_uuid') or '').strip() - nonce = (hello.get('nonce') or '').strip() - - if not identifier or not challenge_uuid or not nonce: - await websocket.close(code=4400) - return - - server = db.query(MonitoredServer).filter(MonitoredServer.identifier == identifier, MonitoredServer.is_enabled == True).first() - if not server: - await websocket.close(code=4404) - return - - ch = db.query(ServerChallenge).filter(ServerChallenge.challenge_uuid == challenge_uuid, ServerChallenge.server_id == server.id).first() - if not ch or ch.used_at is not None or ch.expires_at < datetime.now(timezone.utc): - await websocket.close(code=4401) - return - - nonce_used = db.query(ServerHandshakeNonce).filter(ServerHandshakeNonce.server_id == server.id, ServerHandshakeNonce.nonce == nonce).first() - if nonce_used: - await websocket.close(code=4409) - return - - db.add(ServerHandshakeNonce(server_id=server.id, nonce=nonce)) - ch.used_at = datetime.now(timezone.utc) - db.commit() - - server_id = server.id - ACTIVE_WS[server.id] = websocket - await websocket.send_json({'ok': True, 'server_id': server.id, 'message': 'connected'}) - - while True: - msg = await websocket.receive_json() - event = msg.get('event') - payload = msg.get('payload') or {} - st = db.query(ServerState).filter(ServerState.server_id == server.id).first() - if not st: - st = ServerState(server_id=server.id) - db.add(st) - - if event == 'server.hello': - st.openclaw_version = payload.get('openclaw_version') - st.plugin_version = payload.get('plugin_version') - st.agents_json = 
json.dumps(payload.get('agents') or [], ensure_ascii=False) - elif event in {'server.metrics', 'agent.status_changed'}: - st.cpu_pct = payload.get('cpu_pct', st.cpu_pct) - st.mem_pct = payload.get('mem_pct', st.mem_pct) - st.disk_pct = payload.get('disk_pct', st.disk_pct) - st.swap_pct = payload.get('swap_pct', st.swap_pct) - if 'openclaw_version' in payload: - st.openclaw_version = payload.get('openclaw_version') - if 'plugin_version' in payload: - st.plugin_version = payload.get('plugin_version') - if 'agents' in payload: - st.agents_json = json.dumps(payload.get('agents') or [], ensure_ascii=False) - - st.last_seen_at = datetime.now(timezone.utc) - db.commit() - except WebSocketDisconnect: - pass - except Exception: - try: - await websocket.close(code=1011) - except Exception: - pass - finally: - if server_id and ACTIVE_WS.get(server_id) is websocket: - ACTIVE_WS.pop(server_id, None) - db.close() diff --git a/app/models/monitor.py b/app/models/monitor.py index 13cad5a..b16a04f 100644 --- a/app/models/monitor.py +++ b/app/models/monitor.py @@ -59,22 +59,3 @@ class ServerState(Base): last_seen_at = Column(DateTime(timezone=True), nullable=True) updated_at = Column(DateTime(timezone=True), onupdate=func.now()) - -class ServerChallenge(Base): - __tablename__ = 'server_challenges' - - id = Column(Integer, primary_key=True, index=True) - server_id = Column(Integer, ForeignKey('monitored_servers.id'), nullable=False, index=True) - challenge_uuid = Column(String(64), nullable=False, unique=True, index=True) - expires_at = Column(DateTime(timezone=True), nullable=False) - used_at = Column(DateTime(timezone=True), nullable=True) - created_at = Column(DateTime(timezone=True), server_default=func.now()) - - -class ServerHandshakeNonce(Base): - __tablename__ = 'server_handshake_nonces' - - id = Column(Integer, primary_key=True, index=True) - server_id = Column(Integer, ForeignKey('monitored_servers.id'), nullable=False, index=True) - nonce = Column(String(128), 
nullable=False, index=True) - created_at = Column(DateTime(timezone=True), server_default=func.now()) From 9b5e2dc15cfaa41a6869d62fcbaf8da07df63bbf Mon Sep 17 00:00:00 2001 From: zhi Date: Fri, 20 Mar 2026 08:02:19 +0000 Subject: [PATCH 5/5] fix(monitor): harden server delete and remove challenge docs - Delete server state before monitored server to avoid FK 500s - Keep legacy cleanup for obsolete challenge tables - Rewrite monitor docs to API key-only flow --- app/api/routers/monitor.py | 18 +- docs/OPENCLAW_PLUGIN_DEV_PLAN.md | 530 +++------------------------ docs/openclaw-monitor-plugin-plan.md | 122 +++--- 3 files changed, 136 insertions(+), 534 deletions(-) diff --git a/app/api/routers/monitor.py b/app/api/routers/monitor.py index c17b76f..ba5557f 100644 --- a/app/api/routers/monitor.py +++ b/app/api/routers/monitor.py @@ -5,6 +5,7 @@ from typing import List from fastapi import APIRouter, Depends, Header, HTTPException, status from pydantic import BaseModel +from sqlalchemy import text from sqlalchemy.orm import Session from app.core.config import get_db @@ -132,9 +133,20 @@ def delete_server(server_id: int, db: Session = Depends(get_db), _: models.User obj = db.query(MonitoredServer).filter(MonitoredServer.id == server_id).first() if not obj: raise HTTPException(status_code=404, detail='Server not found') - state = db.query(ServerState).filter(ServerState.server_id == server_id).first() - if state: - db.delete(state) + + # Delete dependent rows first to avoid FK errors. + db.query(ServerState).filter(ServerState.server_id == server_id).delete(synchronize_session=False) + + # Backward-compatible cleanup for deprecated challenge tables that may still exist in older DBs. 
+ try: + db.execute(text('DELETE FROM server_handshake_nonces WHERE server_id = :server_id'), {'server_id': server_id}) + except Exception: + pass + try: + db.execute(text('DELETE FROM server_challenges WHERE server_id = :server_id'), {'server_id': server_id}) + except Exception: + pass + db.delete(obj) db.commit() return None diff --git a/docs/OPENCLAW_PLUGIN_DEV_PLAN.md b/docs/OPENCLAW_PLUGIN_DEV_PLAN.md index c22c829..0ef7222 100644 --- a/docs/OPENCLAW_PLUGIN_DEV_PLAN.md +++ b/docs/OPENCLAW_PLUGIN_DEV_PLAN.md @@ -1,494 +1,76 @@ -# OpenClaw Plugin 开发计划 +# OpenClaw Plugin 开发计划(当前版) -**文档版本**: 0.1.0 -**日期**: 2026-03-19 -**状态**: 开发中 +**状态**: API Key 方案已落地,challenge / WebSocket 旧方案已废弃。 ---- +## 当前架构 -## 1. 概述 +- HarborForge Monitor Backend 提供服务器注册与遥测接收接口 +- OpenClaw Gateway 加载 `harborforge-monitor` 插件 +- 插件在 `gateway_start` 时启动 sidecar (`server/telemetry.mjs`) +- sidecar 通过 **HTTP + X-API-Key** 向 Backend 上报遥测 -本文档定义 HarborForge.OpenclawPlugin 的开发计划,以及 Backend 需要提供的接口支持。 +## 当前后端接口 -### 1.1 目标 +### 公开接口 +- `GET /monitor/public/overview` -开发一个 OpenClaw 插件,将服务器遥测数据(系统指标 + OpenClaw 状态)实时传输到 HarborForge Monitor。 +### 管理接口 +- `GET /monitor/admin/servers` +- `POST /monitor/admin/servers` +- `DELETE /monitor/admin/servers/{id}` +- `POST /monitor/admin/servers/{id}/api-key` +- `DELETE /monitor/admin/servers/{id}/api-key` -### 1.2 架构关系 +### 插件上报接口 +- `POST /monitor/server/heartbeat-v2` + - Header: `X-API-Key` + - Body: + - `identifier` + - `openclaw_version` + - `plugin_version` + - `agents` + - `cpu_pct` + - `mem_pct` + - `disk_pct` + - `swap_pct` + - `load_avg` + - `uptime_seconds` -``` -┌─────────────────────────────────────────────────────────────┐ -│ 远程服务器 (VPS) │ -│ ┌──────────────────────────────────────────────────────┐ │ -│ │ OpenClaw Gateway │ │ -│ │ ┌────────────────────────────────────────────────┐ │ │ -│ │ │ HarborForge.OpenclawPlugin │ │ │ -│ │ │ - 生命周期管理 (随 Gateway 启动/停止) │ │ │ -│ │ │ - 启动 sidecar 进程 │ │ │ -│ │ └────────────────────────────────────────────────┘ 
│ │ -│ │ │ │ │ -│ │ ▼ 启动/管理 │ │ -│ │ ┌────────────────────────────────────────────────┐ │ │ -│ │ │ Sidecar (独立 Node 进程) │ │ │ -│ │ │ - 收集系统指标 (CPU/内存/磁盘) │ │ │ -│ │ │ - 读取 OpenClaw 状态 (agents) │ │ │ -│ │ │ - HTTP/WebSocket 上报到 Monitor │ │ │ -│ │ └────────────────────────────────────────────────┘ │ │ -│ └──────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────┘ - │ - │ HTTP / WebSocket - ▼ - ┌─────────────────────────┐ - │ HarborForge.Backend │ - │ - /monitor/* 接口 │ - │ - 数据存储 │ - └─────────────────────────┘ -``` +## 数据语义 ---- +- `openclaw_version`: 远程服务器上的 OpenClaw 版本 +- `plugin_version`: 远程服务器上的 harborforge-monitor 插件版本 -## 2. Backend 当前能力评估 +## 已废弃内容 -### 2.1 已实现接口 ✅ +以下旧方案已经废弃,不再作为实现路径: -| 接口 | 功能 | 完整度 | 说明 | -|------|------|--------|------| -| `GET /monitor/public/server-public-key` | 获取 RSA 公钥 | ✅ 100% | 用于插件加密 | -| `POST /admin/servers` | 注册服务器 | ✅ 100% | 返回 server_id | -| `POST /admin/servers/{id}/challenge` | 生成 challenge | ✅ 100% | 10分钟有效期 | -| `WS /monitor/server/ws` | WebSocket 连接 | ✅ 100% | 完整验证逻辑 | -| `POST /monitor/server/heartbeat` | HTTP 心跳 | ⚠️ 50% | 缺少安全验证 | +- challenge UUID +- `GET /monitor/public/server-public-key` +- `POST /monitor/admin/servers/{id}/challenge` +- `WS /monitor/server/ws` +- challenge / nonce 握手逻辑 -### 2.2 当前 HTTP Heartbeat 问题 🔴 +## 前端管理页要求 -```python -# 当前实现 (app/api/routers/monitor.py:191-207) -@router.post('/server/heartbeat') -def server_heartbeat(payload: ServerHeartbeat, db: Session = Depends(get_db)): - server = db.query(MonitoredServer).filter( - MonitoredServer.identifier == payload.identifier - ).first() - # 问题:只验证 identifier 存在,不验证 challenge! 
- # 任何人知道 identifier 就可以伪造数据 -``` +Monitor 管理页应提供: -**对比 WebSocket 实现**: -```python -# WebSocket 有完整验证 -ch = db.query(ServerChallenge).filter( - ServerChallenge.challenge_uuid == challenge_uuid, - ServerChallenge.server_id == server.id -).first() -if not ch or ch.used_at is not None or ch.expires_at < now(): - await websocket.close(code=4401) # 验证失败 -``` +- Add Server +- Generate API Key +- Revoke API Key +- Delete Server ---- +不再提供 `Generate Challenge`。 -## 3. Backend 需要补充的接口 +## 运行流程 -### 3.1 方案 A:增强 HTTP Heartbeat(推荐短期方案) +1. 管理员在 Monitor 中注册服务器 +2. 管理员为服务器生成 API Key +3. 将 API Key 写入 `~/.openclaw/openclaw.json` +4. 重启 OpenClaw Gateway +5. 插件启动 sidecar +6. sidecar 定时向 `/monitor/server/heartbeat-v2` 上报 -添加 challenge_uuid 验证: +## 备注 -```python -@router.post('/server/heartbeat') -def server_heartbeat( - payload: ServerHeartbeatSecure, # 包含 challenge_uuid - db: Session = Depends(get_db) -): - # 1. 验证服务器 - server = db.query(MonitoredServer).filter(...).first() - if not server: - raise HTTPException(404, 'unknown server') - - # 2. 验证 challenge - ch = db.query(ServerChallenge).filter( - ServerChallenge.challenge_uuid == payload.challenge_uuid, - ServerChallenge.server_id == server.id - ).first() - - if not ch or ch.expires_at < now(): - raise HTTPException(401, 'invalid or expired challenge') - - # 3. 存储数据... -``` - -**优点**: 与现有 WebSocket 验证逻辑一致 -**缺点**: Challenge 10分钟过期,需要定期重新注册 - -### 3.2 方案 B:API Key 模式(推荐长期方案) - -添加长期有效的 API Key: - -```python -# 1. 模型添加 api_key 字段 -class MonitoredServer(Base): - ... - api_key = Column(String(64), nullable=True, unique=True, index=True) - -# 2. 新增接口:生成/重置 API Key -@router.post('/admin/servers/{id}/api-key') -def generate_api_key(server_id: int, ...): - api_key = secrets.token_urlsafe(32) - # 存储并返回 (仅显示一次) - -# 3. 
心跳接口验证 API Key -@router.post('/server/heartbeat-v2') -def server_heartbeat_v2( - payload: ServerHeartbeat, - x_api_key: str = Header(...), - db: Session = Depends(get_db) -): - server = db.query(MonitoredServer).filter( - MonitoredServer.identifier == payload.identifier, - MonitoredServer.api_key == x_api_key - ).first() - if not server: - raise HTTPException(401, 'invalid credentials') -``` - -**优点**: 长期有效,适合自动化 Agent -**缺点**: 需要新增数据库字段和接口 - -### 3.3 方案 C:加密 Payload(最高安全) - -参考 WebSocket 的 encrypted_payload: - -```python -@router.post('/server/heartbeat') -def server_heartbeat( - encrypted_payload: str = Body(...), # RSA-OAEP 加密 - db: Session = Depends(get_db) -): - # 1. 解密 - data = decrypt_payload_b64(encrypted_payload) - - # 2. 验证时间戳 (防重放) - if not ts_within(data['ts'], max_minutes=10): - raise HTTPException(401, 'expired') - - # 3. 验证 challenge - ch = db.query(ServerChallenge).filter( - challenge_uuid=data['challenge_uuid'] - ).first() - ... -``` - -**优点**: 最高安全性 -**缺点**: 客户端实现复杂,需要 RSA 加密 - ---- - -## 4. 
OpenclawPlugin 开发计划 - -### Phase 1: 基础功能开发(2-3天) - -**目标**: 可运行的基础版本(开发环境) - -| 任务 | 说明 | 依赖 | -|------|------|------| -| 1.1 Sidecar 基础架构 | Node.js 项目结构,配置加载 | 无 | -| 1.2 系统指标收集 | CPU/内存/磁盘/运行时间 | 无 | -| 1.3 OpenClaw 状态读取 | 读取 agents.json,版本信息 | 无 | -| 1.4 HTTP 心跳上报 | 使用当前 /heartbeat 接口 | ⚠️ 不安全,仅开发 | -| 1.5 Plugin 生命周期 | 随 Gateway 启动/停止 Sidecar | 无 | - -**验收标准**: -- [ ] 可以收集系统指标 -- [ ] 可以上报到 Backend -- [ ] 可以在 Monitor 面板看到数据 - -### Phase 2: 安全增强(2-3天) - -**目标**: 生产环境可用的安全版本 - -| 任务 | 说明 | 依赖 | -|------|------|------| -| 2.1 WebSocket 支持 | 实现 WS 连接和加密握手 | Backend WS 接口 ✅ | -| 2.2 或:等待 HTTP 增强 | Backend 添加 challenge 验证 | Backend 更新 | -| 2.3 重试/退避逻辑 | 连接失败时指数退避 | 无 | -| 2.4 离线缓存 | 暂时存储,恢复后批量上报 | 无 | - -**验收标准**: -- [ ] 连接需要验证(WebSocket 或增强 HTTP) -- [ ] 网络中断后自动恢复 -- [ ] 数据不丢失 - -### Phase 3: 生产就绪(1-2天) - -**目标**: 稳定可靠的监控系统 - -| 任务 | 说明 | 依赖 | -|------|------|------| -| 3.1 日志和诊断 | 结构化日志,调试接口 | 无 | -| 3.2 性能优化 | 减少资源占用 | 无 | -| 3.3 安装脚本完善 | 参考 PaddedCell 格式 | 无 | -| 3.4 文档编写 | 部署指南,故障排查 | 无 | - -**验收标准**: -- [ ] 长时间稳定运行(7天+) -- [ ] 资源占用 < 1% CPU,< 50MB 内存 -- [ ] 安装脚本一键部署 - ---- - -## 5. 接口规格详细定义 - -### 5.1 当前可用接口 - -#### GET /monitor/public/server-public-key -```yaml -Response: - public_key_pem: string # RSA 公钥 (PEM 格式) - key_id: string # 公钥指纹 -``` - -#### POST /admin/servers -```yaml -Headers: - Authorization: Bearer {admin_token} -Body: - identifier: string # 唯一标识 (如 "vps.t1") - display_name: string # 显示名称 -Response: - id: int - identifier: string - challenge_uuid: string # 10分钟有效 - expires_at: ISO8601 -``` - -#### WS /monitor/server/ws -```yaml -连接流程: - 1. Client -> Server: GET /monitor/server/ws (Upgrade) - 2. Client -> Server: { - "encrypted_payload": "base64_rsa_encrypted_json" - } - # 或明文(向后兼容): - # { - # "identifier": "vps.t1", - # "challenge_uuid": "...", - # "nonce": "...", - # "ts": "2026-03-19T14:00:00Z" - # } - 3. Server -> Client: { "ok": true, "server_id": 1 } - 4. 
Client -> Server: { - "event": "server.metrics", - "payload": { "cpu_pct": 12.5, "mem_pct": 41.2, ... } - } -``` - -#### POST /monitor/server/heartbeat(当前版本,不安全) -```yaml -Body: - identifier: string - openclaw_version: string - agents: [{id, name, status}] - cpu_pct: float - mem_pct: float - disk_pct: float - swap_pct: float -Response: - ok: true - server_id: int - last_seen_at: ISO8601 -``` - -### 5.2 建议新增接口 - -#### POST /server/heartbeat-secure(增强版) -```yaml -Body: - identifier: string - challenge_uuid: string # 新增:必填 - openclaw_version: string - agents: [...] - cpu_pct: float - mem_pct: float - disk_pct: float - swap_pct: float - timestamp: ISO8601 # 可选:防重放 -Response: - ok: true - server_id: int - last_seen_at: ISO8601 - challenge_expires_at: ISO8601 -Error: - 401: { detail: "invalid or expired challenge" } -``` - ---- - -## 6. 数据模型 - -### 6.1 当前 Backend 模型 - -```python -# app/models/monitor.py - -class MonitoredServer: - id: int - identifier: str # 唯一标识 - display_name: str - is_enabled: bool - created_by: int - created_at: datetime - # 建议添加: - # api_key: str # 长期有效的 API Key - -class ServerChallenge: - id: int - server_id: int - challenge_uuid: str # 10分钟有效 - expires_at: datetime - used_at: datetime # 首次使用时间 - created_at: datetime - -class ServerState: - id: int - server_id: int - openclaw_version: str - agents_json: str # JSON 序列化 - cpu_pct: float - mem_pct: float - disk_pct: float - swap_pct: float - last_seen_at: datetime - updated_at: datetime -``` - -### 6.2 Plugin 配置模型 - -```typescript -// ~/.openclaw/openclaw.json -{ - "plugins": { - "harborforge-monitor": { - "enabled": true, - "backendUrl": "https://monitor.hangman-lab.top", - "identifier": "vps.t1", // 服务器标识 - "challengeUuid": "uuid-here", // 从 /admin/servers/{id}/challenge 获取 - "apiKey": "key-here", // 如果使用 API Key 模式(可选) - "reportIntervalSec": 30, - "httpFallbackIntervalSec": 60, - "logLevel": "info" - } - } -} -``` - ---- - -## 7. 
开发时序图 - -### 7.1 首次部署流程 - -```mermaid -sequenceDiagram - participant Admin - participant Backend - participant Plugin - participant Server as Server State - - Admin->>Backend: POST /admin/servers<br/>{identifier: "vps.t1"} - Backend->>Admin: {id: 1, identifier: "vps.t1"} - - Admin->>Backend: POST /admin/servers/1/challenge - Backend->>Admin: {challenge_uuid: "abc-123", expires_at: "..."} - - Admin->>Server: 配置 challenge_uuid - - Note over Server: ~/.openclaw/openclaw.json - - Server->>Backend: openclaw gateway restart - - Plugin->>Backend: GET /monitor/public/server-public-key - Backend->>Plugin: {public_key_pem: "..."} - - alt WebSocket 模式 - Plugin->>Backend: WS /monitor/server/ws - Plugin->>Backend: {challenge_uuid, nonce, ts} - Backend->>Plugin: {ok: true} - loop 每 30 秒 - Plugin->>Backend: {event: "server.metrics", payload: {...}} - end - else HTTP 模式 - loop 每 30 秒 - Plugin->>Backend: POST /server/heartbeat<br/>{challenge_uuid, ...} - Backend->>Plugin: {ok: true} - end - end -``` - -### 7.2 数据上报格式 - -```json -{ - "identifier": "vps.t1", - "challenge_uuid": "550e8400-e29b-41d4-a716-446655440000", - "timestamp": "2026-03-19T14:30:00Z", - - "cpu_pct": 12.5, - "mem_pct": 41.2, - "mem_used_mb": 4096, - "mem_total_mb": 8192, - "disk_pct": 62.0, - "disk_used_gb": 500.5, - "disk_total_gb": 1000.0, - "swap_pct": 0.0, - "uptime_sec": 86400, - "load_avg_1m": 0.5, - "platform": "linux", - "hostname": "vps.t1", - - "openclaw_version": "1.2.3", - "openclaw_agent_count": 2, - "openclaw_agents": [ - {"id": "dev", "name": "Developer", "status": "running"}, - {"id": "ops", "name": "Operator", "status": "idle"} - ] -} -``` - ---- - -## 8. 风险与缓解 - -| 风险 | 影响 | 缓解措施 | -|------|------|----------| -| HTTP Heartbeat 无验证 | 数据伪造 | 使用 WebSocket 或等待 Backend 修复 | -| Challenge 10分钟过期 | 需要频繁更新 | Backend 添加 API Key 模式 | -| 网络中断 | 数据丢失 | Plugin 实现离线缓存 | -| 资源占用过高 | 影响业务 | 控制采集频率,优化实现 | -| Sidecar 崩溃 | 监控中断 | Plugin 自动重启 Sidecar | - ---- - -## 9. 下一步行动 - -### Backend 团队 -- [ ] 决定采用方案 A/B/C 增强 HTTP Heartbeat 安全 -- [ ] 实现 `/server/heartbeat-secure` 或增强现有接口 -- [ ] (可选)添加 API Key 支持 - -### Plugin 开发团队 -- [ ] Phase 1: 基础功能开发(使用当前不安全 HTTP,仅开发测试) -- [ ] Phase 2: 集成 WebSocket(立即可用,最安全) -- [ ] 等待 Backend 更新后,切换到安全 HTTP - ---- - -## 10. 
参考文档 - -- 原始设计文档: `docs/monitor-server-connector-plan.md` -- Backend 代码: `app/api/routers/monitor.py` -- Backend 模型: `app/models/monitor.py` -- 加密服务: `app/services/crypto_box.py` -- PaddedCell 安装脚本参考: `https://git.hangman-lab.top/nav/PaddedCell` - ---- - -**文档维护者**: HarborForge Team -**更新频率**: 随开发进度更新 +当前保留了对旧 challenge 数据表的**删除兼容清理**(仅为兼容老数据库中的遗留数据),但不再保留 challenge 功能入口与运行时逻辑。 diff --git a/docs/openclaw-monitor-plugin-plan.md b/docs/openclaw-monitor-plugin-plan.md index 80c0f18..4dd1c4c 100644 --- a/docs/openclaw-monitor-plugin-plan.md +++ b/docs/openclaw-monitor-plugin-plan.md @@ -1,68 +1,76 @@ -# OpenClaw Monitor Agent Plugin 开发计划(草案) +# HarborForge Monitor / OpenClaw Plugin Connector Plan ## 目标 -让被监测服务器通过 WebSocket 主动接入 HarborForge Backend,并持续上报: -- OpenClaw 版本 -- agent 列表 -- 每 5 分钟主机指标(CPU/MEM/DISK/SWAP) -- agent 状态变更事件 -## 握手流程 -1. Admin 在 HarborForge 后台添加 server identifier -2. Admin 生成 challenge(10 分钟有效) -3. 插件请求 `GET /monitor/public/server-public-key` 获取公钥 -4. 插件构造 payload: - - `identifier` - - `challenge_uuid` - - `nonce`(随机) - - `ts`(ISO8601) -5. 使用 RSA-OAEP(SHA256) 公钥加密,base64 后作为 `encrypted_payload` 发给 `WS /monitor/server/ws` -6. 
握手成功后进入事件上报通道 +使用 **API Key + HTTP heartbeat** 连接 HarborForge Monitor 与远程 OpenClaw 节点。 -## 插件事件协议 -### server.hello +## 认证方式 + +- 管理员为服务器生成 API Key +- 插件通过 `X-API-Key` 调用 heartbeat 接口 +- 不再使用 challenge / RSA 公钥 / WebSocket 握手 + +## 上报接口 + +`POST /monitor/server/heartbeat-v2` + +### Headers +- `X-API-Key: <your-api-key>` + +### Payload ```json { - "event": "server.hello", - "payload": { - "openclaw_version": "x.y.z", - "agents": [{"id": "a1", "name": "agent-1", "status": "idle"}] + "identifier": "vps.t1", + "openclaw_version": "OpenClaw 2026.3.13 (61d171a)", + "plugin_version": "0.1.0", + "agents": [ + { "id": "agent-bot1", "name": "agent-bot1", "status": "configured" } + ], + "cpu_pct": 12.3, + "mem_pct": 45.6, + "disk_pct": 78.9, + "swap_pct": 0, + "load_avg": [0.12, 0.08, 0.03], + "uptime_seconds": 12345 +} +``` + +## 语义 + +- `openclaw_version`: 远程主机上的 OpenClaw 版本 +- `plugin_version`: harborforge-monitor 插件版本 + +## 插件生命周期 + +- 插件注册到 Gateway +- 在 `gateway_start` 启动 `server/telemetry.mjs` +- 在 `gateway_stop` 停止 sidecar + +## 配置位置 + +`~/.openclaw/openclaw.json` + +```json +{ + "plugins": { + "entries": { + "harborforge-monitor": { + "enabled": true, + "config": { + "enabled": true, + "backendUrl": "http://127.0.0.1:8000", + "identifier": "vps.t1", + "apiKey": "your-api-key" + } + } + } } } ``` -### server.metrics(每 5 分钟) -```json -{ - "event": "server.metrics", - "payload": { - "cpu_pct": 21.3, - "mem_pct": 42.1, - "disk_pct": 55.9, - "swap_pct": 0.0, - "agents": [{"id": "a1", "name": "agent-1", "status": "busy"}] - } -} -``` +## 已废弃 -### agent.status_changed(可选) -```json -{ - "event": "agent.status_changed", - "payload": { - "agents": [{"id": "a1", "name": "agent-1", "status": "focus"}] - } -} -``` - -## 实施里程碑 -- M1: Node/Python CLI 插件最小握手联通 -- M2: 指标采集 + 周期上报 -- M3: agent 状态采集与变更事件 -- M4: 守护化(systemd)+ 断线重连 + 本地日志 - -## 风险与注意事项 -- 时钟漂移会导致 `ts` 校验失败(建议 NTP) -- challenge 仅一次可用,重复使用会被拒绝 -- nonce 重放会被拒绝 -- 需要保证插件本地安全保存 identifier/challenge(短期) +- challenge UUID +- server public key 
+- WebSocket telemetry +- encrypted handshake payload