commit 49bcde41ec0963176d3c06bb4238743d4673ebbe Author: hzhang Date: Wed May 27 10:17:54 2026 +0100 init: OpenClaw Perf Cache — fs.{stat,lstat,realpath}{,Sync} TTL memo for plugin-tree paths Wraps the global fs functions with a 1s TTL memo, scoped via path whitelist to plugin-discovery paths only. Workaround for upstream openclaw issue #86791: `loadPluginMetadataSnapshot()`'s cache-validity check re-runs `hashWatchedFiles` on every lookup, which walks every plugin's package.json + manifest + source via realpathSync -> ancestor lstat chain. On prod t2 with ~100 plugins, one cache-check pass is ~6 400 lstat + ~400 stat (~6-7s CPU per call). Fires on every agent turn, every loadConfig() call, every channel routing decision. This plugin doesn't fix the upstream design; it just absorbs the repeated stats within a 1s window so the same paths aren't re-statted 6× per second during a discovery walk. Verified on prod t2 (2026-05-27): - Cache hit ratio: 92.1-98.2% (stable across windows) - Idle baseline (0 turn, 0 push): 0.6-3.7% CPU (was 25%+ pre-fix) - Per-turn cost: notably reduced; previously 100% sustained per turn Path whitelist: - /openclaw/dist/extensions/ - /.openclaw/plugins/ - /node_modules/@openclaw/ - /openclaw/plugin-sdk/ All other paths pass through to original fs functions unchanged. Manifest requires `activation.onStartup: true` so openclaw register()s the plugin even though it exposes no tools/contracts (otherwise jiti caches the module without ever calling register). 
Co-Authored-By: Claude Opus 4.7 (1M context) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dd2d906 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +node_modules/ +plugin/index.js +plugin/index.js.map +plugin/node_modules/ +*.bak.* diff --git a/README.md b/README.md new file mode 100644 index 0000000..c64a15c --- /dev/null +++ b/README.md @@ -0,0 +1,108 @@ +# OpenClaw Perf Cache + +A 200-line OpenClaw plugin that wraps `fs.statSync` / `fs.lstatSync` / +`fs.realpathSync` (and their `fs.promises` async siblings) with a 1-second +TTL memo, **scoped to plugin-tree paths only**, as a workaround for an +upstream openclaw performance bug. + +## Why this exists + +`loadPluginMetadataSnapshot()` in openclaw's `dist/plugin-metadata-snapshot-*.js` +keeps a memo of the resolved plugin registry, but its cache-validity check +runs `hashWatchedFiles(memo.watchedFiles)` on **every lookup**. That call +re-fingerprints every plugin's `package.json` + `openclaw.plugin.json` + +source + setupSource paths via `realpathSync` → ancestor `lstat` chain. + +On a prod gateway with ~100 installed plugins (the bundled +`dist/extensions/*` set), one cache-check pass is roughly: + +``` +100 plugins × 4 watched-files × 2 realpath/file × ~8 lstat/realpath + ≈ 6 400 lstat + ~400 stat per call + ≈ 6–7 s CPU +``` + +The check fires from many call sites — every agent turn (tool middleware +loader), every `loadConfig()` call, every channel routing decision — +turning what should be a cheap snapshot hit into a sustained CPU drain. + +Same hot path is observed in these upstream tickets: + +- [#86791 — repeated lstat/realpathSync in InstalledPluginIndex fingerprinting (memoization missing)](https://github.com/openclaw/openclaw/issues/86791) — **open, P2**, exact same call chain (`lstat and realpathSync under resolvePackageJsonPath -> buildInstalledManifestRegistryIndexKey -> resolveInstalledManifestRegistryIndexFingerprint`); two linked PRs (#86797, #86850) in progress. 
Once that lands, this plugin becomes unnecessary. +- [#67040 — persist plugin discovery cache + defer plugin loading](https://github.com/openclaw/openclaw/issues/67040) (closed as *not planned*) +- [#75297 — gateway event-loop saturation, very slow sessions.list after 2026.4.23](https://github.com/openclaw/openclaw/issues/75297) (workaround: rollback to 2026.4.23) +- [#28587 — plugin runtime eagerly loads channel SDKs causing sustained high CPU on startup](https://github.com/openclaw/openclaw/issues/28587) (closed by PR #28620, but only fixed the startup path, not the per-turn cost) + +## What this plugin does + +On `register()` it patches the global fs functions: + +| Wrapped | Pass-through when | +|---|---| +| `fs.statSync` | path does NOT match a plugin-tree needle | +| `fs.lstatSync` | path does NOT match a plugin-tree needle | +| `fs.realpathSync` | path does NOT match a plugin-tree needle | +| `fs.promises.stat` | path does NOT match a plugin-tree needle | +| `fs.promises.lstat` | path does NOT match a plugin-tree needle | +| `fs.promises.realpath` | path does NOT match a plugin-tree needle | + +Plugin-tree needles (substring match — any one matches): + +- `/openclaw/dist/extensions/` +- `/.openclaw/plugins/` +- `/node_modules/@openclaw/` +- `/openclaw/plugin-sdk/` + +Matched calls get a 1 000 ms TTL memo keyed by `(fn-name, path, JSON(opts))`. +Cached errors throw the same error on subsequent reads within the window. + +Counters are logged once a minute: + +``` +[perf-cache] last 60s: hits=812 misses=27 (hit-ratio 96.8%) passthrough=1493 errors=0 cache_size=804 +``` + +`passthrough` = calls that bypassed memo because the path wasn't a plugin +tree path — that count is essentially "rest of the system" and should be +mostly unchanged by us. + +## Safety notes + +- **Pass-through for non-plugin paths.** Business code (logs, session files, + skills/, secrets/, anything outside the whitelist) sees the unmodified + `fs`. 
Only plugin-discovery paths are intercepted. +- **1 s TTL.** Plugin manifest mtime resolution is millisecond level, so a + manifest change becomes visible at most ~1 s later. Dev-loop impact is + negligible. +- **Bounded memory.** `cache.clear()` fires when entries > 4 000. +- **Idempotent.** Module re-import (jiti reload) is a no-op via a sentinel + flag on `globalThis`. +- **Argument-aware.** Cache key includes a JSON of trailing args so + `statSync(p)` and `statSync(p, { bigint: true })` don't collide. + +## Install + +```bash +git clone https://git.hangman-lab.top/hzhang/OpenclawPerfCache.git +cd OpenclawPerfCache +npm --prefix plugin install +node scripts/install.mjs --install +systemctl --user restart openclaw-gateway +``` + +## Update (rebuild + recopy, no config touch) + +```bash +node scripts/install.mjs --update +systemctl --user restart openclaw-gateway +``` + +## Uninstall + +```bash +node scripts/install.mjs --uninstall +systemctl --user restart openclaw-gateway +``` + +If openclaw ever fixes the upstream cache-validity-check, this plugin can +be uninstalled with no consequence. diff --git a/plugin/index.ts b/plugin/index.ts new file mode 100644 index 0000000..7f8b4a8 --- /dev/null +++ b/plugin/index.ts @@ -0,0 +1,277 @@ +/** + * OpenClaw Perf Cache — fs.stat/lstat/realpath TTL memo for plugin-tree paths. + * + * Why this exists + * =============== + * + * Upstream openclaw's `loadPluginMetadataSnapshot()` (in + * `dist/plugin-metadata-snapshot-*.js`) maintains a memo cache of the plugin + * registry, but the cache-validity check it runs on every lookup itself does + * O(N) filesystem work: + * + * resolvePersistedRegistryMemoStateForLookup(params, memo): + * ... 
+ * if (registryState && contextHash matches && fastHash matches + * && hashWatchedFiles(registryState.watchedFiles) === registryState.watchedFilesHash) + * return registryState; // ← `hashWatchedFiles` re-fingerprints every watched file + * + * `hashWatchedFiles` calls `fileFingerprint(path)` (statx) for every plugin + * package.json + openclaw.plugin.json + source + setupSource, and the + * watched-file collection is built by `persistedPluginFileFingerprint` which + * in turn calls `resolvePluginFilePath` -> `tryRealpath` (which is + * `fs.realpathSync`, walking the ancestor chain via `lstat` for each path + * segment). + * + * On prod t2 (~100 installed plugins, mostly bundled extensions under + * `/usr/lib/node_modules/openclaw/dist/extensions//`), one cache-check + * call costs roughly: + * + * 100 plugins × 4 watched files/plugin × 2 realpath/file × ~8 lstat/realpath + * ≈ 6400 lstat + ~400 stat per call (~6-7s of CPU per call) + * + * The deprecated `loadConfig()` path was firing one of these every 30s from + * HF plugin's `pushMetaToMonitor` (separate fix: zhi/HarborForge.OpenclawPlugin#11), + * and every agent turn fires one too (per the tool middleware loader). The + * push-driven baseline is gone; the per-turn cost is the remaining chronic + * load. + * + * Two upstream tickets have closed without a fix for this same hot path: + * - https://github.com/openclaw/openclaw/issues/67040 (closed as not planned) + * - https://github.com/openclaw/openclaw/issues/75297 (closed, no fix; rollback to 2026.4.23 was the workaround) + * + * What this plugin does + * ===================== + * + * On `register()` (which runs before any agent turn), wrap the global + * `fs.statSync`, `fs.lstatSync`, `fs.realpathSync` and their `fs.promises` + * counterparts with a small TTL memo. The wrapper is a no-op (pass-through) + * for any path that is NOT under a plugin tree, so general fs use elsewhere + * is unaffected. 
+ * + * Path whitelist (anything else falls through to the original): + * - `/openclaw/dist/extensions/` (bundled openclaw channel SDKs) + * - `/.openclaw/plugins/` (user-installed plugins) + * - `/node_modules/@openclaw/` (managed npm plugin packages) + * - `/openclaw/plugin-sdk/` (SDK module imports) + * + * TTL: 1000ms. Within that window, repeated stats of the same path return the + * cached result. Two cache-check calls back-to-back (which is what the + * snapshot lookup does on each invocation) now cost ~0 instead of ~7s. + * + * Safety + * ====== + * + * - The wrappers are bound on the original functions, so `this` and the full + * argument list are preserved (including `options` like `{ bigint: true }`). + * - Cache key includes a JSON of the trailing args so different option shapes + * for the same path don't collide (e.g. `statSync(p)` vs `statSync(p,{bigint:true})`). + * - Pass-through for non-plugin paths: business code (logs, session files, + * skills/, secrets/, anything outside the whitelist) sees the unmodified fs. + * - 1s TTL: plugin manifest mtime resolution is ms-level, so a manifest change + * becomes visible at most ~1s later. dev-loop impact is negligible. + * - Bounded memory: cache.clear() when >4000 entries (~few hundred KB max). + * - Idempotent: a sentinel flag prevents double-wrapping across plugin reloads. + * - Counts are tracked and logged every minute so we can see hit ratio in + * journalctl and validate the workaround is actually firing. + * + * If openclaw ever fixes the upstream cache-validity-check (issue text in + * the comment above), this plugin can be uninstalled without consequence. + */ + +import fs from 'node:fs'; +import type { Stats, BigIntStats } from 'node:fs'; + +const TTL_MS = 1000; +const SOFT_CAP = 4000; + +// Path-prefix substring match. Anything that matches → memoized. Anything +// that doesn't → pass-through to the original. 
// Keep the list short and only add patterns where the same path is statted
// many times per second by the plugin-discovery hot path.
const HOT_PATH_NEEDLES = [
  '/openclaw/dist/extensions/',
  '/.openclaw/plugins/',
  '/node_modules/@openclaw/',
  '/openclaw/plugin-sdk/',
];

/**
 * Returns true when `p` is a string that contains any plugin-tree needle.
 * This is a substring test, not a prefix test. Non-string paths (Buffer, URL)
 * are never memoized and always fall through to the original fs function.
 */
function isHotPath(p: unknown): p is string {
  return typeof p === 'string' && HOT_PATH_NEEDLES.some((needle) => p.includes(needle));
}

/** One memoized outcome: either a return value or a thrown error. */
interface CacheEntry {
  result: unknown;
  isError: boolean;
  expiresAt: number;
}

const cache = new Map<string, CacheEntry>();
const counters = { hits: 0, misses: 0, passthrough: 0, errors: 0 };

/**
 * Crude memory bound: drop the whole cache once it grows past SOFT_CAP.
 * Expired entries are not removed individually; they are overwritten on the
 * next miss for the same key or discarded here.
 */
function evictIfFull() {
  if (cache.size > SOFT_CAP) cache.clear();
}

/**
 * Builds the cache key from the wrapped function name, the path and the
 * trailing arguments, separated by NUL so the parts cannot run together.
 */
function buildKey(name: string, path: string, args: unknown[]): string {
  // args[0] for these is typically an encoding or a {bigint:true} option —
  // keep it in the key so different shapes don't collide.
  let opts = '';
  if (args.length > 0) {
    // JSON.stringify throws on BigInt / cyclic values; fall back to arity.
    try { opts = JSON.stringify(args); } catch { opts = String(args.length); }
  }
  return `${name}\x00${path}\x00${opts}`;
}

/**
 * Wraps a synchronous fs function with the TTL memo.
 *
 * - Non-hot paths are forwarded untouched (counted as `passthrough`).
 * - Hot paths are served from `cache` while the entry is younger than TTL_MS.
 * - Thrown errors are cached too and re-thrown on hits within the window.
 *
 * The cached value is handed out by reference: every caller within the TTL
 * window receives the same result object (or the same error instance).
 *
 * @param name label used in the cache key (one per wrapped function)
 * @param orig the original fs function
 * @returns a drop-in replacement carrying the original's own properties
 */
function wrapSync<F extends (...args: any[]) => any>(
  name: string,
  orig: F,
): F {
  const wrapped = function (this: unknown, path: unknown, ...rest: any[]): any {
    if (!isHotPath(path)) {
      counters.passthrough++;
      return orig.call(this, path, ...rest);
    }
    const now = Date.now();
    const key = buildKey(name, path, rest);
    const hit = cache.get(key);
    if (hit && hit.expiresAt > now) {
      counters.hits++;
      if (hit.isError) throw hit.result;
      return hit.result;
    }
    counters.misses++;
    evictIfFull();
    let result: unknown;
    let isError = false;
    try {
      result = orig.call(this, path, ...rest);
    } catch (err) {
      // ENOENT and friends — cache the error too so repeated "does this file
      // exist" probes don't restat the kernel. Same TTL applies.
      result = err;
      isError = true;
      counters.errors++;
    }
    cache.set(key, { result, isError, expiresAt: now + TTL_MS });
    if (isError) throw result;
    return result;
  };
  // Preserve properties hung off the original function. Without this,
  // replacing `fs.realpathSync` drops `fs.realpathSync.native`, and any caller
  // of `.native` would hit "TypeError: ... is not a function".
  Object.assign(wrapped, orig);
  return wrapped as unknown as F;
}

/**
 * Async counterpart of `wrapSync` for the `fs.promises` functions. Same key,
 * TTL and error-caching rules; rejections are cached and re-thrown.
 *
 * Concurrent misses for the same key are not coalesced: each one calls the
 * original, and the last to settle wins the cache slot.
 *
 * @param name label used in the cache key (one per wrapped function)
 * @param orig the original promise-returning fs function
 * @returns a drop-in replacement carrying the original's own properties
 */
function wrapAsync<F extends (...args: any[]) => Promise<any>>(
  name: string,
  orig: F,
): F {
  const wrapped = async function (this: unknown, path: unknown, ...rest: any[]): Promise<any> {
    if (!isHotPath(path)) {
      counters.passthrough++;
      return orig.call(this, path, ...rest);
    }
    const now = Date.now();
    const key = buildKey(name, path, rest);
    const hit = cache.get(key);
    if (hit && hit.expiresAt > now) {
      counters.hits++;
      if (hit.isError) throw hit.result;
      return hit.result;
    }
    counters.misses++;
    evictIfFull();
    try {
      const result = await orig.call(this, path, ...rest);
      cache.set(key, { result, isError: false, expiresAt: now + TTL_MS });
      return result;
    } catch (err) {
      counters.errors++;
      cache.set(key, { result: err, isError: true, expiresAt: now + TTL_MS });
      throw err;
    }
  };
  // Keep parity with wrapSync: carry over any own properties of the original.
  Object.assign(wrapped, orig);
  return wrapped as unknown as F;
}

/** The subset of the host plugin API this plugin touches. */
interface PluginAPI {
  logger?: {
    info?: (...args: unknown[]) => void;
    warn?: (...args: unknown[]) => void;
    debug?: (...args: unknown[]) => void;
  };
  on?: (event: string, handler: () => void) => void;
}

const SENTINEL = '__openclawPerfCacheInstalled' as const;
const _G = globalThis as Record<string, unknown>;
let statsTimer: ReturnType<typeof setInterval> | null = null;

/**
 * Patches the global fs functions once per process and starts the
 * once-a-minute counter log. A flag on `globalThis` makes repeated calls
 * (e.g. after a module reload) a no-op.
 *
 * NOTE(review): only the properties of the `fs` object are replaced here.
 * Per the Node.js docs, ESM *named* imports of a builtin
 * (`import { statSync } from 'node:fs'`) are refreshed only by
 * `module.syncBuiltinESMExports()` — confirm how openclaw's hot path imports
 * fs before relying on the wrappers for such callers.
 *
 * @param logger optional host logger; every method on it is optional
 */
function install(logger: PluginAPI['logger']): void {
  if (_G[SENTINEL]) {
    logger?.debug?.('[perf-cache] already installed; skipping');
    return;
  }
  _G[SENTINEL] = true;

  // Sync versions
  fs.statSync = wrapSync('statSync', fs.statSync) as typeof fs.statSync;
  fs.lstatSync = wrapSync('lstatSync', fs.lstatSync) as typeof fs.lstatSync;
  fs.realpathSync = wrapSync('realpathSync', fs.realpathSync) as typeof fs.realpathSync;

  // Async (promises) versions — `lstat` was hot on prod profile (~38% even
  // after the push-driven baseline was killed)
  const fsp = fs.promises;
  fsp.stat = wrapAsync('stat', fsp.stat.bind(fsp)) as typeof fsp.stat;
  fsp.lstat = wrapAsync('lstat', fsp.lstat.bind(fsp)) as typeof fsp.lstat;
  fsp.realpath = wrapAsync('realpath', fsp.realpath.bind(fsp)) as typeof fsp.realpath;

  logger?.info?.(
    '[perf-cache] installed fs.{stat,lstat,realpath}Sync + fs.promises.{stat,lstat,realpath} ' +
      `with ${TTL_MS}ms TTL for plugin-tree paths only`,
  );

  // Periodic counter log so we can see hit ratio in journalctl. Reset after
  // logging so each line is "since last log".
  statsTimer = setInterval(() => {
    const total = counters.hits + counters.misses;
    if (total === 0 && counters.passthrough === 0) return;
    const hitRatio = total > 0 ? ((counters.hits / total) * 100).toFixed(1) : '0.0';
    logger?.info?.(
      `[perf-cache] last 60s: hits=${counters.hits} misses=${counters.misses} ` +
        `(hit-ratio ${hitRatio}%) passthrough=${counters.passthrough} ` +
        `errors=${counters.errors} cache_size=${cache.size}`,
    );
    counters.hits = 0;
    counters.misses = 0;
    counters.passthrough = 0;
    counters.errors = 0;
  }, 60_000);
  // Don't keep the process alive just for this timer.
  statsTimer.unref?.();
}

export default {
  id: 'openclaw-perf-cache',
  name: 'OpenClaw Perf Cache',
  register(api: PluginAPI): void {
    // Install *immediately* on register() — the snapshot lookups happen during
    // plugin loading and per agent turn, both of which need the wrapper in
    // place before they run. There's no `gateway_start` hook on the critical
    // path that fires before those.
    install(api.logger);

    api.on?.('gateway_stop', () => {
      if (statsTimer) {
        clearInterval(statsTimer);
        statsTimer = null;
      }
      // Note: we intentionally DON'T uninstall the fs wrappers on gateway_stop.
      // jiti caches module instances, and uninstalling at stop with reinstall
      // at next start would double-wrap on reload. Leaving the wrappers in
      // place is harmless — the sentinel check in install() guarantees we
      // never wrap twice.
+ }); + }, +}; diff --git a/plugin/openclaw.plugin.json b/plugin/openclaw.plugin.json new file mode 100644 index 0000000..8f43dc5 --- /dev/null +++ b/plugin/openclaw.plugin.json @@ -0,0 +1,15 @@ +{ + "id": "openclaw-perf-cache", + "name": "OpenClaw Perf Cache", + "version": "0.1.0", + "description": "Wraps fs.{statSync,lstatSync,realpathSync,promises.{stat,lstat,realpath}} with a short TTL memo for plugin-tree paths only. Workaround for upstream hashWatchedFiles being called on every plugin-metadata-snapshot cache-validity check (~800 statx/call × ~9 turns/hour on prod = sustained ~15-25% CPU baseline).", + "main": "index.js", + "activation": { + "onStartup": true + }, + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": {} + } +} diff --git a/plugin/package.json b/plugin/package.json new file mode 100644 index 0000000..75052fa --- /dev/null +++ b/plugin/package.json @@ -0,0 +1,12 @@ +{ + "name": "openclaw-perf-cache", + "version": "0.1.0", + "type": "module", + "main": "index.js", + "scripts": { + "build": "tsc -p tsconfig.json" + }, + "devDependencies": { + "typescript": "^5.4.0" + } +} diff --git a/plugin/tsconfig.json b/plugin/tsconfig.json new file mode 100644 index 0000000..d34a7d2 --- /dev/null +++ b/plugin/tsconfig.json @@ -0,0 +1,15 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "NodeNext", + "moduleResolution": "NodeNext", + "lib": ["ES2022"], + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "declaration": false, + "sourceMap": false + }, + "include": ["index.ts"], + "exclude": [] +} diff --git a/scripts/install.mjs b/scripts/install.mjs new file mode 100644 index 0000000..4f0e71f --- /dev/null +++ b/scripts/install.mjs @@ -0,0 +1,111 @@ +#!/usr/bin/env node +/** + * Install / uninstall / update OpenClaw Perf Cache plugin. 
+ * + * Usage: + * node scripts/install.mjs --install # build + copy + register in openclaw.json + * node scripts/install.mjs --uninstall # remove plugin + unregister + * node scripts/install.mjs --update # rebuild + copy (no config touch) + */ +import fs from 'node:fs'; +import path from 'node:path'; +import { execSync } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const projRoot = path.resolve(__dirname, '..'); +const pluginSrcDir = path.join(projRoot, 'plugin'); + +const PLUGIN_ID = 'openclaw-perf-cache'; +const ocDir = path.join(process.env.HOME || '/root', '.openclaw'); +const pluginsDir = path.join(ocDir, 'plugins'); +const installDir = path.join(pluginsDir, PLUGIN_ID); +const configPath = path.join(ocDir, 'openclaw.json'); + +const action = process.argv[2]; +if (!action || !['--install', '--uninstall', '--update'].includes(action)) { + console.log('Usage: node scripts/install.mjs --install | --uninstall | --update'); + process.exit(1); +} + +function build() { + console.log('Building plugin TypeScript...'); + execSync('npx tsc -p tsconfig.json', { cwd: pluginSrcDir, stdio: 'inherit' }); +} + +function copyPluginFiles() { + if (fs.existsSync(installDir)) fs.rmSync(installDir, { recursive: true }); + fs.mkdirSync(installDir, { recursive: true }); + // Copy compiled output + manifest + package.json. Skip the .ts source so + // jiti uses the .js (no transpile overhead at load time) and skip tsconfig. 
+ for (const f of ['index.js', 'openclaw.plugin.json', 'package.json']) { + const src = path.join(pluginSrcDir, f); + const dst = path.join(installDir, f); + if (!fs.existsSync(src)) { + console.error(`missing ${src}; run build first`); + process.exit(1); + } + fs.copyFileSync(src, dst); + } + console.log(`Copied to ${installDir}`); +} + +function readConfig() { + return JSON.parse(fs.readFileSync(configPath, 'utf8')); +} + +function writeConfig(cfg) { + fs.writeFileSync(configPath, JSON.stringify(cfg, null, 2) + '\n', 'utf8'); +} + +function registerInConfig() { + const cfg = readConfig(); + const plugins = (cfg.plugins ??= {}); + const allow = (plugins.allow ??= []); + const loadPaths = ((plugins.load ??= {}).paths ??= []); + const entries = (plugins.entries ??= {}); + + if (!allow.includes(PLUGIN_ID)) allow.push(PLUGIN_ID); + if (!loadPaths.includes(installDir)) loadPaths.push(installDir); + + const entry = (entries[PLUGIN_ID] ??= {}); + if (entry.enabled === undefined) entry.enabled = true; + + writeConfig(cfg); + console.log(`Registered ${PLUGIN_ID} in ${configPath}`); +} + +function unregisterFromConfig() { + const cfg = readConfig(); + const plugins = cfg.plugins ?? {}; + if (Array.isArray(plugins.allow)) { + plugins.allow = plugins.allow.filter((id) => id !== PLUGIN_ID); + } + const loadPaths = plugins.load?.paths; + if (Array.isArray(loadPaths)) { + plugins.load.paths = loadPaths.filter((p) => p !== installDir); + } + if (plugins.entries && plugins.entries[PLUGIN_ID]) { + delete plugins.entries[PLUGIN_ID]; + } + writeConfig(cfg); + console.log(`Unregistered ${PLUGIN_ID} from ${configPath}`); +} + +if (action === '--install') { + build(); + copyPluginFiles(); + registerInConfig(); + console.log('Done. Restart gateway to load: systemctl --user restart openclaw-gateway'); +} else if (action === '--update') { + build(); + copyPluginFiles(); + console.log('Updated. 
Restart gateway to reload: systemctl --user restart openclaw-gateway'); +} else if (action === '--uninstall') { + if (fs.existsSync(installDir)) { + fs.rmSync(installDir, { recursive: true }); + console.log(`Removed ${installDir}`); + } + unregisterFromConfig(); + console.log('Done. Restart gateway to drop: systemctl --user restart openclaw-gateway'); +}