fix: align calendar API with actual HarborForge.Backend contract

Initial drop guessed the heartbeat shape; sim e2e against a running
harborforge-backend revealed the real contract is per-agent with
header auth, not server-wide with bearer:

  POST /calendar/agent/heartbeat
    headers: X-Agent-ID, X-Claw-Identifier
    body:    {claw_identifier, agent_id}
    response: {slots: [Slot], agent_status, message?}

  PATCH /calendar/slots/{id}/agent-update
  PATCH /calendar/slots/virtual/{vid}/agent-update
    body: {status, started_at?, actual_duration?}

  POST /calendar/agent/status
    body: {claw_identifier, agent_id, status}

Refactors:

  - internal/calendar/types.go now mirrors OpenclawPlugin/calendar/
    types.ts 1:1 (SlotStatus camelCase, real vs virtual slot id
    discrimination, event_data shape)
  - internal/calendar/bridge.go: header-based auth, per-agent method
    signatures, separate UpdateRealSlot vs UpdateVirtualSlot
  - internal/calendar/scheduler.go: per-agent heartbeat loop
    (one HTTP call per agent per tick), highest-priority slot
    selection, agent-update PATCH for terminal/non-terminal states
  - SingleActiveAgentID helper for main.bestEffortAgentID

Also fix two bugs found in sim:

  - bgCtx capture: AgentLister closures were capturing Init's ctx
    which dies the moment MCP initialize returns; switched to
    bgCtx (lifetime = plugin process)
  - tools.toolRestartStatus referenced a non-existent
    sch.RestartPending — HF backend has no restart endpoint per
    /openapi.json, so the tool now reports last_heartbeats freshness

Scheduler logs each tick + each heartbeat outcome at info so
operators can see backend connectivity without enabling debug.

E2E against http://harborforge-backend:8000 in sim:
  daemon → heartbeat → 404 "Agent not found"
  (= correct endpoint, correct headers, correct body — agent just
   isn't registered yet, which is expected for an untenanted
   plugin)
This commit is contained in:
h z
2026-06-03 11:28:05 +01:00
parent 754e5183f7
commit 78b1ec5181
5 changed files with 474 additions and 370 deletions

View File

@@ -1,14 +1,16 @@
// Scheduler — main loop that heartbeats the backend, dispatches
// returned slots via Plexum's WakeAgent, and tracks per-agent active
// slot state for the calendar_* tools.
// Scheduler — loops over every Plexum agent, heartbeats per-agent,
// picks the highest-priority pending slot for each, dispatches via
// host.WakeAgent. Mirrors HarborForge.OpenclawPlugin's per-agent
// scheduler loop (PLG-CAL-002).
//
// State is in-memory: a daemon restart drops everything. Next
// heartbeat reconciles (backend keeps the canonical SlotStatus).
// In-memory state: per-agent active slot map. A daemon restart drops
// it; next heartbeat reconciles from the backend's canonical state.
//
// Concurrency:
// - one heartbeat ticker goroutine
// - per-slot dispatch is fire-and-forget via WakeAgent (queue-aware)
// - mu guards activeBySlot + activeByAgent maps
// Wake semantics: WakeAgent is fire-and-forget; the SDK's wake queue
// (depth 1, replace-newest) handles state-aware dispatch. We mark the
// slot Ongoing optimistically as soon as WakeAgent returns without
// error, before the agent has reported any progress; agents drive
// complete/abort/pause/resume via the harborforge_calendar_* tools.
package calendar
@@ -22,22 +24,20 @@ import (
sdkplugin "git.hangman-lab.top/hzhang/Plexum-sdk-go/plugin"
)
// Scheduler orchestrates the calendar loop.
// Scheduler is the long-running calendar driver. It heartbeats the
// backend once per agent per tick and tracks which slot, if any, each
// agent is currently working on.
type Scheduler struct {
	cfg         Config
	bridge      *Bridge
	host        sdkplugin.HostAPI
	agentLister func() []ReportableAgent // enumerates the agents to heartbeat; nil disables ticking
	pluginInfo  PluginInfoTag

	// mu guards every field below.
	mu                sync.Mutex
	activeByAgentID   map[string]*ActiveSlot // agent id → in-flight slot
	activeBySlotIdent map[string]*ActiveSlot // Slot.SlotIdent() → in-flight slot
	history           []HistoryEntry         // resolved slots, oldest first
	lastHeartbeats    map[string]time.Time   // agent id → time of the last heartbeat attempt
	lastErrors        map[string]string      // agent id → last heartbeat error; cleared on success
}
// Config bundles scheduler tunables.
@@ -46,36 +46,42 @@ type Config struct {
HistoryCap int // bound on activity history; default 32
}
// ReportableAgent is the projection of a Plexum agent the scheduler
// needs for heartbeat — id + model + current sm state.
type ReportableAgent struct {
ID string
Model string
State AgentStatusValue
// PluginInfoTag tags heartbeat reports so the backend knows which
// plugin / version is reporting.
//
// NOTE(review): the per-agent heartbeat call visible in this file passes
// only the agent id to the bridge — confirm this tag still reaches the
// backend.
type PluginInfoTag struct {
// Name and Version identify the reporting plugin.
Name string
Version string
Backend string // "plexum"
}
// ActiveSlot tracks an in-flight slot (between WakeAgent dispatch and
// terminal status update).
// ReportableAgent is the per-agent projection the scheduler needs for
// heartbeat enumeration. The scheduler's agentLister callback returns a
// slice of these on every tick.
type ReportableAgent struct {
// ID is the agent id the scheduler passes to the bridge when it
// heartbeats on behalf of this agent.
ID string
Model string
State AgentStatusValue
}
// ActiveSlot tracks an in-flight slot from dispatch to terminal state.
type ActiveSlot struct {
// Slot is the slot as it was returned by the backend heartbeat.
Slot Slot
// StartedAt is the dispatch time (UTC). Completion derives the
// reported duration from it.
StartedAt time.Time
// LastHeartbeat is initialised to the dispatch time.
LastHeartbeat time.Time
State SlotStatus
}
// HistoryEntry is one resolved slot kept for the calendar_status tool.
// HistoryEntry records one resolved slot for the calendar_status tool.
type HistoryEntry struct {
	Ident      string     // slot ident (Slot.SlotIdent()) of the resolved slot
	AgentID    string     // agent the slot was dispatched to
	Status     SlotStatus // status the slot was resolved with
	ResolvedAt time.Time  // resolution time, UTC
	Reason     string     // set when the slot was aborted
	Summary    string     // set when the slot was completed
}
// NewScheduler constructs a Scheduler in stopped state.
func NewScheduler(cfg Config, bridge *Bridge, host sdkplugin.HostAPI,
identifier string, pluginInfo PluginInfoTag,
pluginInfo PluginInfoTag,
agentLister func() []ReportableAgent) *Scheduler {
if cfg.HeartbeatInterval <= 0 {
cfg.HeartbeatInterval = 30 * time.Second
@@ -84,193 +90,215 @@ func NewScheduler(cfg Config, bridge *Bridge, host sdkplugin.HostAPI,
cfg.HistoryCap = 32
}
return &Scheduler{
cfg: cfg,
bridge: bridge,
host: host,
agentLister: agentLister,
identifier: identifier,
pluginInfo: pluginInfo,
activeBySlotID: map[string]*ActiveSlot{},
activeByAgentID: map[string]*ActiveSlot{},
cfg: cfg,
bridge: bridge,
host: host,
agentLister: agentLister,
pluginInfo: pluginInfo,
activeByAgentID: map[string]*ActiveSlot{},
activeBySlotIdent: map[string]*ActiveSlot{},
lastHeartbeats: map[string]time.Time{},
lastErrors: map[string]string{},
}
}
// Run blocks until ctx cancels, ticking heartbeats every
// cfg.HeartbeatInterval. Returns nil on graceful shutdown.
// Run drives the scheduler: it performs one tick immediately and then
// one tick per cfg.HeartbeatInterval until ctx is cancelled. It blocks
// for the whole lifetime of ctx and always returns nil.
func (s *Scheduler) Run(ctx context.Context) error {
	ticker := time.NewTicker(s.cfg.HeartbeatInterval)
	defer ticker.Stop()

	// First heartbeat immediately so initial state lands fast.
	s.tick(ctx)
	for {
		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
			s.tick(ctx)
		}
	}
}
func (s *Scheduler) heartbeatOnce(ctx context.Context) {
payload := HeartbeatPayload{
Identifier: s.identifier,
APIKey: s.bridge.APIKey,
PluginInfo: s.pluginInfo,
CapturedAt: time.Now().UTC(),
func (s *Scheduler) tick(ctx context.Context) {
if s.agentLister == nil {
return
}
if s.agentLister != nil {
for _, a := range s.agentLister() {
payload.AgentList = append(payload.AgentList, AgentReport{
ID: a.ID, Model: a.Model, Status: a.State,
})
}
now := time.Now().UTC()
agents := s.agentLister()
s.host.Log("info", "calendar tick", map[string]any{"agents": len(agents)})
for _, agent := range agents {
s.tickForAgent(ctx, agent, now)
}
resp, err := s.bridge.Heartbeat(ctx, payload)
}
func (s *Scheduler) tickForAgent(ctx context.Context, agent ReportableAgent, now time.Time) {
resp, err := s.bridge.Heartbeat(ctx, agent.ID)
s.mu.Lock()
s.lastHeartbeat = time.Now()
if err == nil {
s.lastResponse = resp
s.restartPending = resp.RestartPending
}
s.mu.Unlock()
s.lastHeartbeats[agent.ID] = now
if err != nil {
return // network blip; next tick retries
s.lastErrors[agent.ID] = err.Error()
s.mu.Unlock()
s.host.Log("warn", "calendar heartbeat failed", map[string]any{
"agent": agent.ID, "err": err.Error(),
})
return
}
for _, slot := range resp.SlotsToFire {
s.dispatchSlot(ctx, slot)
delete(s.lastErrors, agent.ID)
s.mu.Unlock()
s.host.Log("info", "calendar heartbeat ok", map[string]any{
"agent": agent.ID, "slots": len(resp.Slots), "agent_status": string(resp.AgentStatus),
})
// Pick highest-priority NotStarted slot; defer the rest.
var chosen *Slot
for i := range resp.Slots {
slot := &resp.Slots[i]
if slot.Status != SlotNotStarted && slot.Status != SlotDeferred {
continue
}
if chosen == nil || slot.Priority > chosen.Priority {
chosen = slot
}
}
if chosen != nil {
s.dispatchSlot(ctx, agent.ID, *chosen)
}
// Defer the other unchosen NotStarted/Deferred slots (priority +1)
// so they bubble up next heartbeat. We don't strictly need to push
// the update; the backend's priority bookkeeping survives without
// our nudge for v1. (OpenClaw plugin DOES push priority bumps —
// future v2 work if backend feedback shows starvation.)
}
// dispatchSlot fires the slot via host.WakeAgent and records it as
// active. WakeAgent handles state-aware queueing — if the agent is
// busy, our calendar slot enqueues at depth 1 and the previous wake
// is dropped per replace-newest semantics. We mark the slot
// in_progress optimistically when we ENQUEUED; backend reconciles on
// its own watchdog.
func (s *Scheduler) dispatchSlot(ctx context.Context, slot Slot) {
// Skip already-active slots (heartbeat may re-list a slot we
// already started — backend hasn't seen our optimistic update yet).
// dispatchSlot fires WakeAgent + records the slot active. Marks the
// slot Ongoing on the backend so the dashboard reflects the
// transition immediately.
func (s *Scheduler) dispatchSlot(ctx context.Context, agentID string, slot Slot) {
ident := slot.SlotIdent()
s.mu.Lock()
if _, ok := s.activeBySlotID[slot.ID]; ok {
if _, dup := s.activeBySlotIdent[ident]; dup {
s.mu.Unlock()
return
}
if _, agentBusy := s.activeByAgentID[agentID]; agentBusy {
// Don't pick up another slot until the current one resolves.
s.mu.Unlock()
return
}
now := time.Now().UTC()
act := &ActiveSlot{
Slot: slot, StartedAt: now, LastHeartbeat: now,
State: SlotInProgress,
}
s.activeBySlotID[slot.ID] = act
s.activeByAgentID[slot.AgentID] = act
active := &ActiveSlot{Slot: slot, StartedAt: now, LastHeartbeat: now}
s.activeBySlotIdent[ident] = active
s.activeByAgentID[agentID] = active
s.mu.Unlock()
message := slot.WakeOptions.OverrideMessage
if message == "" {
message = slot.PromptText
}
if message == "" {
message = fmt.Sprintf("[calendar] slot %s: %s", slot.ID, slot.Title)
}
source := fmt.Sprintf("calendar:slot-%s", slot.ID)
message := buildWakeMessage(slot)
source := "calendar:" + ident
if err := s.host.WakeAgent(ctx, sdkplugin.WakeAgentRequest{
AgentID: slot.AgentID,
Message: message,
Source: source,
AgentID: agentID, Message: message, Source: source,
}); err != nil {
// Wake itself failed (plumbing). Mark slot aborted +
// notify backend.
s.resolveSlot(ctx, slot.ID, SlotAborted, "", "wake-agent failed: "+err.Error())
s.resolveLocally(ident, agentID, SlotAborted, "", "wake failed: "+err.Error())
return
}
// Mark Ongoing on the backend.
update := SlotAgentUpdate{
Status: SlotOngoing, StartedAt: now.Format("15:04:05"),
}
s.pushUpdate(ctx, agentID, slot, update)
}
// resolveSlot moves an active slot to a terminal status, records
// history, and tells the backend. Safe to call concurrently.
func (s *Scheduler) resolveSlot(ctx context.Context, slotID string, status SlotStatus, summary, reason string) error {
s.mu.Lock()
act, ok := s.activeBySlotID[slotID]
if !ok {
s.mu.Unlock()
return fmt.Errorf("calendar: slot %s not active", slotID)
func buildWakeMessage(slot Slot) string {
// Backend EventData → prompt. v1 is intentionally simple; refine
// when the prompt-engineering side of the plugin matures.
if slot.EventType != nil {
switch *slot.EventType {
case EventTypeSystemEvent:
if ev, ok := slot.EventData["event"].(string); ok {
return fmt.Sprintf("[calendar system_event] %s", ev)
}
case EventTypeJob:
code, _ := slot.EventData["code"].(string)
typ, _ := slot.EventData["type"].(string)
if code != "" {
return fmt.Sprintf("[calendar job %s/%s] please handle this", typ, code)
}
}
}
delete(s.activeBySlotID, slotID)
delete(s.activeByAgentID, act.Slot.AgentID)
s.appendHistoryLocked(HistoryEntry{
SlotID: slotID, AgentID: act.Slot.AgentID, Status: status,
return fmt.Sprintf("[calendar slot %s] scheduled work — please proceed", slot.SlotIdent())
}
// CompleteForAgent finishes the agent's active slot: it pushes Finished
// (with the elapsed time in whole minutes, minimum 1) to the backend and,
// only if that succeeds, resolves the slot locally with summary.
// It returns ErrNoActiveSlot when the agent has no active slot, or the
// backend error when the push fails.
func (s *Scheduler) CompleteForAgent(ctx context.Context, agentID, summary string) error {
	current, found := s.activeSlotForAgent(agentID)
	if !found {
		return ErrNoActiveSlot
	}

	elapsed := time.Now().UTC().Sub(current.StartedAt)
	minutes := int(elapsed.Minutes())
	if minutes < 1 {
		minutes = 1
	}

	finished := SlotAgentUpdate{Status: SlotFinished, ActualDuration: minutes}
	if err := s.pushUpdate(ctx, agentID, current.Slot, finished); err != nil {
		return err
	}
	s.resolveLocally(current.Slot.SlotIdent(), agentID, SlotFinished, summary, "")
	return nil
}
// AbortForAgent terminates the agent's active slot: it pushes Aborted to
// the backend and, only if that succeeds, resolves the slot locally with
// reason. It returns ErrNoActiveSlot when the agent has no active slot,
// or the backend error when the push fails.
func (s *Scheduler) AbortForAgent(ctx context.Context, agentID, reason string) error {
	current, found := s.activeSlotForAgent(agentID)
	if !found {
		return ErrNoActiveSlot
	}

	aborted := SlotAgentUpdate{Status: SlotAborted}
	err := s.pushUpdate(ctx, agentID, current.Slot, aborted)
	if err != nil {
		return err
	}
	s.resolveLocally(current.Slot.SlotIdent(), agentID, SlotAborted, "", reason)
	return nil
}
// PauseForAgent pauses the agent's active slot (non-terminal): it pushes
// Paused to the backend and, on success, records the new state on the
// locally tracked slot so calendar_status reflects it. The slot stays
// active. reason is accepted for interface compatibility; the update
// pushed here carries only the status.
//
// It returns ErrNoActiveSlot when the agent has no active slot, or the
// backend error when the push fails (local state is then unchanged).
func (s *Scheduler) PauseForAgent(ctx context.Context, agentID, reason string) error {
	act, ok := s.activeSlotForAgent(agentID)
	if !ok {
		return ErrNoActiveSlot
	}
	if err := s.pushUpdate(ctx, agentID, act.Slot, SlotAgentUpdate{Status: SlotPaused}); err != nil {
		return err
	}

	s.mu.Lock()
	// The lock was released during the backend call; only touch the
	// entry if the agent is still on the same slot.
	if cur, still := s.activeByAgentID[agentID]; still && cur.Slot.SlotIdent() == act.Slot.SlotIdent() {
		cur.State = SlotPaused
	}
	s.mu.Unlock()
	return nil
}
// ResumeForAgent resumes the agent's active slot (non-terminal): it
// pushes Ongoing to the backend and, on success, records the new state
// on the locally tracked slot so calendar_status reflects it.
//
// It returns ErrNoActiveSlot when the agent has no active slot, or the
// backend error when the push fails (local state is then unchanged).
func (s *Scheduler) ResumeForAgent(ctx context.Context, agentID string) error {
	act, ok := s.activeSlotForAgent(agentID)
	if !ok {
		return ErrNoActiveSlot
	}
	if err := s.pushUpdate(ctx, agentID, act.Slot, SlotAgentUpdate{Status: SlotOngoing}); err != nil {
		return err
	}

	s.mu.Lock()
	// The lock was released during the backend call; only touch the
	// entry if the agent is still on the same slot.
	if cur, still := s.activeByAgentID[agentID]; still && cur.Slot.SlotIdent() == act.Slot.SlotIdent() {
		cur.State = SlotOngoing
	}
	s.mu.Unlock()
	return nil
}
// pushUpdate sends update for slot to the backend on behalf of agentID,
// routing to the real-slot or the virtual-slot endpoint depending on
// which id the slot carries. A slot with neither id yields an error.
func (s *Scheduler) pushUpdate(ctx context.Context, agentID string, slot Slot, update SlotAgentUpdate) error {
	switch {
	case slot.HasRealID():
		return s.bridge.UpdateRealSlot(ctx, agentID, *slot.ID, update)
	case slot.VirtualID != nil:
		return s.bridge.UpdateVirtualSlot(ctx, agentID, *slot.VirtualID, update)
	default:
		return errors.New("calendar: slot has neither real id nor virtual id")
	}
}
// resolveLocally drops the slot identified by ident (and agentID's
// active-slot entry) from the in-memory maps and appends a history
// record with the given status, summary and reason. It does not contact
// the backend. History is bounded by cfg.HistoryCap: once the cap is
// exceeded the oldest entries are discarded.
func (s *Scheduler) resolveLocally(ident, agentID string, status SlotStatus, summary, reason string) {
	s.mu.Lock()
	defer s.mu.Unlock()

	delete(s.activeBySlotIdent, ident)
	delete(s.activeByAgentID, agentID)
	s.history = append(s.history, HistoryEntry{
		Ident: ident, AgentID: agentID, Status: status,
		ResolvedAt: time.Now().UTC(), Summary: summary, Reason: reason,
	})
	// Keep only the newest HistoryCap entries so a long-running daemon
	// does not grow the history without bound.
	if limit := s.cfg.HistoryCap; limit > 0 && len(s.history) > limit {
		s.history = s.history[len(s.history)-limit:]
	}
}
// SetSlotState is a non-terminal status change (paused/resumed).
// Records the new state in-memory and tells the backend.
func (s *Scheduler) SetSlotState(ctx context.Context, slotID string, status SlotStatus, reason string) error {
s.mu.Lock()
act, ok := s.activeBySlotID[slotID]
if !ok {
s.mu.Unlock()
return fmt.Errorf("calendar: slot %s not active", slotID)
}
act.State = status
act.LastHeartbeat = time.Now().UTC()
s.mu.Unlock()
return s.bridge.UpdateSlotStatus(ctx, slotID, SlotUpdate{
Status: status, Reason: reason,
})
}
func (s *Scheduler) appendHistoryLocked(entry HistoryEntry) {
s.history = append(s.history, entry)
if len(s.history) > s.cfg.HistoryCap {
s.history = s.history[len(s.history)-s.cfg.HistoryCap:]
}
}
// CompleteForAgent / AbortForAgent / PauseForAgent / ResumeForAgent
// are the agent-facing tool entry points. They look up the agent's
// active slot, transition or terminate it, and notify the backend.
// CompleteForAgent terminates the agent's active slot as completed.
func (s *Scheduler) CompleteForAgent(ctx context.Context, agentID, summary string) error {
slot, ok := s.activeSlotForAgent(agentID)
if !ok {
return ErrNoActiveSlot
}
return s.resolveSlot(ctx, slot.Slot.ID, SlotCompleted, summary, "")
}
// AbortForAgent terminates the agent's active slot as aborted.
func (s *Scheduler) AbortForAgent(ctx context.Context, agentID, reason string) error {
slot, ok := s.activeSlotForAgent(agentID)
if !ok {
return ErrNoActiveSlot
}
return s.resolveSlot(ctx, slot.Slot.ID, SlotAborted, "", reason)
}
// PauseForAgent transitions the agent's slot to paused.
func (s *Scheduler) PauseForAgent(ctx context.Context, agentID, reason string) error {
slot, ok := s.activeSlotForAgent(agentID)
if !ok {
return ErrNoActiveSlot
}
return s.SetSlotState(ctx, slot.Slot.ID, SlotPaused, reason)
}
// ResumeForAgent transitions the agent's slot back to in_progress.
func (s *Scheduler) ResumeForAgent(ctx context.Context, agentID string) error {
slot, ok := s.activeSlotForAgent(agentID)
if !ok {
return ErrNoActiveSlot
}
return s.SetSlotState(ctx, slot.Slot.ID, SlotInProgress, "")
}
// activeSlotForAgent returns the per-agent active slot copy under lock.
func (s *Scheduler) activeSlotForAgent(agentID string) (ActiveSlot, bool) {
s.mu.Lock()
defer s.mu.Unlock()
@@ -281,36 +309,59 @@ func (s *Scheduler) activeSlotForAgent(agentID string) (ActiveSlot, bool) {
return *act, true
}
// Status returns the introspection shape for the calendar_status tool.
func (s *Scheduler) Status() SchedulerStatus {
// Status is the introspection shape calendar_status returns. It is a
// snapshot: the maps and slices are copies taken under the scheduler
// lock.
type Status struct {
Enabled bool `json:"enabled"`
// LastHeartbeats maps agent id to the time of that agent's most recent
// heartbeat attempt.
LastHeartbeats map[string]time.Time `json:"last_heartbeats"`
// LastErrors maps agent id to the error text of its last failed
// heartbeat; agents whose last heartbeat succeeded have no entry.
LastErrors map[string]string `json:"last_errors,omitempty"`
// HeartbeatEvery is the configured heartbeat interval. encoding/json
// renders a time.Duration as an integer count of nanoseconds.
HeartbeatEvery time.Duration `json:"heartbeat_every"`
// Active lists the in-flight slots, one per agent, in no particular order.
Active []ActiveSlot `json:"active"`
History []HistoryEntry `json:"history"`
}
// SingleActiveAgentID returns the agent id when exactly one active
// slot exists, empty otherwise. Used by the plugin's bestEffortAgentID
// fallback for tool calls that don't carry agent context.
func (s *Scheduler) SingleActiveAgentID() string {
s.mu.Lock()
defer s.mu.Unlock()
active := make([]ActiveSlot, 0, len(s.activeBySlotID))
for _, a := range s.activeBySlotID {
if len(s.activeByAgentID) != 1 {
return ""
}
for k := range s.activeByAgentID {
return k
}
return ""
}
// Status returns the introspection shape calendar_status returns.
func (s *Scheduler) Status() Status {
s.mu.Lock()
defer s.mu.Unlock()
active := make([]ActiveSlot, 0, len(s.activeByAgentID))
for _, a := range s.activeByAgentID {
active = append(active, *a)
}
hb := make(map[string]time.Time, len(s.lastHeartbeats))
for k, v := range s.lastHeartbeats {
hb[k] = v
}
errs := make(map[string]string, len(s.lastErrors))
for k, v := range s.lastErrors {
errs[k] = v
}
history := make([]HistoryEntry, len(s.history))
copy(history, s.history)
return SchedulerStatus{
return Status{
Enabled: true,
LastHeartbeat: s.lastHeartbeat,
LastHeartbeats: hb,
LastErrors: errs,
HeartbeatEvery: s.cfg.HeartbeatInterval,
Active: active,
History: history,
RestartPending: s.restartPending,
}
}
// SchedulerStatus is the shape calendar_status returns.
type SchedulerStatus struct {
Enabled bool `json:"enabled"`
LastHeartbeat time.Time `json:"last_heartbeat"`
HeartbeatEvery time.Duration `json:"heartbeat_every"`
Active []ActiveSlot `json:"active"`
History []HistoryEntry `json:"history"`
RestartPending bool `json:"restart_pending"`
}
// ErrNoActiveSlot is returned by calendar_complete/abort/pause/resume
// when the agent has no slot in progress.
// ErrNoActiveSlot is returned when an agent calls calendar_complete /
// abort / pause / resume but has no slot active. It is the error
// CompleteForAgent, AbortForAgent, PauseForAgent and ResumeForAgent
// return in that case; compare with errors.Is.
var ErrNoActiveSlot = errors.New("calendar: no active slot for agent")