BE-AGT-002: implement Agent status transition service

- New service: app/services/agent_status.py
  - transition_to_busy(): Idle → Busy/OnCall based on slot type
  - transition_to_idle(): Busy/OnCall/Exhausted/Offline → Idle
  - transition_to_offline(): Any → Offline (heartbeat timeout)
  - transition_to_exhausted(): Any → Exhausted (rate-limit/billing)
  - check_heartbeat_timeout(): auto-detect >2min heartbeat gap
  - check_exhausted_recovery(): auto-recover when recovery_at reached
  - record_heartbeat(): update timestamp, recover Offline agents
- Tests: tests/test_agent_status.py (22 test cases)
This commit is contained in:
zhi
2026-04-01 00:46:16 +00:00
parent 6c0959f5bb
commit 70f343fbac
2 changed files with 553 additions and 0 deletions

View File

@@ -0,0 +1,252 @@
"""Agent status transitions — BE-AGT-002.
Implements the state machine for Agent runtime status:
Idle ──→ Busy (woken by a Work slot)
Idle ──→ OnCall (woken by an OnCall slot)
Busy ──→ Idle (task finished / no more pending slots)
OnCall──→ Idle (task finished / no more pending slots)
* ──→ Offline (heartbeat timeout — no heartbeat for > 2 min)
* ──→ Exhausted (API quota / rate-limit error)
Exhausted → Idle (recovery_at reached)
Design reference: NEXT_WAVE_DEV_DIRECTION.md §6.4 (Status transitions)
"""
from __future__ import annotations
from datetime import datetime, timedelta, timezone
from typing import Optional
from sqlalchemy.orm import Session
from app.models.agent import Agent, AgentStatus, ExhaustReason
from app.models.calendar import SlotType
# Heartbeat timeout threshold in seconds (2 minutes per spec §6.4)
HEARTBEAT_TIMEOUT_SECONDS = 120
# Default recovery duration when we can't parse a retry-after header
DEFAULT_RECOVERY_HOURS = 5
# ---------------------------------------------------------------------------
# Transition helpers
# ---------------------------------------------------------------------------
class AgentStatusError(Exception):
"""Raised when a requested status transition is invalid."""
def _assert_current(agent: Agent, *expected: AgentStatus) -> None:
"""Raise if the agent is not in one of the expected statuses."""
if agent.status not in expected:
allowed = ", ".join(s.value for s in expected)
raise AgentStatusError(
f"Agent '{agent.agent_id}' is {agent.status.value}; "
f"expected one of [{allowed}]"
)
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def transition_to_busy(
db: Session,
agent: Agent,
*,
slot_type: SlotType,
now: datetime | None = None,
) -> Agent:
"""Idle → Busy or OnCall depending on *slot_type*.
Parameters
----------
slot_type : SlotType
The type of the slot that triggered the wakeup.
``SlotType.ON_CALL`` → ``AgentStatus.ON_CALL``, everything else
→ ``AgentStatus.BUSY``.
"""
_assert_current(agent, AgentStatus.IDLE)
if slot_type == SlotType.ON_CALL:
agent.status = AgentStatus.ON_CALL
else:
agent.status = AgentStatus.BUSY
if now is None:
now = datetime.now(timezone.utc)
agent.last_heartbeat = now
db.flush()
return agent
def transition_to_idle(
db: Session,
agent: Agent,
*,
now: datetime | None = None,
) -> Agent:
"""Busy / OnCall / Exhausted (recovered) → Idle.
For Exhausted agents this should only be called when ``recovery_at``
has been reached; the caller is responsible for checking that.
"""
_assert_current(
agent,
AgentStatus.BUSY,
AgentStatus.ON_CALL,
AgentStatus.EXHAUSTED,
AgentStatus.OFFLINE,
)
agent.status = AgentStatus.IDLE
# Clear exhausted metadata if transitioning out of Exhausted
agent.exhausted_at = None
agent.recovery_at = None
agent.exhaust_reason = None
if now is None:
now = datetime.now(timezone.utc)
agent.last_heartbeat = now
db.flush()
return agent
def transition_to_offline(
db: Session,
agent: Agent,
) -> Agent:
"""Any status → Offline (heartbeat timeout).
Typically called by a background check that detects
``last_heartbeat`` is older than ``HEARTBEAT_TIMEOUT_SECONDS``.
"""
# Already offline — no-op
if agent.status == AgentStatus.OFFLINE:
return agent
agent.status = AgentStatus.OFFLINE
db.flush()
return agent
def transition_to_exhausted(
db: Session,
agent: Agent,
*,
reason: ExhaustReason,
recovery_at: datetime | None = None,
now: datetime | None = None,
) -> Agent:
"""Any active status → Exhausted (API quota error).
Parameters
----------
reason : ExhaustReason
``RATE_LIMIT`` or ``BILLING``.
recovery_at : datetime, optional
Parsed from retry-after / reset headers. If *None*, defaults to
``now + DEFAULT_RECOVERY_HOURS``.
"""
if now is None:
now = datetime.now(timezone.utc)
agent.status = AgentStatus.EXHAUSTED
agent.exhausted_at = now
agent.exhaust_reason = reason
if recovery_at is not None:
agent.recovery_at = recovery_at
else:
agent.recovery_at = now + timedelta(hours=DEFAULT_RECOVERY_HOURS)
db.flush()
return agent
# ---------------------------------------------------------------------------
# Heartbeat-driven checks
# ---------------------------------------------------------------------------
def check_heartbeat_timeout(
db: Session,
agent: Agent,
*,
now: datetime | None = None,
) -> bool:
"""Mark agent Offline if heartbeat has timed out.
Returns ``True`` if the agent was transitioned to Offline.
"""
if agent.status == AgentStatus.OFFLINE:
return False
if now is None:
now = datetime.now(timezone.utc)
if agent.last_heartbeat is None:
# Never sent a heartbeat — treat as offline
transition_to_offline(db, agent)
return True
elapsed = (now - agent.last_heartbeat).total_seconds()
if elapsed > HEARTBEAT_TIMEOUT_SECONDS:
transition_to_offline(db, agent)
return True
return False
def check_exhausted_recovery(
db: Session,
agent: Agent,
*,
now: datetime | None = None,
) -> bool:
"""Recover an Exhausted agent if ``recovery_at`` has been reached.
Returns ``True`` if the agent was transitioned back to Idle.
"""
if agent.status != AgentStatus.EXHAUSTED:
return False
if now is None:
now = datetime.now(timezone.utc)
if agent.recovery_at is not None and now >= agent.recovery_at:
transition_to_idle(db, agent, now=now)
return True
return False
def record_heartbeat(
db: Session,
agent: Agent,
*,
now: datetime | None = None,
) -> Agent:
"""Update ``last_heartbeat`` timestamp.
If the agent was Offline and a heartbeat arrives, transition back to
Idle (the agent has come back online).
"""
if now is None:
now = datetime.now(timezone.utc)
agent.last_heartbeat = now
if agent.status == AgentStatus.OFFLINE:
agent.status = AgentStatus.IDLE
# Clear any stale exhausted metadata
agent.exhausted_at = None
agent.recovery_at = None
agent.exhaust_reason = None
db.flush()
return agent