HarborForge.Backend: dev-2026-03-29 -> main #13
252
app/services/agent_status.py
Normal file
252
app/services/agent_status.py
Normal file
@@ -0,0 +1,252 @@
|
||||
"""Agent status transitions — BE-AGT-002.
|
||||
|
||||
Implements the state machine for Agent runtime status:
|
||||
|
||||
Idle ──→ Busy (woken by a Work slot)
|
||||
Idle ──→ OnCall (woken by an OnCall slot)
|
||||
Busy ──→ Idle (task finished / no more pending slots)
|
||||
OnCall──→ Idle (task finished / no more pending slots)
|
||||
* ──→ Offline (heartbeat timeout — no heartbeat for > 2 min)
|
||||
* ──→ Exhausted (API quota / rate-limit error)
|
||||
Exhausted → Idle (recovery_at reached)
|
||||
|
||||
Design reference: NEXT_WAVE_DEV_DIRECTION.md §6.4 (Status transitions)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models.agent import Agent, AgentStatus, ExhaustReason
|
||||
from app.models.calendar import SlotType
|
||||
|
||||
# Seconds of heartbeat silence after which an agent is considered Offline
# (2 minutes, per spec §6.4).
HEARTBEAT_TIMEOUT_SECONDS = 2 * 60

# Length of the Exhausted cool-down, in hours, used as a fallback when no
# recovery time could be parsed from a retry-after header.
DEFAULT_RECOVERY_HOURS = 5
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Transition helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class AgentStatusError(Exception):
    """Signals a status transition requested from a state that does not allow it."""
|
||||
|
||||
|
||||
def _assert_current(agent: Agent, *expected: AgentStatus) -> None:
    """Guard a transition by checking the agent's current status.

    Parameters
    ----------
    agent : Agent
        The agent whose ``status`` is inspected.
    *expected : AgentStatus
        The statuses from which the transition is permitted.

    Raises
    ------
    AgentStatusError
        If ``agent.status`` is not one of *expected*.
    """
    if agent.status in expected:
        return

    permitted = ", ".join(status.value for status in expected)
    raise AgentStatusError(
        f"Agent '{agent.agent_id}' is {agent.status.value}; "
        f"expected one of [{permitted}]"
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def transition_to_busy(
    db: Session,
    agent: Agent,
    *,
    slot_type: SlotType,
    now: datetime | None = None,
) -> Agent:
    """Wake an Idle agent into Busy or OnCall, depending on *slot_type*.

    Parameters
    ----------
    db : Session
        Session that is flushed once the agent has been updated.
    agent : Agent
        The agent to wake; must currently be Idle.
    slot_type : SlotType
        The type of the slot that triggered the wakeup.
        ``SlotType.ON_CALL`` maps to ``AgentStatus.ON_CALL``; every other
        slot type maps to ``AgentStatus.BUSY``.
    now : datetime, optional
        Timestamp stored as ``last_heartbeat``; defaults to the current
        UTC time.

    Returns
    -------
    Agent
        The same *agent* instance, after the update.

    Raises
    ------
    AgentStatusError
        If the agent is not Idle.
    """
    _assert_current(agent, AgentStatus.IDLE)

    woken_by_on_call = slot_type == SlotType.ON_CALL
    agent.status = AgentStatus.ON_CALL if woken_by_on_call else AgentStatus.BUSY

    # The wakeup itself counts as a sign of life.
    agent.last_heartbeat = now if now is not None else datetime.now(timezone.utc)

    db.flush()
    return agent
|
||||
|
||||
|
||||
def transition_to_idle(
    db: Session,
    agent: Agent,
    *,
    now: datetime | None = None,
) -> Agent:
    """Return an agent to Idle from Busy, OnCall, Exhausted or Offline.

    For an Exhausted agent this should only be invoked once ``recovery_at``
    has been reached; that check is the caller's responsibility and is not
    performed here.

    Parameters
    ----------
    db : Session
        Session that is flushed once the agent has been updated.
    agent : Agent
        The agent to move to Idle.
    now : datetime, optional
        Timestamp stored as ``last_heartbeat``; defaults to the current
        UTC time.

    Returns
    -------
    Agent
        The same *agent* instance, after the update.

    Raises
    ------
    AgentStatusError
        If the agent is in any status other than the four listed above
        (for example, already Idle).
    """
    _assert_current(
        agent,
        AgentStatus.BUSY,
        AgentStatus.ON_CALL,
        AgentStatus.EXHAUSTED,
        AgentStatus.OFFLINE,
    )

    agent.status = AgentStatus.IDLE

    # The exhausted bookkeeping is reset regardless of the previous status;
    # for agents that were not Exhausted this is simply a no-op reset.
    agent.exhausted_at = agent.recovery_at = agent.exhaust_reason = None

    agent.last_heartbeat = datetime.now(timezone.utc) if now is None else now

    db.flush()
    return agent
|
||||
|
||||
|
||||
def transition_to_offline(
    db: Session,
    agent: Agent,
) -> Agent:
    """Move an agent to Offline from whatever status it is in.

    Intended for the background check that notices ``last_heartbeat`` is
    older than ``HEARTBEAT_TIMEOUT_SECONDS``. Only ``status`` is changed;
    an agent that is already Offline is returned untouched and the session
    is not flushed.

    Returns
    -------
    Agent
        The same *agent* instance.
    """
    if agent.status != AgentStatus.OFFLINE:
        agent.status = AgentStatus.OFFLINE
        db.flush()
    return agent
|
||||
|
||||
|
||||
def transition_to_exhausted(
    db: Session,
    agent: Agent,
    *,
    reason: ExhaustReason,
    recovery_at: datetime | None = None,
    now: datetime | None = None,
) -> Agent:
    """Mark an agent Exhausted after an API quota / rate-limit error.

    The current status is not validated: the transition is applied from
    any status.

    Parameters
    ----------
    db : Session
        Session that is flushed once the agent has been updated.
    agent : Agent
        The agent to mark as Exhausted.
    reason : ExhaustReason
        ``RATE_LIMIT`` or ``BILLING``.
    recovery_at : datetime, optional
        Parsed from retry-after / reset headers. If *None*, defaults to
        ``now + DEFAULT_RECOVERY_HOURS``.
    now : datetime, optional
        Timestamp stored as ``exhausted_at``; defaults to the current
        UTC time.

    Returns
    -------
    Agent
        The same *agent* instance, after the update.
    """
    stamp = now if now is not None else datetime.now(timezone.utc)

    # Fall back to the default cool-down when the caller has no better hint.
    if recovery_at is None:
        recovery_at = stamp + timedelta(hours=DEFAULT_RECOVERY_HOURS)

    agent.status = AgentStatus.EXHAUSTED
    agent.exhausted_at = stamp
    agent.exhaust_reason = reason
    agent.recovery_at = recovery_at

    db.flush()
    return agent
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Heartbeat-driven checks
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def check_heartbeat_timeout(
    db: Session,
    agent: Agent,
    *,
    now: datetime | None = None,
) -> bool:
    """Mark agent Offline if heartbeat has timed out.

    An agent that has never sent a heartbeat (``last_heartbeat`` is
    ``None``) is treated as timed out. Otherwise the agent goes Offline
    when strictly more than ``HEARTBEAT_TIMEOUT_SECONDS`` have elapsed
    since ``last_heartbeat``.

    Parameters
    ----------
    db : Session
        Session passed on to ``transition_to_offline``.
    agent : Agent
        The agent to inspect.
    now : datetime, optional
        Reference time for the check; defaults to the current UTC time.

    Returns
    -------
    bool
        ``True`` if the agent was transitioned to Offline, ``False`` if it
        was already Offline or its heartbeat is still fresh.
    """
    if agent.status == AgentStatus.OFFLINE:
        return False

    last_seen = agent.last_heartbeat
    if last_seen is None:
        # Never sent a heartbeat — treat as offline
        transition_to_offline(db, agent)
        return True

    if now is None:
        now = datetime.now(timezone.utc)

    # Subtracting a naive datetime from an aware one raises TypeError.
    # A timestamp reloaded from a column without timezone support comes
    # back naive, so interpret naive values as UTC — the zone this module
    # uses for every timestamp it writes.
    if last_seen.utcoffset() is None:
        last_seen = last_seen.replace(tzinfo=timezone.utc)
    if now.utcoffset() is None:
        now = now.replace(tzinfo=timezone.utc)

    elapsed = (now - last_seen).total_seconds()
    if elapsed > HEARTBEAT_TIMEOUT_SECONDS:
        transition_to_offline(db, agent)
        return True

    return False
|
||||
|
||||
|
||||
def check_exhausted_recovery(
    db: Session,
    agent: Agent,
    *,
    now: datetime | None = None,
) -> bool:
    """Recover an Exhausted agent if ``recovery_at`` has been reached.

    Agents that are not Exhausted, or that have no ``recovery_at`` set,
    are left untouched.

    Parameters
    ----------
    db : Session
        Session passed on to ``transition_to_idle``.
    agent : Agent
        The agent to inspect.
    now : datetime, optional
        Reference time for the check; defaults to the current UTC time.
        The same value is forwarded to ``transition_to_idle``.

    Returns
    -------
    bool
        ``True`` if the agent was transitioned back to Idle.
    """
    if agent.status != AgentStatus.EXHAUSTED:
        return False

    due = agent.recovery_at
    if due is None:
        return False

    if now is None:
        now = datetime.now(timezone.utc)

    # Ordering comparisons between naive and aware datetimes raise
    # TypeError. A ``recovery_at`` reloaded from a column without timezone
    # support comes back naive, so interpret naive values as UTC — the
    # zone this module uses for every timestamp it writes. Only the
    # comparison operands are normalized; ``now`` is forwarded unchanged.
    current = now if now.utcoffset() is not None else now.replace(tzinfo=timezone.utc)
    if due.utcoffset() is None:
        due = due.replace(tzinfo=timezone.utc)

    if current >= due:
        transition_to_idle(db, agent, now=now)
        return True

    return False
|
||||
|
||||
|
||||
def record_heartbeat(
    db: Session,
    agent: Agent,
    *,
    now: datetime | None = None,
) -> Agent:
    """Store a fresh ``last_heartbeat`` and revive the agent if it was Offline.

    An Offline agent that sends a heartbeat is moved back to Idle and its
    exhausted bookkeeping is reset. Agents in any other status keep that
    status.

    Parameters
    ----------
    db : Session
        Session that is flushed once the agent has been updated.
    agent : Agent
        The agent that sent the heartbeat.
    now : datetime, optional
        Timestamp stored as ``last_heartbeat``; defaults to the current
        UTC time.

    Returns
    -------
    Agent
        The same *agent* instance, after the update.
    """
    agent.last_heartbeat = datetime.now(timezone.utc) if now is None else now

    if agent.status == AgentStatus.OFFLINE:
        # The agent is back online.
        agent.status = AgentStatus.IDLE
        # NOTE(review): an agent that timed out while Exhausted still carries
        # a ``recovery_at`` that may lie in the future; resetting it here
        # lets that agent return to Idle before its cool-down ends. Confirm
        # this is intended.
        agent.exhausted_at = agent.recovery_at = agent.exhaust_reason = None

    db.flush()
    return agent
|
||||
301
tests/test_agent_status.py
Normal file
301
tests/test_agent_status.py
Normal file
@@ -0,0 +1,301 @@
|
||||
"""Tests for Agent status transition service — BE-AGT-002.
|
||||
|
||||
Covers:
|
||||
- Idle → Busy / OnCall
|
||||
- Busy / OnCall → Idle
|
||||
- Heartbeat timeout → Offline
|
||||
- API quota error → Exhausted
|
||||
- Exhausted recovery → Idle
|
||||
- Invalid transition errors
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
from app.models.agent import Agent, AgentStatus, ExhaustReason
|
||||
from app.models.calendar import SlotType
|
||||
from app.services.agent_status import (
|
||||
AgentStatusError,
|
||||
HEARTBEAT_TIMEOUT_SECONDS,
|
||||
DEFAULT_RECOVERY_HOURS,
|
||||
transition_to_busy,
|
||||
transition_to_idle,
|
||||
transition_to_offline,
|
||||
transition_to_exhausted,
|
||||
check_heartbeat_timeout,
|
||||
check_exhausted_recovery,
|
||||
record_heartbeat,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Fixed, timezone-aware reference instant shared by the tests below.
NOW = datetime(year=2026, month=4, day=1, hour=12, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def _make_agent(db, *, status=AgentStatus.IDLE, last_hb=None, **kwargs):
    """Create, flush and return an Agent row owned by a shared test user.

    The owning user (id 99) and its role (id 99) are created on first use
    and reused afterwards. ``agent_id`` may be supplied through *kwargs*
    (default ``"test-agent-001"``); any remaining keyword arguments are
    passed straight to the ``Agent`` constructor.
    """
    from app.models import models
    from app.api.deps import get_password_hash

    # Reuse the shared owner if an earlier call already created it.
    owner = db.query(models.User).filter_by(id=99).first()
    if owner is None:
        # A user needs a role, so make sure that exists first.
        from app.models.role_permission import Role

        owner_role = db.query(Role).filter_by(id=99).first()
        if owner_role is None:
            owner_role = Role(id=99, name="agent_test_role", is_global=False)
            db.add(owner_role)
            db.flush()

        owner = models.User(
            id=99,
            username="agent_user",
            email="agent@test.com",
            hashed_password=get_password_hash("test123"),
            is_admin=False,
            role_id=owner_role.id,
        )
        db.add(owner)
        db.flush()

    # Split the explicit agent_id off from the pass-through column values.
    extra_columns = dict(kwargs)
    agent_id = extra_columns.pop("agent_id", "test-agent-001")

    agent = Agent(
        user_id=owner.id,
        agent_id=agent_id,
        claw_identifier="test-claw",
        status=status,
        last_heartbeat=last_hb,
        **extra_columns,
    )
    db.add(agent)
    db.flush()
    return agent
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Idle → Busy / OnCall
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTransitionToBusy:
    """``transition_to_busy``: Idle agents wake into Busy or OnCall."""

    def test_idle_to_busy_for_work_slot(self, db):
        idle_agent = _make_agent(db, status=AgentStatus.IDLE)

        woken = transition_to_busy(db, idle_agent, slot_type=SlotType.WORK, now=NOW)

        assert woken.status == AgentStatus.BUSY
        assert woken.last_heartbeat == NOW

    def test_idle_to_on_call_for_on_call_slot(self, db):
        idle_agent = _make_agent(db, status=AgentStatus.IDLE)

        woken = transition_to_busy(
            db, idle_agent, slot_type=SlotType.ON_CALL, now=NOW
        )

        assert woken.status == AgentStatus.ON_CALL

    def test_idle_to_busy_for_system_slot(self, db):
        idle_agent = _make_agent(db, status=AgentStatus.IDLE)

        woken = transition_to_busy(
            db, idle_agent, slot_type=SlotType.SYSTEM, now=NOW
        )

        assert woken.status == AgentStatus.BUSY

    def test_idle_to_busy_for_entertainment_slot(self, db):
        idle_agent = _make_agent(db, status=AgentStatus.IDLE)

        woken = transition_to_busy(
            db, idle_agent, slot_type=SlotType.ENTERTAINMENT, now=NOW
        )

        assert woken.status == AgentStatus.BUSY

    def test_busy_to_busy_raises(self, db):
        # Only Idle agents may be woken; the error message names the
        # offending status.
        busy_agent = _make_agent(db, status=AgentStatus.BUSY)

        with pytest.raises(AgentStatusError, match="busy"):
            transition_to_busy(db, busy_agent, slot_type=SlotType.WORK)

    def test_exhausted_to_busy_raises(self, db):
        exhausted_agent = _make_agent(db, status=AgentStatus.EXHAUSTED)

        with pytest.raises(AgentStatusError):
            transition_to_busy(db, exhausted_agent, slot_type=SlotType.WORK)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Busy / OnCall → Idle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTransitionToIdle:
    """``transition_to_idle``: Busy / OnCall / Exhausted / Offline → Idle."""

    def test_busy_to_idle(self, db):
        busy_agent = _make_agent(db, status=AgentStatus.BUSY)

        idled = transition_to_idle(db, busy_agent, now=NOW)

        assert idled.status == AgentStatus.IDLE
        assert idled.last_heartbeat == NOW

    def test_on_call_to_idle(self, db):
        on_call_agent = _make_agent(db, status=AgentStatus.ON_CALL)

        idled = transition_to_idle(db, on_call_agent, now=NOW)

        assert idled.status == AgentStatus.IDLE

    def test_exhausted_to_idle_clears_metadata(self, db):
        exhausted_agent = _make_agent(
            db,
            status=AgentStatus.EXHAUSTED,
            exhausted_at=NOW - timedelta(hours=1),
            recovery_at=NOW,
            exhaust_reason=ExhaustReason.RATE_LIMIT,
        )

        idled = transition_to_idle(db, exhausted_agent, now=NOW)

        assert idled.status == AgentStatus.IDLE
        # All three pieces of exhausted bookkeeping must be wiped.
        assert idled.exhausted_at is None
        assert idled.recovery_at is None
        assert idled.exhaust_reason is None

    def test_offline_to_idle(self, db):
        offline_agent = _make_agent(db, status=AgentStatus.OFFLINE)

        idled = transition_to_idle(db, offline_agent, now=NOW)

        assert idled.status == AgentStatus.IDLE

    def test_idle_to_idle_raises(self, db):
        # Idle is not an accepted source status for this transition.
        idle_agent = _make_agent(db, status=AgentStatus.IDLE)

        with pytest.raises(AgentStatusError, match="idle"):
            transition_to_idle(db, idle_agent)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# * → Offline (heartbeat timeout)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTransitionToOffline:
    """``transition_to_offline``: any status → Offline."""

    def test_idle_to_offline(self, db):
        idle_agent = _make_agent(db, status=AgentStatus.IDLE)

        offlined = transition_to_offline(db, idle_agent)

        assert offlined.status == AgentStatus.OFFLINE

    def test_busy_to_offline(self, db):
        busy_agent = _make_agent(db, status=AgentStatus.BUSY)

        offlined = transition_to_offline(db, busy_agent)

        assert offlined.status == AgentStatus.OFFLINE

    def test_already_offline_noop(self, db):
        # Calling the transition on an Offline agent leaves it Offline.
        offline_agent = _make_agent(db, status=AgentStatus.OFFLINE)

        offlined = transition_to_offline(db, offline_agent)

        assert offlined.status == AgentStatus.OFFLINE
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# * → Exhausted (API quota)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTransitionToExhausted:
    """``transition_to_exhausted``: quota errors move an agent to Exhausted."""

    def test_busy_to_exhausted_with_recovery(self, db):
        explicit_recovery = NOW + timedelta(hours=1)
        busy_agent = _make_agent(db, status=AgentStatus.BUSY)

        exhausted = transition_to_exhausted(
            db,
            busy_agent,
            reason=ExhaustReason.RATE_LIMIT,
            recovery_at=explicit_recovery,
            now=NOW,
        )

        assert exhausted.status == AgentStatus.EXHAUSTED
        assert exhausted.exhausted_at == NOW
        assert exhausted.recovery_at == explicit_recovery
        assert exhausted.exhaust_reason == ExhaustReason.RATE_LIMIT

    def test_exhausted_default_recovery(self, db):
        busy_agent = _make_agent(db, status=AgentStatus.BUSY)

        exhausted = transition_to_exhausted(
            db,
            busy_agent,
            reason=ExhaustReason.BILLING,
            now=NOW,
        )

        # Without an explicit recovery time the default cool-down applies.
        fallback_recovery = NOW + timedelta(hours=DEFAULT_RECOVERY_HOURS)
        assert exhausted.recovery_at == fallback_recovery
        assert exhausted.exhaust_reason == ExhaustReason.BILLING

    def test_idle_to_exhausted(self, db):
        """Edge case: agent gets a rate-limit before even starting work."""
        idle_agent = _make_agent(db, status=AgentStatus.IDLE)

        exhausted = transition_to_exhausted(
            db,
            idle_agent,
            reason=ExhaustReason.RATE_LIMIT,
            now=NOW,
        )

        assert exhausted.status == AgentStatus.EXHAUSTED
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Heartbeat timeout check
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCheckHeartbeatTimeout:
    """``check_heartbeat_timeout``: stale or missing heartbeats → Offline."""

    def test_timeout_triggers_offline(self, db):
        # Ten seconds past the threshold.
        stale_hb = NOW - timedelta(seconds=HEARTBEAT_TIMEOUT_SECONDS + 10)
        agent = _make_agent(db, status=AgentStatus.IDLE, last_hb=stale_hb)

        went_offline = check_heartbeat_timeout(db, agent, now=NOW)

        assert went_offline is True
        assert agent.status == AgentStatus.OFFLINE

    def test_recent_heartbeat_no_change(self, db):
        fresh_hb = NOW - timedelta(seconds=30)
        agent = _make_agent(db, status=AgentStatus.BUSY, last_hb=fresh_hb)

        went_offline = check_heartbeat_timeout(db, agent, now=NOW)

        assert went_offline is False
        assert agent.status == AgentStatus.BUSY

    def test_no_heartbeat_ever_goes_offline(self, db):
        agent = _make_agent(db, status=AgentStatus.IDLE, last_hb=None)

        went_offline = check_heartbeat_timeout(db, agent, now=NOW)

        assert went_offline is True
        assert agent.status == AgentStatus.OFFLINE

    def test_already_offline_returns_false(self, db):
        # No transition happens, so the check reports False.
        agent = _make_agent(db, status=AgentStatus.OFFLINE, last_hb=None)

        went_offline = check_heartbeat_timeout(db, agent, now=NOW)

        assert went_offline is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Exhausted recovery check
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCheckExhaustedRecovery:
    """``check_exhausted_recovery``: Exhausted → Idle once ``recovery_at`` passes."""

    def test_recovery_at_reached(self, db):
        agent = _make_agent(
            db,
            status=AgentStatus.EXHAUSTED,
            exhausted_at=NOW - timedelta(hours=5),
            recovery_at=NOW - timedelta(minutes=1),
            exhaust_reason=ExhaustReason.RATE_LIMIT,
        )

        did_recover = check_exhausted_recovery(db, agent, now=NOW)

        assert did_recover is True
        assert agent.status == AgentStatus.IDLE
        assert agent.exhausted_at is None

    def test_recovery_at_not_yet_reached(self, db):
        agent = _make_agent(
            db,
            status=AgentStatus.EXHAUSTED,
            exhausted_at=NOW,
            recovery_at=NOW + timedelta(hours=1),
            exhaust_reason=ExhaustReason.BILLING,
        )

        did_recover = check_exhausted_recovery(db, agent, now=NOW)

        assert did_recover is False
        assert agent.status == AgentStatus.EXHAUSTED

    def test_non_exhausted_agent_returns_false(self, db):
        agent = _make_agent(db, status=AgentStatus.IDLE)

        did_recover = check_exhausted_recovery(db, agent, now=NOW)

        assert did_recover is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Record heartbeat
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRecordHeartbeat:
    """``record_heartbeat``: timestamp refresh and Offline → Idle revival."""

    def test_updates_timestamp(self, db):
        previous_hb = NOW - timedelta(minutes=1)
        agent = _make_agent(db, status=AgentStatus.IDLE, last_hb=previous_hb)

        updated = record_heartbeat(db, agent, now=NOW)

        assert updated.last_heartbeat == NOW

    def test_offline_agent_recovers_to_idle(self, db):
        agent = _make_agent(db, status=AgentStatus.OFFLINE)

        updated = record_heartbeat(db, agent, now=NOW)

        assert updated.status == AgentStatus.IDLE
        assert updated.last_heartbeat == NOW

    def test_busy_agent_stays_busy(self, db):
        # A heartbeat must not disturb the status of a working agent.
        previous_hb = NOW - timedelta(seconds=30)
        agent = _make_agent(db, status=AgentStatus.BUSY, last_hb=previous_hb)

        updated = record_heartbeat(db, agent, now=NOW)

        assert updated.status == AgentStatus.BUSY
        assert updated.last_heartbeat == NOW
|
||||
Reference in New Issue
Block a user