Files
web-page-backend/agent-office/app/node_monitor.py
gahusb ea1f0d103d fix(agent-office): node_monitor 루프 예외 방어 + 테스트 보강 (B2 리뷰)
- per-worker 루프 전체를 try/except로 감싸 Redis 예외 시 redis_ok=False+break (Blocker)
- heartbeat 파싱 except에 UnicodeDecodeError 추가 (Important)
- hb.get('ts') or '' 로 null ts 안전 처리 (Minor)
- 테스트 3개 추가: paused 폴백·processing 집계·llen 예외 회귀

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_019LV86jBozkNhSFXJA412fq
2026-06-29 17:56:18 +09:00

97 lines
3.9 KiB
Python

"""분산 워커 상태 집계 (read-only). Global Constraints 계약 2 스키마 생성."""
from __future__ import annotations
import datetime as dt, json, logging
import redis.asyncio as aioredis
from .config import REDIS_URL
logger = logging.getLogger("agent-office.node_monitor")
# Static list of the workers this monitor reports on.
#   name  - used to build the heartbeat key ``worker:<name>:heartbeat``.
#   kind  - "render" and "trader" workers each produce a link entry;
#           a "watcher" heartbeat may carry the pause reason.
#   queue - Redis list key whose lengths are reported, or None when the
#           worker has no queue counters to collect.
WORKER_REGISTRY = [
    # Render workers, each fed from its own Redis queue.
    {"name": "music-render", "kind": "render", "queue": "queue:music-render"},
    {"name": "video-render", "kind": "render", "queue": "queue:video-render"},
    {"name": "image-render", "kind": "render", "queue": "queue:image-render"},
    {"name": "insta-render", "kind": "render", "queue": "queue:insta-render"},
    # Workers without a queue: only their heartbeat is inspected.
    {"name": "task-watcher", "kind": "watcher", "queue": None},
    {"name": "ai_trade", "kind": "trader", "queue": None},
]
# Module-wide Redis client, created on the first _get_redis() call.
_redis = None


def _get_redis():
    """Return the shared async Redis client, building it on first use.

    The client is created from ``REDIS_URL`` with ``decode_responses=False``,
    so values read through it come back as ``bytes``.
    """
    global _redis
    if _redis is not None:
        return _redis
    _redis = aioredis.from_url(REDIS_URL, decode_responses=False)
    return _redis
def _beat_age(ts_str, now):
    """Return how many whole seconds ago a heartbeat timestamp was written.

    Args:
        ts_str: ISO-8601 timestamp string; every ``Z`` is rewritten to
            ``+00:00`` before parsing.
        now: Reference ``datetime`` the age is measured against.

    Returns:
        The non-negative age in seconds (a timestamp later than ``now`` is
        clamped to 0), or ``None`` when ``ts_str`` is not a string, cannot be
        parsed, or cannot be subtracted from ``now`` (for example a timestamp
        without a UTC offset against a timezone-aware ``now``).
    """
    try:
        # Rewrite "Z" to an explicit offset before handing it to fromisoformat().
        beat = dt.datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
        return max(0, int((now - beat).total_seconds()))
    except (AttributeError, TypeError, ValueError, OverflowError):
        # AttributeError: ts_str has no .replace (None, numbers, ...).
        # ValueError: not an ISO-8601 string.
        # TypeError: wrong argument types, or naive/aware datetime mismatch.
        # Only these expected bad-input failures are mapped to None.
        return None
def _render_link_status(w):
    """Translate a render worker's collected info into a link status label.

    Precedence: a worker that is not alive is ``"down"``; an alive worker whose
    state is ``"paused"`` is ``"paused"``; an alive worker with any dead-letter
    entries is ``"degraded"``; otherwise it is ``"healthy"``.
    """
    if w["alive"]:
        if w["state"] == "paused":
            return "paused"
        return "degraded" if w["dead_letter"] > 0 else "healthy"
    return "down"
def _parse_heartbeat(raw, name):
    """Decode one raw heartbeat payload; return the JSON object, or None if unusable."""
    try:
        hb = json.loads(raw)
    except (json.JSONDecodeError, UnicodeDecodeError):
        hb = None
    # Valid JSON that is not an object (``null``, a list, a number, ...) has no
    # ``.get``. Treat it as unparsable so it is not misreported as a Redis
    # failure by the caller's catch-all handler.
    if not isinstance(hb, dict):
        logger.warning("heartbeat JSON 파싱 실패 name=%s", name)
        return None
    return hb


async def _queue_counters(r, queue):
    """Return (queue_depth, dead_letter, processing) list lengths for one queue."""
    depth = await r.llen(queue)
    dead = await r.llen(f"dead_letter:{queue}")
    processing = 0
    # Sum the length of every key matching ``processing:<queue>:*``.
    async for key in r.scan_iter(match=f"processing:{queue}:*"):
        processing += await r.llen(key)
    return depth, dead, processing


def _build_links(workers):
    """Derive the link entries for the collected trader and render workers."""
    links = []
    for w in workers:
        if w["kind"] == "trader":
            links.append({"from": "ai_trade", "to": "nas-stock", "type": "http-pull",
                          "status": "healthy" if w["alive"] else "down"})
        elif w["kind"] == "render":
            links.append({"from": "nas", "to": w["name"], "type": "redis-queue",
                          "status": _render_link_status(w)})
    return links


async def collect_status(redis=None) -> dict:
    """Collect a read-only status snapshot of every worker in WORKER_REGISTRY.

    Args:
        redis: Async Redis client to read from. When ``None`` the shared
            module client from ``_get_redis()`` is used.

    Returns:
        A dict with these keys:
          - ``redis_ok``: ``False`` if any Redis call raised; collection stops
            at that point and the remaining workers are not reported.
          - ``paused``: ``True`` when the ``queue:paused`` key equals ``b"1"``.
          - ``paused_reason``: the ``mode`` of a watcher heartbeat if it has
            one; otherwise ``"trading"`` when paused; otherwise ``None``.
          - ``generated_at``: UTC time of the snapshot (``...Z`` format).
          - ``workers``: one info dict per successfully collected worker.
          - ``links``: link entries derived from trader and render workers.

    The function does not raise on Redis errors; they are logged and signalled
    through ``redis_ok``.
    """
    # Explicit None check so a caller-supplied client is never replaced just
    # because it happens to be falsy.
    r = redis if redis is not None else _get_redis()
    now = dt.datetime.now(dt.timezone.utc)
    out = {"redis_ok": True, "paused": False, "paused_reason": None,
           "generated_at": now.strftime("%Y-%m-%dT%H:%M:%SZ"),
           "workers": [], "links": []}
    try:
        out["paused"] = (await r.get("queue:paused")) == b"1"
    except Exception:
        # First Redis access failed: report it and skip worker collection.
        logger.exception("redis 접근 실패")
        out["redis_ok"] = False
        return out
    for w in WORKER_REGISTRY:
        try:
            info = {"name": w["name"], "kind": w["kind"], "alive": False, "state": None,
                    "last_beat_age_s": None, "queue_depth": 0, "dead_letter": 0,
                    "processing": 0, "jobs_done": 0, "jobs_failed": 0, "last_job_at": None}
            raw = await r.get(f"worker:{w['name']}:heartbeat")
            hb = _parse_heartbeat(raw, w["name"]) if raw else None
            # ``is not None`` (not truthiness): an empty JSON object still
            # counts as a heartbeat, so the worker is alive.
            if hb is not None:
                info.update(alive=True, state=hb.get("state"),
                            jobs_done=hb.get("jobs_done", 0),
                            jobs_failed=hb.get("jobs_failed", 0),
                            last_job_at=hb.get("last_job_at"),
                            last_beat_age_s=_beat_age(hb.get("ts") or "", now))
                if w["kind"] == "watcher" and hb.get("mode"):
                    out["paused_reason"] = hb["mode"]
            if w["queue"]:
                (info["queue_depth"],
                 info["dead_letter"],
                 info["processing"]) = await _queue_counters(r, w["queue"])
            out["workers"].append(info)
        except Exception:
            # Any failure here is treated as a Redis failure: flag it and stop,
            # keeping the workers collected so far.
            logger.exception("워커 상태 수집 실패 name=%s", w["name"])
            out["redis_ok"] = False
            break
    out["links"].extend(_build_links(out["workers"]))
    if out["paused"] and not out["paused_reason"]:
        out["paused_reason"] = "trading"
    return out