fix(agent-office): node_monitor 루프 예외 방어 + 테스트 보강 (B2 리뷰)
- per-worker 루프 본문 전체를 try/except로 감싸, 예외(Redis 오류 등) 발생 시 로그를 남기고 redis_ok=False로 설정한 뒤 break (Blocker)
- heartbeat 파싱 except에 UnicodeDecodeError 추가 (Important)
- hb.get('ts') or '' 로 null ts 안전 처리 (Minor)
- 테스트 3개 추가: paused 폴백·processing 집계·llen 예외 회귀
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_019LV86jBozkNhSFXJA412fq
This commit is contained in:
@@ -55,29 +55,34 @@ async def collect_status(redis=None) -> dict:
|
||||
return out
|
||||
|
||||
for w in WORKER_REGISTRY:
|
||||
info = {"name": w["name"], "kind": w["kind"], "alive": False, "state": None,
|
||||
"last_beat_age_s": None, "queue_depth": 0, "dead_letter": 0,
|
||||
"processing": 0, "jobs_done": 0, "jobs_failed": 0, "last_job_at": None}
|
||||
raw = await r.get(f"worker:{w['name']}:heartbeat")
|
||||
if raw:
|
||||
try:
|
||||
hb = json.loads(raw)
|
||||
info.update(alive=True, state=hb.get("state"),
|
||||
jobs_done=hb.get("jobs_done", 0), jobs_failed=hb.get("jobs_failed", 0),
|
||||
last_job_at=hb.get("last_job_at"),
|
||||
last_beat_age_s=_beat_age(hb.get("ts", ""), now))
|
||||
if w["kind"] == "watcher" and hb.get("mode"):
|
||||
out["paused_reason"] = hb["mode"]
|
||||
except json.JSONDecodeError:
|
||||
logger.warning("heartbeat JSON 파싱 실패 name=%s", w["name"])
|
||||
if w["queue"]:
|
||||
info["queue_depth"] = await r.llen(w["queue"])
|
||||
info["dead_letter"] = await r.llen(f"dead_letter:{w['queue']}")
|
||||
proc = 0
|
||||
async for key in r.scan_iter(match=f"processing:{w['queue']}:*"):
|
||||
proc += await r.llen(key)
|
||||
info["processing"] = proc
|
||||
out["workers"].append(info)
|
||||
try:
|
||||
info = {"name": w["name"], "kind": w["kind"], "alive": False, "state": None,
|
||||
"last_beat_age_s": None, "queue_depth": 0, "dead_letter": 0,
|
||||
"processing": 0, "jobs_done": 0, "jobs_failed": 0, "last_job_at": None}
|
||||
raw = await r.get(f"worker:{w['name']}:heartbeat")
|
||||
if raw:
|
||||
try:
|
||||
hb = json.loads(raw)
|
||||
info.update(alive=True, state=hb.get("state"),
|
||||
jobs_done=hb.get("jobs_done", 0), jobs_failed=hb.get("jobs_failed", 0),
|
||||
last_job_at=hb.get("last_job_at"),
|
||||
last_beat_age_s=_beat_age(hb.get("ts") or "", now))
|
||||
if w["kind"] == "watcher" and hb.get("mode"):
|
||||
out["paused_reason"] = hb["mode"]
|
||||
except (json.JSONDecodeError, UnicodeDecodeError):
|
||||
logger.warning("heartbeat JSON 파싱 실패 name=%s", w["name"])
|
||||
if w["queue"]:
|
||||
info["queue_depth"] = await r.llen(w["queue"])
|
||||
info["dead_letter"] = await r.llen(f"dead_letter:{w['queue']}")
|
||||
proc = 0
|
||||
async for key in r.scan_iter(match=f"processing:{w['queue']}:*"):
|
||||
proc += await r.llen(key)
|
||||
info["processing"] = proc
|
||||
out["workers"].append(info)
|
||||
except Exception:
|
||||
logger.exception("워커 상태 수집 실패 name=%s", w["name"])
|
||||
out["redis_ok"] = False
|
||||
break
|
||||
|
||||
for w in out["workers"]:
|
||||
if w["kind"] == "trader":
|
||||
|
||||
Reference in New Issue
Block a user