Files
ai-trade/services/insta-render/worker.py
gahusb 6774067505 fix(insta-render): 큐 연결 socket_timeout=30 (None→30 교정)
근본원인 실험 확정: redis-py 블로킹 read에서 socket_timeout이 BLMOVE 블록(5s)
이하/None이면 read_timeout 경계 경합으로 간헐 "Timeout reading" → dequeue 실패
→ 슬레이트 draft 정지. socket_timeout 10/30은 모든 실험에서 안정. 블록보다 큰
30으로 명시(직전 None 커밋은 단독 테스트만 통과시켜 오도 — 재사용 패턴서 깨짐).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 03:17:34 +09:00

144 lines
5.4 KiB
Python

"""Redis ReliableQueue worker — F6 신뢰성 패턴 (BLMOVE + ack/fail + recovery).
queue:paused가 set이면 대기 (task-watcher가 박재오 활동 감지 시 set).
"""
from __future__ import annotations
import asyncio
import logging
import os
from typing import Any
import httpx
import redis.asyncio as aioredis
from card_renderer import render_slate
from _shared.reliable_queue import ReliableQueue
logger = logging.getLogger(__name__)
REDIS_URL = os.getenv("REDIS_URL", "redis://192.168.45.54:6379")
NAS_BASE_URL = os.getenv("NAS_BASE_URL", "http://192.168.45.54:18700")
INTERNAL_API_KEY = os.getenv("INTERNAL_API_KEY", "")
INSTA_MEDIA_URL_PREFIX = os.getenv("INSTA_MEDIA_URL_PREFIX", "/media/insta")
QUEUE_KEY = "queue:insta-render"
PAUSED_KEY = "queue:paused"
async def _post_update(client: httpx.AsyncClient, task_id: str, status: str, progress: int,
result_path: str | None = None, error: str | None = None) -> None:
"""NAS internal webhook 호출."""
url = f"{NAS_BASE_URL}/api/internal/insta/update"
payload: dict[str, Any] = {"task_id": task_id, "status": status, "progress": progress}
if result_path:
payload["result_path"] = result_path
if error:
payload["error"] = error
try:
r = await client.post(
url,
headers={"X-Internal-Key": INTERNAL_API_KEY},
json=payload,
timeout=10.0,
)
if r.status_code != 200:
logger.error("webhook %s returned %d: %s", task_id, r.status_code, r.text[:200])
except Exception:
logger.exception("webhook %s 호출 실패", task_id)
async def _fetch_slate(client: httpx.AsyncClient, slate_id: int) -> dict:
"""NAS /api/insta/slates/{id} GET. (인증 불필요 — 기존 endpoint)"""
r = await client.get(f"{NAS_BASE_URL}/api/insta/slates/{slate_id}", timeout=10.0)
r.raise_for_status()
return r.json()
async def _process_one(client: httpx.AsyncClient, payload: dict) -> None:
"""단일 작업 처리: fetch slate → render → webhook. 예외 발생 시 webhook(failed) 호출 후 raise.
F6: webhook 통신 외 예외는 poll_once가 fail(raw, payload)로 retry/dead-letter 처리.
"""
task_id = payload["task_id"]
params = payload.get("params", {})
slate_id = params.get("slate_id")
theme = params.get("theme", "default")
template = f"{theme}/card.html.j2"
try:
await _post_update(client, task_id, "processing", 20)
slate = await _fetch_slate(client, slate_id)
await _post_update(client, task_id, "processing", 50)
paths = await render_slate(slate, slate_id, template=template)
first_url = f"{INSTA_MEDIA_URL_PREFIX}/{slate_id}/01.png"
await _post_update(
client, task_id, "succeeded", 100, result_path=first_url
)
logger.info("rendered task=%s slate=%s count=%d", task_id, slate_id, len(paths))
except Exception as e:
logger.exception("render task=%s 실패", task_id)
await _post_update(client, task_id, "failed", 0, error=str(e))
raise
async def poll_once(queue: ReliableQueue, client: httpx.AsyncClient) -> bool:
"""1 cycle: dequeue → _process_one → ack/fail. Returns True if a job handled."""
result = await queue.dequeue(timeout=5)
if result is None:
return False
payload, raw = result
try:
await _process_one(client, payload)
except Exception:
await queue.fail(raw, payload)
return True
await queue.ack(raw)
return True
# 블로킹 dequeue는 BLMOVE(블록 5s)를 쓴다. redis-py 블로킹 read에서 socket_timeout이
# 블록(5s) 이하이거나 None이면 read-timeout이 블록 경계와 경합해 간헐적으로
# "Timeout reading"이 터져 잡을 못 꺼낸다(슬레이트 draft 정지). 실험상 socket_timeout이
# 블록보다 충분히 크면(10/30) 항상 안정. → 블록보다 넉넉히 큰 값을 명시한다.
QUEUE_SOCKET_TIMEOUT = 30 # > dequeue blmove 블록(5s)
def make_queue_redis():
"""블로킹 dequeue(BLMOVE)용 redis 클라이언트. socket_timeout > 블록(5s) 보장."""
return aioredis.from_url(
REDIS_URL, decode_responses=False,
socket_timeout=QUEUE_SOCKET_TIMEOUT, socket_keepalive=True,
)
async def worker_loop():
"""무한 루프 — paused 체크 → ReliableQueue.dequeue → process_one → ack/fail."""
redis = make_queue_redis()
queue = ReliableQueue(redis, queue_key=QUEUE_KEY)
async with httpx.AsyncClient() as client:
logger.info("insta-render worker started worker_id=%s queue=%s",
queue.worker_id, QUEUE_KEY)
# F6: startup recovery — 이전 crash 시 잔존 orphan 재큐
try:
recovered = await queue.recover()
if recovered:
logger.info("recovered %d orphaned items at startup", recovered)
except Exception:
logger.exception("startup recover failed")
while True:
try:
paused = await redis.get(PAUSED_KEY)
if paused == b"1":
await asyncio.sleep(10)
continue
await poll_once(queue, client)
except asyncio.CancelledError:
logger.info("worker_loop cancelled")
raise
except Exception:
logger.exception("worker_loop iteration 실패, 5초 후 재시도")
await asyncio.sleep(5)