feat(music-lab): orchestrator step 자동 재시도 (publish 제외)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-12 00:20:29 +09:00
parent d638666659
commit e90e25d78f
2 changed files with 91 additions and 21 deletions

View File

@@ -11,6 +11,10 @@ from .gradient import make_gradient_with_title
# Module-level logger for the orchestrator.
logger = logging.getLogger("music-lab.orchestrator")
# Number of additional retries after the first attempt; run_step makes
# STEP_MAX_RETRIES + 1 attempts in total for a retryable step.
STEP_MAX_RETRIES = 2
# Seconds to sleep before each retry, indexed by the zero-based number of the
# attempt that just failed; run_step clamps the index to the last entry.
STEP_RETRY_BACKOFF_SEC = [5, 15]
# Steps that run_step attempts exactly once (no automatic retry).
# NOTE(review): presumably because re-running "publish" could publish twice — confirm.
NON_RETRY_STEPS = {"publish"}
async def run_step(pipeline_id: int, step: str, feedback: str = "") -> None:
"""단계 실행 → 결과를 DB에 반영하고 *_pending 또는 다음 단계로 전이.
@@ -28,27 +32,34 @@ async def run_step(pipeline_id: int, step: str, feedback: str = "") -> None:
db.update_pipeline_state(pipeline_id, "failed", failed_reason=f"{step}: {e}")
return
try:
if step == "cover":
result = await _run_cover(p, ctx, feedback)
elif step == "video":
result = await _run_video(p, ctx)
elif step == "thumb":
result = await _run_thumb(p, ctx, feedback)
elif step == "meta":
result = await _run_meta(p, ctx, feedback)
elif step == "review":
result = await _run_review(p, ctx)
elif step == "publish":
result = await _run_publish(p, ctx)
else:
raise ValueError(f"unknown step: {step}")
db.update_pipeline_job(job_id, status="succeeded")
db.update_pipeline_state(pipeline_id, result["next_state"], **result.get("fields", {}))
except Exception as e:
logger.exception("step %s failed for pipeline %s", step, pipeline_id)
db.update_pipeline_job(job_id, status="failed", error=str(e))
db.update_pipeline_state(pipeline_id, "failed", failed_reason=f"{step}: {e}")
attempts = 1 if step in NON_RETRY_STEPS else (STEP_MAX_RETRIES + 1)
last_err = None
for i in range(attempts):
try:
result = await _dispatch_step(step, p, ctx, feedback)
db.update_pipeline_job(job_id, status="succeeded")
db.update_pipeline_state(pipeline_id, result["next_state"], **result.get("fields", {}))
return
except Exception as e:
last_err = e
logger.exception(
"step %s 실패 (pipeline %s, attempt %d/%d)", step, pipeline_id, i + 1, attempts
)
if i < attempts - 1:
await asyncio.sleep(STEP_RETRY_BACKOFF_SEC[min(i, len(STEP_RETRY_BACKOFF_SEC) - 1)])
db.update_pipeline_job(job_id, status="failed", error=str(last_err))
db.update_pipeline_state(pipeline_id, "failed", failed_reason=f"{step}: {last_err}")
async def _dispatch_step(step: str, p: dict, ctx: dict, feedback: str) -> dict:
    """Run the coroutine that implements *step* and return its result.

    Args:
        step: Step name; one of "cover", "video", "thumb", "meta",
            "review" or "publish".
        p: Pipeline record, passed through to the step coroutine.
        ctx: Step context, passed through to the step coroutine.
        feedback: Feedback text; forwarded only to the "cover", "thumb"
            and "meta" steps.

    Returns:
        Whatever the selected ``_run_*`` coroutine returns.

    Raises:
        ValueError: If *step* is not one of the known step names.
    """
    if step == "cover":
        outcome = await _run_cover(p, ctx, feedback)
    elif step == "video":
        outcome = await _run_video(p, ctx)
    elif step == "thumb":
        outcome = await _run_thumb(p, ctx, feedback)
    elif step == "meta":
        outcome = await _run_meta(p, ctx, feedback)
    elif step == "review":
        outcome = await _run_review(p, ctx)
    elif step == "publish":
        outcome = await _run_publish(p, ctx)
    else:
        raise ValueError(f"unknown step: {step}")
    return outcome
def _resolve_input(p: dict) -> dict: