fix(tarot-lab): max_tokens 1400→2800 + stop_reason 검사로 응답 truncation 처리

3-card spread 해석 응답이 1400 토큰 한계에서 잘려 JSON "Unterminated string" 파싱 실패가 reroll 2회 모두 발생하던 버그 수정.

- max_tokens 1400 → 2800 (saju-lab 2400 기준 + interactions 마진)
- stop_reason == "max_tokens" 검사 → 신규 TarotTruncated 예외로 truncation 명시화
- reroll feedback에 "각 카드 1~2문장으로 축약" 안내 추가 → 모델이 다음 응답 길이 조절
- truncation 시나리오 테스트 2개 추가 (1차 잘림→성공, 2회 모두 잘림→TarotError)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 22:55:28 +09:00
parent 99dca8df64
commit be9165efd2
2 changed files with 60 additions and 4 deletions
--- a/tarot-lab/app/pipeline.py
+++ b/tarot-lab/app/pipeline.py
@@ -28,6 +28,15 @@ class TarotError(Exception):
    pass


+class TarotTruncated(Exception):
+    """Claude response was cut off at max_tokens; the reroll feedback must ask for a shorter reply."""
+
+    def __init__(self, raw_text: str, limit: int):
+        self.raw_text = raw_text
+        self.limit = limit
+        super().__init__(f"Claude 응답이 max_tokens={limit}에서 잘림")
+
+
 def calc_cost(tokens_in: int, tokens_out: int) -> float:
    return (
        tokens_in / 1_000_000 * TAROT_COST_INPUT_PER_M
@@ -64,9 +73,10 @@ async def _call_claude(user_text: str, feedback: str = "") -> tuple[dict, dict,
        raise TarotError("ANTHROPIC_API_KEY missing")
    if feedback:
        user_text = f"이전 응답이 다음 이유로 거절됨: {feedback}\n올바른 스키마(시스템 지침)로 다시 응답.\n\n{user_text}"
+    max_tokens = 2800  # 3-card spread JSON(summary + cards×3 evidence + interactions + advice) 안전 마진
    payload = {
        "model": TAROT_MODEL,
-        "max_tokens": 1400,  # 응답 시간 단축 — 3-card spread evidence·interactions 포함 충분
+        "max_tokens": max_tokens,
        "system": [{"type": "text", "text": SYSTEM_PROMPT,
                    "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": [{"type": "text", "text": user_text}]}],
@@ -89,7 +99,15 @@ async def _call_claude(user_text: str, feedback: str = "") -> tuple[dict, dict,
    usage = resp.get("usage", {}) or {}
    tokens_in = int(usage.get("input_tokens", 0) or 0)
    tokens_out = int(usage.get("output_tokens", 0) or 0)
-    logger.info("tarot claude call: latency=%dms, in=%d, out=%d", latency_ms, tokens_in, tokens_out)
+    stop_reason = resp.get("stop_reason", "")
+    logger.info(
+        "tarot claude call: latency=%dms, in=%d, out=%d, stop=%s",
+        latency_ms, tokens_in, tokens_out, stop_reason,
+    )
+    if stop_reason == "max_tokens":
+        # truncation은 JSON 파싱 시도 전에 단락 — _extract_json은 거의 항상 실패하고
+        # 일반 JSONDecodeError로 reroll되면 모델이 길이를 줄일 힌트를 못 받음.
+        raise TarotTruncated(raw_text, max_tokens)
    parsed = _extract_json(raw_text)
    meta = {
        "tokens_in": tokens_in,
@@ -116,8 +134,15 @@ async def interpret(req: TarotInterpretRequest) -> Dict[str, Any]:
            parsed, meta, _raw = await _call_claude(user_text, feedback=last_error)
        except httpx.HTTPError as e:
            raise TarotError(f"Claude HTTP error: {e}") from e
+        except TarotTruncated as e:
+            last_error = (
+                f"이전 응답이 max_tokens={e.limit}에서 잘렸습니다. "
+                "각 카드의 interpretation·evidence·advice를 1~2문장으로 축약해 "
+                "전체 JSON 길이를 줄여 다시 응답하세요."
+            )
+            continue
        except json.JSONDecodeError as e:
-            last_error = f"JSON 파싱 실패: {e}"
+            last_error = f"JSON 파싱 실패: {e}. 모든 문자열을 닫고 유효한 JSON으로 재작성."
            continue
        total_in += meta["tokens_in"]
        total_out += meta["tokens_out"]