# web-page-backend/music-lab/app/pipeline/metadata.py

"""메타데이터 생성 — Claude Haiku + 템플릿 폴백."""
import os
import json
import logging

import httpx

# Module-level logger registered under the fixed name "music-lab.metadata".
logger = logging.getLogger("music-lab.metadata")

# Model ID used when the CLAUDE_HAIKU_MODEL environment variable is not set.
CLAUDE_HAIKU_MODEL_DEFAULT = "claude-haiku-4-5-20251001"
# Timeout value handed to httpx.AsyncClient for the Claude API request.
TIMEOUT_S = 30


def _get_api_key() -> str:
    """Return the ``ANTHROPIC_API_KEY`` environment variable, or ``""`` if unset.

    The environment is read on every call rather than cached at import time.
    """
    environment = os.environ
    return environment.get("ANTHROPIC_API_KEY", "")


def _get_model() -> str:
    """Return the Claude model ID to use.

    The ``CLAUDE_HAIKU_MODEL`` environment variable wins when it is set;
    otherwise ``CLAUDE_HAIKU_MODEL_DEFAULT`` is returned.
    """
    fallback_model = CLAUDE_HAIKU_MODEL_DEFAULT
    return os.environ.get("CLAUDE_HAIKU_MODEL", fallback_model)


def _chapter_offset_sec(raw: object) -> int:
    """Coerce a raw ``start_offset_sec`` value to a non-negative whole second count."""
    try:
        offset = int(raw)
    except (TypeError, ValueError, OverflowError):
        # Covers None (e.g. a NULL column) and numeric strings such as "12.5".
        try:
            offset = int(float(raw))
        except (TypeError, ValueError, OverflowError):
            offset = 0
    # A negative offset would make divmod produce a nonsensical timestamp.
    return max(offset, 0)


def _format_chapters(tracks: list[dict]) -> str:
    """Build a chapter list with one ``<timestamp> <title>`` line per track.

    The timestamp is ``mm:ss``, or ``hh:mm:ss`` once the offset reaches one
    hour. No brackets are written around the timestamp.

    Args:
        tracks: Track dicts. ``start_offset_sec`` (an offset in seconds) and
            ``title`` are read from each one. A missing, ``None`` or
            unparsable offset is treated as 0, a negative offset is clamped
            to 0, and a missing or ``None`` title becomes an empty string.

    Returns:
        The chapter lines joined with newlines, or ``""`` when ``tracks`` is
        empty.
    """
    if not tracks:
        return ""
    lines = []
    for t in tracks:
        minutes, seconds = divmod(_chapter_offset_sec(t.get("start_offset_sec")), 60)
        hours, minutes = divmod(minutes, 60)
        if hours > 0:
            ts = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        else:
            ts = f"{minutes:02d}:{seconds:02d}"
        title = t.get("title")
        lines.append(f"{ts} {'' if title is None else title}")
    return "\n".join(lines)


async def generate(*, track: dict, template: dict, trend_keywords: list[str],
                   feedback: str = "", tracks: list[dict] | None = None) -> dict:
    """Generate upload metadata with the LLM, falling back to the template.

    The LLM is used when an API key is configured and the call succeeds.
    When no key is configured, or the call or response parsing fails, the
    result comes from plain template substitution instead.

    Args:
        track: Track dict passed to the LLM prompt and the template fallback.
        template: Template dict passed to the LLM prompt and the fallback.
        trend_keywords: Keywords forwarded to the LLM prompt.
        feedback: Optional user feedback forwarded to the LLM prompt.
        tracks: Optional list of track dicts, forwarded so a chapter list can
            be added when there is more than one.

    Returns:
        A dict with ``used_fallback`` (bool) and ``error`` (``None`` on LLM
        success, otherwise a message). On the fallback path it also holds
        ``title``, ``description``, ``tags`` and ``category_id``; on the LLM
        path the remaining keys are whatever JSON object the model returned.
    """
    api_key = _get_api_key()
    if not api_key:
        return {**_fallback_template(track, template, tracks), "used_fallback": True, "error": "no api key"}

    try:
        result = await _call_claude(track, template, trend_keywords, feedback, tracks,
                                    api_key=api_key, model=_get_model())
        return {**result, "used_fallback": False, "error": None}
    # IndexError / TypeError cover malformed response shapes (e.g. an empty
    # "content" list or a non-mapping payload) so they degrade to the
    # fallback instead of escaping to the caller.
    except (httpx.HTTPError, httpx.TimeoutException, KeyError, IndexError, TypeError,
            ValueError, json.JSONDecodeError) as e:
        logger.warning("메타데이터 LLM 실패 — 폴백: %s", e)
        return {**_fallback_template(track, template, tracks), "used_fallback": True, "error": str(e)}


def _fallback_template(track: dict, template: dict, tracks: list[dict] | None = None) -> dict:
    fmt_vars = {
        "title": track.get("title", ""),
        "genre": track.get("genre", ""),
        "bpm": track.get("bpm", ""),
        "key": track.get("key", ""),
        "scale": track.get("scale", ""),
    }
    title = template.get("title", "{title}").format(**fmt_vars)
    description = template.get("description", "{title}").format(**fmt_vars)
    if tracks and len(tracks) > 1:
        description = description + "\n\n" + _format_chapters(tracks)
    return {
        "title": title[:100],
        "description": description[:5000],
        "tags": (template.get("tags") or [])[:15],
        "category_id": template.get("category_id", 10),
    }


async def _call_claude(track: dict, template: dict, trend_keywords: list[str],
                        feedback: str, tracks: list[dict] | None,
                        *, api_key: str, model: str) -> dict:
    user_prompt = (
        "다음 트랙의 YouTube 메타데이터를 생성하세요. JSON으로만 응답.\n\n"
        f"트랙: {json.dumps(track, ensure_ascii=False)}\n"
        f"템플릿: {json.dumps(template, ensure_ascii=False)}\n"
        f"트렌드 키워드: {', '.join(trend_keywords)}\n"
    )
    if tracks and len(tracks) > 1:
        chapters = _format_chapters(tracks)
        user_prompt += (
            f"\n이 영상은 {len(tracks)}개 트랙의 mix입니다. "
            f"description에 다음 챕터 리스트를 그대로 포함하세요 (YouTube 자동 챕터 인식용):\n{chapters}\n"
        )
    if feedback:
        user_prompt += f"\n사용자 피드백: {feedback}\n"
    user_prompt += (
        '\n출력 JSON: {"title": "60자 이내", "description": "1000자 이내",'
        ' "tags": ["15개 이내"], "category_id": 10}'
    )

    async with httpx.AsyncClient(timeout=TIMEOUT_S) as client:
        resp = await client.post(
            "https://api.anthropic.com/v1/messages",
            headers={
                "x-api-key": api_key,
                "anthropic-version": "2023-06-01",
                "content-type": "application/json",
            },
            json={
                "model": model,
                "max_tokens": 2048,  # mix 더 길어서
                "messages": [{"role": "user", "content": user_prompt}],
            },
        )
        resp.raise_for_status()
        text = resp.json()["content"][0]["text"]
        # 가장 첫 JSON 블록 추출
        start = text.find("{")
        end = text.rfind("}") + 1
        if start < 0 or end <= start:
            raise ValueError("Claude 응답에 JSON 블록 없음")
        return json.loads(text[start:end])