feat(insta-lab): keyword_extractor with frequency + Claude refinement

2026-05-16 00:30:38 +09:00
parent c057304981
commit 07c4459085
2 changed files with 148 additions and 0 deletions
--- a/insta-lab/app/keyword_extractor.py
+++ b/insta-lab/app/keyword_extractor.py
@@ -0,0 +1,83 @@
+"""Keyword extraction: Korean noun frequency plus Claude Haiku refinement."""
+
+import json
+import logging
+import re
+from collections import Counter
+from typing import Any, Dict, List
+
+from anthropic import Anthropic
+
+from .config import ANTHROPIC_API_KEY, ANTHROPIC_MODEL_HAIKU, KEYWORDS_PER_CATEGORY
+from . import db
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Candidate-token pattern: any run of 2-6 consecutive Hangul syllables
+# (U+AC00-U+D7A3). This is a purely lexical match, not morphological analysis,
+# so matches are not guaranteed to be nouns, and a run longer than 6 syllables
+# is cut into successive chunks by findall().
+_NOUN_RE = re.compile(r"[가-힣]{2,6}")
+# Common Korean function/filler words that are excluded when ranking
+# candidates by frequency.
+_STOPWORDS = {
+    "있다", "없다", "이다", "되다", "그리고", "하지만", "통해", "위해", "오늘", "이번",
+    "지난", "관련", "대해", "또한", "다만", "한편", "최근", "앞서", "현재", "진행",
+    "발생", "결과", "이상", "이하", "여러", "다양", "방법", "경우", "이유", "필요",
+}
+
+
+def _count_nouns(text: str) -> Dict[str, int]:
+    """Tally every ``_NOUN_RE`` match in *text*; a falsy *text* yields an empty tally."""
+    haystack = text if text else ""
+    return Counter(_NOUN_RE.findall(haystack))
+
+
+def _top_candidates(counts: Dict[str, int], n: int = 20) -> List[tuple]:
+    """Return the *n* highest-count ``(token, count)`` pairs, excluding stopwords.
+
+    Pairs with equal counts keep the iteration order of *counts*.
+    """
+    ranked = []
+    for token, freq in counts.items():
+        if token in _STOPWORDS:
+            continue
+        ranked.append((token, freq))
+    # list.sort is stable even with reverse=True, so ties stay in input order.
+    ranked.sort(key=lambda pair: pair[1], reverse=True)
+    return ranked[:n]
+
+
+def _refine_with_llm(category: str, candidates: List[tuple], articles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Refine frequency candidates into scored keywords using Claude Haiku.
+
+    Args:
+        category: Category label interpolated into the prompt.
+        candidates: ``(keyword, count)`` pairs, expected most-frequent first.
+        articles: Article dicts; the ``"title"`` of the first 15 is sent to the
+            model as context.
+
+    Returns:
+        A list of dicts with at least ``"keyword"`` (non-empty ``str``) and
+        ``"score"`` (``float`` clamped to [0, 1]); model-provided extras such as
+        ``"reason"`` are passed through. When no API key is configured, or the
+        model reply cannot be used, the first ``KEYWORDS_PER_CATEGORY``
+        candidates are returned with a frequency-derived score
+        (``count / 10`` capped at 1.0). Returns ``[]`` when *candidates* is empty.
+
+    Raises:
+        Whatever the Anthropic client raises for the request itself; API errors
+        are deliberately not swallowed here.
+    """
+
+    def _freq_fallback(reason: str) -> List[Dict[str, Any]]:
+        # Frequency-only result used whenever the LLM path is unavailable.
+        return [
+            {"keyword": k, "score": min(1.0, c / 10), "reason": reason}
+            for k, c in candidates[:KEYWORDS_PER_CATEGORY]
+        ]
+
+    if not ANTHROPIC_API_KEY:
+        return _freq_fallback("freq")
+    if not candidates:
+        # Nothing to choose from: skip the API call rather than let the model
+        # invent keywords that never appeared in the articles.
+        return []
+
+    client = Anthropic(api_key=ANTHROPIC_API_KEY)
+    # Skip empty/None titles so the prompt construction cannot fail on them.
+    titles = [a["title"] for a in articles[:15] if a["title"]]
+    prompt = f"""너는 인스타그램 카드 뉴스 큐레이터다.
+카테고리: {category}
+빈도 상위 후보: {[k for k, _ in candidates]}
+관련 기사 제목 일부:
+{chr(10).join('- ' + t for t in titles)}
+
+이 후보 중에서 인스타 카드 콘텐츠로 적합한 키워드를 score 내림차순으로 최대 {KEYWORDS_PER_CATEGORY}개 골라.
+출력 형식 (JSON 배열만):
+[{{"keyword": "...", "score": 0.0~1.0, "reason": "..."}}]
+"""
+    msg = client.messages.create(
+        model=ANTHROPIC_MODEL_HAIKU,
+        max_tokens=600,
+        messages=[{"role": "user", "content": prompt}],
+    )
+    # The reply may contain no content blocks, or a first block without text;
+    # both cases become an empty string and take the parse-failure fallback.
+    blocks = msg.content
+    text = (getattr(blocks[0], "text", "") or "").strip() if blocks else ""
+    if text.startswith("```"):
+        # Strip a surrounding Markdown code fence (optionally tagged "json").
+        text = re.sub(r"^```(?:json)?\s*|\s*```$", "", text).strip()
+    try:
+        parsed = json.loads(text)
+    except json.JSONDecodeError:
+        logger.warning("LLM refine JSON parse failed, falling back to freq")
+        return _freq_fallback("freq-fallback")
+
+    if not isinstance(parsed, list):
+        logger.warning("LLM refine returned non-list JSON, falling back to freq")
+        return _freq_fallback("freq-fallback")
+
+    refined: List[Dict[str, Any]] = []
+    for item in parsed:
+        if not isinstance(item, dict):
+            continue
+        keyword = item.get("keyword")
+        if not isinstance(keyword, str) or not keyword.strip():
+            continue
+        try:
+            score = float(item.get("score", 0.0))
+        except (TypeError, ValueError):
+            score = 0.0
+        # Clamp to the documented 0..1 range; callers store this value as-is.
+        score = min(1.0, max(0.0, score))
+        refined.append({**item, "keyword": keyword.strip(), "score": score})
+
+    if parsed and not refined:
+        # The model answered, but no entry had a usable keyword.
+        logger.warning("LLM refine returned no valid entries, falling back to freq")
+        return _freq_fallback("freq-fallback")
+    return refined
+
+
+def extract_for_category(category: str, limit: int = KEYWORDS_PER_CATEGORY) -> List[Dict[str, Any]]:
+    """Extract keywords from a category's articles, store them, and return them.
+
+    Articles come from ``db.list_news_articles(category=category, days=2)``
+    (presumably the last two days; confirm against the db module). Titles and
+    summaries are concatenated, tokens are counted, the top 20 candidates are
+    refined by ``_refine_with_llm``, and up to *limit* refined entries are
+    written with ``db.add_trending_keyword``.
+
+    Args:
+        category: Category to extract keywords for.
+        limit: Maximum number of refined entries to consider.
+
+    Returns:
+        One dict per stored keyword: the refined entry plus ``"id"`` (the value
+        returned by ``db.add_trending_keyword``), ``"category"``, and ``"score"``
+        as the same float that was stored. Returns ``[]`` when the articles
+        yield no candidates.
+    """
+    articles = db.list_news_articles(category=category, days=2)
+    # A title or summary may be missing or None; treat either as empty text
+    # instead of failing on str + None.
+    text_blob = "\n".join(
+        (a.get("title") or "") + " " + (a.get("summary") or "") for a in articles
+    )
+    candidates = _top_candidates(_count_nouns(text_blob), n=20)
+    if not candidates:
+        # No tokens to rank, so there is nothing to refine or store.
+        return []
+
+    refined = _refine_with_llm(category, candidates, articles)
+    if not isinstance(refined, list):
+        logger.warning("Keyword refinement returned %s, expected list", type(refined).__name__)
+        return []
+
+    saved: List[Dict[str, Any]] = []
+    for kw in refined[:limit]:
+        keyword = kw.get("keyword") if isinstance(kw, dict) else None
+        if not isinstance(keyword, str) or not keyword:
+            logger.warning("Skipping malformed keyword entry: %r", kw)
+            continue
+        try:
+            score = float(kw.get("score", 0.0))
+        except (TypeError, ValueError):
+            score = 0.0
+        kid = db.add_trending_keyword({
+            "keyword": keyword,
+            "category": category,
+            "score": score,
+            # Counts title matches only; summaries are not searched here.
+            "articles_count": sum(1 for a in articles if keyword in (a.get("title") or "")),
+        })
+        # Return the same score that was stored, not the raw refined value.
+        saved.append({"id": kid, **kw, "score": score, "category": category})
+    return saved