# File: insta-lab/app/keyword_extractor.py
# Origin: commit 07c44590854f15302a841078f7a2e17e62fbb98e by gahusb
#         (Sat, 16 May 2026 00:30:38 +0900)
#         feat(insta-lab): keyword_extractor with frequency + Claude refinement
"""Keyword extraction: Hangul token frequency, refined by Claude Haiku."""

import json
import logging
import re
from collections import Counter
from typing import Any, Dict, List

from anthropic import Anthropic, APIError

from .config import ANTHROPIC_API_KEY, ANTHROPIC_MODEL_HAIKU, KEYWORDS_PER_CATEGORY
from . import db

logger = logging.getLogger(__name__)

# Regex heuristic, not morphological analysis: every run of 2-6 Hangul
# syllables counts as one token, so particles stay attached ("인상으로") and
# runs longer than six syllables are split into consecutive chunks.
_NOUN_RE = re.compile(r"[가-힣]{2,6}")
_STOPWORDS = {
    "있다", "없다", "이다", "되다", "그리고", "하지만", "통해", "위해", "오늘", "이번",
    "지난", "관련", "대해", "또한", "다만", "한편", "최근", "앞서", "현재", "진행",
    "발생", "결과", "이상", "이하", "여러", "다양", "방법", "경우", "이유", "필요",
}


def _count_nouns(text: str) -> Dict[str, int]:
    """Return a frequency table of the Hangul tokens found in *text*.

    ``None`` or an empty string yields an empty counter.
    """
    return Counter(_NOUN_RE.findall(text or ""))


def _top_candidates(counts: Dict[str, int], n: int = 20) -> List[tuple]:
    """Return up to *n* ``(token, count)`` pairs, most frequent first.

    Tokens listed in ``_STOPWORDS`` are dropped. The sort is stable, so tokens
    with equal counts keep the iteration order of *counts*.
    """
    filtered = [(k, c) for k, c in counts.items() if k not in _STOPWORDS]
    return sorted(filtered, key=lambda x: x[1], reverse=True)[:n]


def _freq_fallback(candidates: List[tuple], reason: str) -> List[Dict[str, Any]]:
    """Build keyword entries straight from frequency counts.

    The score is ``count / 10`` capped at 1.0; at most
    ``KEYWORDS_PER_CATEGORY`` entries are returned.
    """
    return [
        {"keyword": k, "score": min(1.0, c / 10), "reason": reason}
        for k, c in candidates[:KEYWORDS_PER_CATEGORY]
    ]


def _coerce_score(value: Any) -> float:
    """Convert a model-supplied score to a float clamped to ``[0.0, 1.0]``.

    Anything that cannot be converted (``None``, text, NaN) becomes 0.0.
    """
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0.0
    if score != score:  # NaN is the only float that differs from itself
        return 0.0
    return max(0.0, min(1.0, score))


def _parse_llm_keywords(reply: str) -> List[Dict[str, Any]]:
    """Parse and validate the model reply.

    Returns a list of dicts that each carry a non-empty string ``keyword`` and
    a float ``score``; other keys (such as ``reason``) pass through untouched.

    Raises:
        ValueError: the reply is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass), is not a JSON array, or is a non-empty
            array without a single usable entry.
    """
    text = reply.strip()
    if text.startswith("```"):
        # Strip a Markdown code fence the model may wrap around the JSON.
        text = re.sub(r"^```(?:json)?\s*|\s*```$", "", text).strip()

    data = json.loads(text)
    if not isinstance(data, list):
        raise ValueError("LLM reply is not a JSON array")

    cleaned: List[Dict[str, Any]] = []
    for item in data:
        if not isinstance(item, dict):
            continue
        keyword = item.get("keyword")
        if not isinstance(keyword, str) or not keyword.strip():
            continue
        cleaned.append({
            **item,
            "keyword": keyword.strip(),
            "score": _coerce_score(item.get("score", 0.0)),
        })

    # An empty array is a legitimate "nothing suitable" answer; a non-empty
    # array with no usable entry is a malformed reply.
    if data and not cleaned:
        raise ValueError("LLM reply contains no usable keyword entries")
    return cleaned


def _refine_with_llm(category: str, candidates: List[tuple], articles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Refine frequency candidates with Claude Haiku.

    Args:
        category: Category name inserted into the prompt.
        candidates: ``(token, count)`` pairs, most frequent first.
        articles: Article dicts; the ``title`` of the first 15 is sent along
            as context.

    Returns:
        A list of ``{"keyword", "score" (0~1), "reason"}`` dicts. Without
        candidates the result is empty and no request is made. Without an API
        key, or when the request fails or the reply cannot be used, the
        entries come from the raw frequency counts instead.
    """
    if not candidates:
        # Nothing to choose from: skip the request rather than let the model
        # invent keywords.
        return []
    if not ANTHROPIC_API_KEY:
        return _freq_fallback(candidates, "freq")

    titles = [a["title"] for a in articles[:15]]
    prompt = f"""너는 인스타그램 카드 뉴스 큐레이터다.
카테고리: {category}
빈도 상위 후보: {[k for k, _ in candidates]}
관련 기사 제목 일부:
{chr(10).join('- ' + t for t in titles)}

이 후보 중에서 인스타 카드 콘텐츠로 적합한 키워드를 score 내림차순으로 최대 {KEYWORDS_PER_CATEGORY}개 골라.
출력 형식 (JSON 배열만):
[{{"keyword": "...", "score": 0.0~1.0, "reason": "..."}}]
"""
    try:
        client = Anthropic(api_key=ANTHROPIC_API_KEY)
        msg = client.messages.create(
            model=ANTHROPIC_MODEL_HAIKU,
            max_tokens=600,
            messages=[{"role": "user", "content": prompt}],
        )
    except APIError:
        # Refinement is best-effort: an outage or rejected request degrades to
        # the frequency ranking instead of aborting the extraction.
        logger.warning("LLM refine request failed, falling back to freq", exc_info=True)
        return _freq_fallback(candidates, "freq-fallback")

    # Join every text block; an empty or non-text reply becomes "" and is
    # rejected by the parser below.
    reply = "".join(getattr(block, "text", "") or "" for block in msg.content)
    try:
        return _parse_llm_keywords(reply)
    except ValueError:
        logger.warning("LLM refine JSON parse failed, falling back to freq")
        return _freq_fallback(candidates, "freq-fallback")


def extract_for_category(category: str, limit: int = KEYWORDS_PER_CATEGORY) -> List[Dict[str, Any]]:
    """Extract keywords from a category's articles, store them, return them.

    Articles come from ``db.list_news_articles(category=category, days=2)``.
    Each stored row carries the keyword, the category, the score and
    ``articles_count`` (the number of fetched articles whose title or summary
    contains the keyword).

    Args:
        category: Category whose articles are analysed.
        limit: Maximum number of keywords to store and return.

    Returns:
        One dict per stored keyword: the refined entry plus ``id`` (the value
        returned by ``db.add_trending_keyword``) and ``category``.
    """
    articles = db.list_news_articles(category=category, days=2)
    # `summary` may be present but None; `or ""` keeps the concatenation safe.
    texts = [a["title"] + " " + (a.get("summary") or "") for a in articles]
    counts = _count_nouns("\n".join(texts))
    candidates = _top_candidates(counts, n=20)
    refined = _refine_with_llm(category, candidates, articles)[:limit]

    saved: List[Dict[str, Any]] = []
    for kw in refined:
        keyword = kw["keyword"]
        kid = db.add_trending_keyword({
            "keyword": keyword,
            "category": category,
            "score": float(kw.get("score", 0.0)),
            # Match against the same title + summary text the candidates were
            # drawn from, so summary-only keywords are not reported as 0.
            "articles_count": sum(1 for t in texts if keyword in t),
        })
        entry = {"id": kid, **kw, "category": category}
        entry["id"] = kid  # the database id wins over any "id" the model echoed
        saved.append(entry)
    return saved
# File: insta-lab/tests/test_keyword_extractor.py
"""Tests for app.keyword_extractor: counting, stopword filtering, persistence."""

import os
import tempfile
from unittest.mock import patch, MagicMock

import pytest

from app import db as db_module
from app import keyword_extractor


@pytest.fixture
def tmp_db(monkeypatch):
    """Point the db module at a freshly initialised temporary database file."""
    handle, db_file = tempfile.mkstemp(suffix=".db")
    os.close(handle)
    monkeypatch.setattr(db_module, "DB_PATH", db_file)
    db_module.init_db()

    yield db_file

    # Windows-safe cleanup: collect lingering handles first, then delete the
    # database file together with its "-wal" / "-shm" sidecar files.
    import gc
    gc.collect()
    leftovers = [db_file + suffix for suffix in ("", "-wal", "-shm")]
    for leftover in leftovers:
        try:
            os.remove(leftover)
        except OSError:
            pass


def test_count_nouns_extracts_korean_nouns():
    """Repeated Hangul tokens are counted once per occurrence."""
    tally = keyword_extractor._count_nouns(
        "기준금리 인상으로 환율 급등. 기준금리 추가 인상 가능성"
    )
    assert tally["기준금리"] == 2
    assert tally["환율"] == 1


def test_top_candidates_filters_stopwords():
    """Stopwords are dropped even when they are the most frequent tokens."""
    frequencies = {"기준금리": 5, "있다": 7, "환율": 3, "그리고": 4}
    ranked = keyword_extractor._top_candidates(frequencies, n=10)
    picked = [word for word, _count in ranked]
    assert "있다" not in picked
    assert "그리고" not in picked
    assert "기준금리" in picked


def test_extract_for_category_persists(tmp_db):
    """Refined keywords are returned in order and written to the database."""
    # Seed three articles in the category under test.
    seed_articles = [
        {
            "category": "economy",
            "title": f"기준금리 인상 {i}",
            "link": f"https://example.com/{i}",
            "summary": "환율도 영향",
        }
        for i in range(3)
    ]
    for article in seed_articles:
        db_module.add_news_article(article)

    # Replace the LLM refinement step with a canned answer.
    canned = [
        {"keyword": "기준금리", "score": 0.92, "reason": "핵심 금융 이슈"},
        {"keyword": "환율", "score": 0.71, "reason": "시장 영향"},
    ]
    with patch.object(keyword_extractor, "_refine_with_llm", return_value=canned):
        results = keyword_extractor.extract_for_category("economy", limit=2)

    assert len(results) == 2
    assert results[0]["keyword"] == "기준금리"
    stored = db_module.list_trending_keywords(category="economy")
    stored_keywords = {row["keyword"] for row in stored}
    assert stored_keywords == {"기준금리", "환율"}