From 685320f3cfea6076605d30f5ad76c284db43f385 Mon Sep 17 00:00:00 2001
From: gahusb <bgg8988@gmail.com>
Date: Sat, 16 May 2026 17:47:17 +0900
Subject: [PATCH] feat(insta-lab): trend_collector with NAVER popular fetcher

---
 insta-lab/app/trend_collector.py        | 180 ++++++++++++++++++++++++
 insta-lab/tests/test_trend_collector.py | 121 ++++++++++++++++
 2 files changed, 301 insertions(+)
 create mode 100644 insta-lab/app/trend_collector.py
 create mode 100644 insta-lab/tests/test_trend_collector.py

diff --git a/insta-lab/app/trend_collector.py b/insta-lab/app/trend_collector.py
new file mode 100644
index 0000000..545dc75
--- /dev/null
+++ b/insta-lab/app/trend_collector.py
@@ -0,0 +1,180 @@
+"""External trend collection — NAVER popular + Google Trends + LLM category classification."""
+
+import json
+import logging
+import re
+import time
+from typing import Any, Dict, List, Optional
+
+import requests
+from anthropic import Anthropic
+from pytrends.request import TrendReq
+
+from .config import (
+    NAVER_CLIENT_ID, NAVER_CLIENT_SECRET, DEFAULT_CATEGORY_SEEDS,
+    ANTHROPIC_API_KEY, ANTHROPIC_MODEL_HAIKU,
+)
+from . import db
+from .news_collector import _clean
+from .keyword_extractor import _count_nouns, _top_candidates
+
+logger = logging.getLogger(__name__)
+
+NEWS_URL = "https://openapi.naver.com/v1/search/news.json"  # NAVER news search endpoint
+_NAVER_HEADERS = {  # credential headers sent with every NEWS_URL request
+    "X-Naver-Client-Id": NAVER_CLIENT_ID,
+    "X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
+}
+
+
+def _seeds_for(category: str) -> List[str]:
+    """Return the seed keywords of ``category``: the DB "category_seeds" override if usable, else config defaults."""
+    pt = db.get_prompt_template("category_seeds")
+    try:
+        data = json.loads(pt["template"]) if pt and pt.get("template") else {}
+        if category in data:
+            return list(data[category])
+    except Exception as e:  # a broken override is reported, not hidden; defaults are used below
+        logger.warning("category_seeds template unusable: %s", e)
+    return list(DEFAULT_CATEGORY_SEEDS.get(category, []))
+
+
+def fetch_naver_popular(category: str, per_seed: int = 30, top_n: int = 10) -> List[Dict[str, Any]]:
+    """Call NAVER news.json with `sort=sim` once per seed keyword of the category,
+    extract frequent terms from the returned titles and descriptions,
+    and return the top N of them as trend dicts."""
+    seed_words = _seeds_for(category)
+    if not seed_words:
+        return []
+    fragments: List[str] = []
+    for seed in seed_words:
+        query = {"query": seed, "display": per_seed, "sort": "sim"}
+        try:
+            resp = requests.get(
+                NEWS_URL, headers=_NAVER_HEADERS, params=query, timeout=10,
+            )
+            resp.raise_for_status()
+            for article in resp.json().get("items", []):
+                # Title first, then description, for every article.
+                for field in ("title", "description"):
+                    fragments.append(_clean(article.get(field, "")))
+        except Exception as e:
+            # Best effort: a failing seed is logged and skipped; the others still count.
+            logger.warning("fetch_naver_popular seed=%s err=%s", seed, e)
+    text = "\n".join(fragments)
+    ranked = _top_candidates(_count_nouns(text), n=top_n)
+    if not ranked:
+        return []
+    # Scores are relative to the first candidate's count (1 when that count is falsy).
+    top_count = ranked[0][1] or 1
+    return [
+        {
+            "keyword": term,
+            "category": category,
+            "source": "naver_popular",
+            "score": round(min(1.0, hits / top_count), 4),
+            "articles_count": hits,
+        }
+        for term, hits in ranked
+    ]
+
+
+def collect_naver_popular_for(categories: List[str]) -> int:
+    """Save the NAVER popular trends of each category; return how many were passed to the DB layer."""
+    stored = 0
+    for category in categories:
+        for trend in fetch_naver_popular(category):
+            db.add_external_trend(trend)
+            stored += 1
+    return stored
+
+
+# ── LLM classification cache ─────────────────────────────────────────────────
+
+_CACHE_TTL_SEC = 24 * 3600  # lifetime of one cached classification: 24 hours
+_category_cache: Dict[str, tuple] = {}  # keyword -> (category, expires_ts)
+
+
+def _llm_classify_one(keyword: str) -> str:
+    """Classify one keyword with a single Claude Haiku call; "uncategorized" if no API key or no label matches."""
+    if not ANTHROPIC_API_KEY:
+        return "uncategorized"
+    seeds_template = db.get_prompt_template("category_seeds")
+    allowed = sorted(DEFAULT_CATEGORY_SEEDS.keys())
+    if seeds_template and seeds_template.get("template"):
+        try:  # labels come from the DB override when it parses, else the defaults above stay
+            allowed = sorted(json.loads(seeds_template["template"]).keys())
+        except Exception as e:
+            logger.warning("category_seeds template unusable: %s", e)
+    allowed.append("uncategorized")
+
+    client = Anthropic(api_key=ANTHROPIC_API_KEY)
+    msg = client.messages.create(
+        model=ANTHROPIC_MODEL_HAIKU,
+        max_tokens=20,
+        messages=[{
+            "role": "user",
+            "content": (
+                f"다음 한국어 트렌딩 키워드를 카테고리 중 하나로 분류해라. "
+                f"카테고리: {allowed}. 키워드: '{keyword}'. "
+                f"카테고리명 한 단어만 출력. 다른 텍스트 금지."
+            ),
+        }],
+    )
+    raw = msg.content[0].text.strip().lower() if msg.content else ""  # empty reply -> no match
+    # Longest label first, so a label contained in a longer one cannot shadow it.
+    for cat in sorted(allowed, key=len, reverse=True):
+        if cat.lower() in raw:
+            return cat
+    return "uncategorized"
+
+
+def classify_keyword(keyword: str) -> str:
+    """Return the keyword's category, asking the LLM only when no unexpired cache entry exists."""
+    now = time.time()
+    category, expires_at = _category_cache.get(keyword, ("", 0.0))
+    if expires_at <= now:  # missing or expired entry: classify again and cache for _CACHE_TTL_SEC
+        category = _llm_classify_one(keyword)
+        _category_cache[keyword] = (category, now + _CACHE_TTL_SEC)
+    return category
+
+
+# ── Google Trends ─────────────────────────────────────────────────────────────
+
+def fetch_google_trends() -> List[Dict[str, Any]]:
+    """Fetch pytrends daily trending searches for South Korea; return an empty list if the fetch fails."""
+    try:
+        pytrends = TrendReq(hl="ko-KR", tz=540)
+        df = pytrends.trending_searches(pn="south_korea")
+    except Exception as e:
+        logger.warning("Google Trends fetch failed: %s", e)
+        return []
+
+    items: List[Dict[str, Any]] = []
+    total = max(1, len(df))
+    # Score by row position, not index label: labels are only 0..n-1 for a default index.
+    for pos, (_, row) in enumerate(df.iterrows()):
+        kw = str(row.iloc[0]).strip()
+        if not kw:
+            continue
+        items.append({
+            "keyword": kw,
+            "category": classify_keyword(kw),
+            "source": "google_trends",
+            "score": round(max(0.0, 1.0 - pos / total), 4),
+            "articles_count": 0,
+        })
+    return items
+
+
+def collect_google_trends() -> int:  # persists fetched Google trends; returns how many were fetched
+    items = fetch_google_trends()
+    for it in items:
+        db.add_external_trend(it)  # one DB call per trend item
+    return len(items)
+
+
+def collect_all(categories: List[str]) -> Dict[str, int]:
+    """Run the NAVER collector, then the Google Trends collector; return each one's count by source."""
+    from_naver, from_google = collect_naver_popular_for(categories), collect_google_trends()
+    return {"naver_popular": from_naver, "google_trends": from_google}
diff --git a/insta-lab/tests/test_trend_collector.py b/insta-lab/tests/test_trend_collector.py
new file mode 100644
index 0000000..465e729
--- /dev/null
+++ b/insta-lab/tests/test_trend_collector.py
@@ -0,0 +1,121 @@
+import os
+import gc
+import tempfile
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+from app import db as db_module
+from app import trend_collector
+
+
+@pytest.fixture
+def tmp_db(monkeypatch):  # yields the path of a throwaway DB file that db_module.DB_PATH points at
+    fd, path = tempfile.mkstemp(suffix=".db")
+    os.close(fd)  # only the path is needed; the descriptor from mkstemp is closed right away
+    monkeypatch.setattr(db_module, "DB_PATH", path)
+    db_module.init_db()
+    yield path
+    gc.collect()  # NOTE(review): presumably drops lingering DB connections before deletion — confirm
+    for ext in ("", "-wal", "-shm"):  # the DB file plus its "-wal"/"-shm" siblings
+        try:
+            os.remove(path + ext)
+        except OSError:
+            pass  # best-effort cleanup: a file that is missing or cannot be removed is skipped
+
+
+NAVER_RESPONSE = {  # canned NAVER news payload; "기준금리" appears in two of the three titles
+    "items": [
+        {"title": "<b>기준금리</b> 인상", "link": "https://n.news.naver.com/a/1", "description": "한국은행 발표"},
+        {"title": "환율 급등", "link": "https://n.news.naver.com/a/2", "description": "달러 강세"},
+        {"title": "기준금리 추가 인상", "link": "https://n.news.naver.com/a/3", "description": "추가 발표"},
+    ],
+}
+
+
+def test_fetch_naver_popular_extracts_top_terms(tmp_db, monkeypatch):
+    """With a mocked NAVER reply, "기준금리" is among the terms and every trend is tagged economy/naver_popular."""
+    response = MagicMock()
+    response.raise_for_status.return_value = None
+    response.json.return_value = NAVER_RESPONSE
+
+    with patch.object(trend_collector.requests, "get", return_value=response):
+        trends = trend_collector.fetch_naver_popular("economy", per_seed=10, top_n=5)
+
+    assert "기준금리" in {trend["keyword"] for trend in trends}
+    for trend in trends:
+        assert trend["category"] == "economy"
+        assert trend["source"] == "naver_popular"
+        assert 0.0 <= trend["score"] <= 1.0
+
+
+def test_collect_naver_writes_to_db(tmp_db, monkeypatch):
+    """Collecting for one category stores at least one row, all with source "naver_popular"."""
+    response = MagicMock(**{"json.return_value": NAVER_RESPONSE, "raise_for_status.return_value": None})
+    with patch.object(trend_collector.requests, "get", return_value=response):
+        stored = trend_collector.collect_naver_popular_for(["economy"])
+    assert stored > 0
+    rows = db_module.list_trends(source="naver_popular")
+    assert len(rows) > 0
+    for row in rows:
+        assert row["source"] == "naver_popular"
+
+
+def test_classify_keyword_with_cache(monkeypatch):
+    """Two lookups of the same keyword trigger one LLM call; the second is answered from the cache."""
+    fake_llm = MagicMock(return_value="economy")
+    monkeypatch.setattr(trend_collector, "_llm_classify_one", fake_llm)
+    # Start from an empty cache so the first lookup has to reach the fake LLM.
+    trend_collector._category_cache.clear()
+
+    first = trend_collector.classify_keyword("기준금리")
+    second = trend_collector.classify_keyword("기준금리")
+
+    assert first == "economy"
+    assert second == "economy"
+    # Exactly one call: the repeat never reached _llm_classify_one.
+    assert fake_llm.call_count == 1
+
+
+def test_fetch_google_trends_parses_and_classifies(tmp_db, monkeypatch):
+    """Each trending keyword gets the category from classify_keyword and source "google_trends"."""
+    expected = {"기준금리": "economy", "BTS 컴백": "celebrity", "스트레스 관리": "psychology"}
+
+    class FakePyTrends:
+        def __init__(self, *_a, **_kw):
+            pass
+
+        def trending_searches(self, pn="south_korea"):
+            import pandas as pd
+            return pd.DataFrame({"0": list(expected)})
+
+    monkeypatch.setattr(trend_collector, "TrendReq", FakePyTrends)
+    monkeypatch.setattr(trend_collector, "classify_keyword", lambda kw: expected.get(kw, "uncategorized"))
+
+    trends = trend_collector.fetch_google_trends()
+    by_kw = {t["keyword"]: t for t in trends}
+    for kw, category in expected.items():
+        assert by_kw[kw]["category"] == category
+    assert all(t["source"] == "google_trends" for t in trends)
+
+
+def test_collect_all_invokes_both_sources(tmp_db, monkeypatch):
+    """collect_all reports the count returned by each (stubbed) collector under its source key."""
+    stubs = {"collect_naver_popular_for": lambda cats: 5, "collect_google_trends": lambda: 3}
+    for name, stub in stubs.items():
+        monkeypatch.setattr(trend_collector, name, stub)
+    counts = trend_collector.collect_all(["economy"])
+    assert counts == {"naver_popular": 5, "google_trends": 3}
+
+
+def test_fetch_google_trends_graceful_on_pytrends_failure(monkeypatch):
+    """A pytrends error is swallowed: fetch_google_trends returns an empty list."""
+    # TrendReq(...) hands back a stub whose trending_searches() raises, imitating a pytrends outage.
+    failing_client = MagicMock()
+    failing_client.trending_searches.side_effect = RuntimeError("rate limited")
+    fake_trendreq = MagicMock(return_value=failing_client)
+    monkeypatch.setattr(trend_collector, "TrendReq", fake_trendreq)
+
+    result = trend_collector.fetch_google_trends()
+
+    assert result == []