"""외부 트렌드 수집 — NAVER 인기 + Google Trends + LLM 카테고리 분류.""" import json import logging import re import time from typing import Any, Dict, List, Optional import requests from anthropic import Anthropic from pytrends.request import TrendReq from .config import ( NAVER_CLIENT_ID, NAVER_CLIENT_SECRET, DEFAULT_CATEGORY_SEEDS, ANTHROPIC_API_KEY, ANTHROPIC_MODEL_HAIKU, ) from . import db from .news_collector import _clean from .keyword_extractor import _count_nouns, _top_candidates logger = logging.getLogger(__name__) NEWS_URL = "https://openapi.naver.com/v1/search/news.json" _NAVER_HEADERS = { "X-Naver-Client-Id": NAVER_CLIENT_ID, "X-Naver-Client-Secret": NAVER_CLIENT_SECRET, } def _seeds_for(category: str) -> List[str]: pt = db.get_prompt_template("category_seeds") if pt and pt.get("template"): try: data = json.loads(pt["template"]) if category in data: return list(data[category]) except Exception: pass return list(DEFAULT_CATEGORY_SEEDS.get(category, [])) def fetch_naver_popular(category: str, per_seed: int = 30, top_n: int = 10) -> List[Dict[str, Any]]: """카테고리 시드 키워드들로 NAVER news.json `sort=sim` 호출, 응답 기사 묶음에서 빈도어 추출 후 상위 N개 반환.""" seeds = _seeds_for(category) if not seeds: return [] blob_parts: List[str] = [] for seed in seeds: try: resp = requests.get( NEWS_URL, headers=_NAVER_HEADERS, params={"query": seed, "display": per_seed, "sort": "sim"}, timeout=10, ) resp.raise_for_status() for item in resp.json().get("items", []): blob_parts.append(_clean(item.get("title", ""))) blob_parts.append(_clean(item.get("description", ""))) except Exception as e: logger.warning("fetch_naver_popular seed=%s err=%s", seed, e) continue text = "\n".join(blob_parts) counts = _count_nouns(text) candidates = _top_candidates(counts, n=top_n) if not candidates: return [] max_count = candidates[0][1] or 1 return [ { "keyword": k, "category": category, "source": "naver_popular", "score": round(min(1.0, c / max_count), 4), "articles_count": c, } for k, c in candidates ] def collect_naver_popular_for(categories: List[str]) -> int: total = 0 for cat in categories: trends = fetch_naver_popular(cat) for t in trends: db.add_external_trend(t) total += 1 return total # ── LLM 분류 캐시 ──────────────────────────────────────────────────────────── _CACHE_TTL_SEC = 24 * 3600 _category_cache: Dict[str, tuple] = {} # keyword -> (category, expires_ts) def _llm_classify_one(keyword: str) -> str: """Claude Haiku 1회 호출로 단일 키워드 분류.""" if not ANTHROPIC_API_KEY: return "uncategorized" seeds_template = db.get_prompt_template("category_seeds") if seeds_template and seeds_template.get("template"): try: allowed = sorted(json.loads(seeds_template["template"]).keys()) except Exception: allowed = sorted(DEFAULT_CATEGORY_SEEDS.keys()) else: allowed = sorted(DEFAULT_CATEGORY_SEEDS.keys()) allowed.append("uncategorized") client = Anthropic(api_key=ANTHROPIC_API_KEY) msg = client.messages.create( model=ANTHROPIC_MODEL_HAIKU, max_tokens=20, messages=[{ "role": "user", "content": ( f"다음 한국어 트렌딩 키워드를 카테고리 중 하나로 분류해라. " f"카테고리: {allowed}. 키워드: '{keyword}'. " f"카테고리명 한 단어만 출력. 다른 텍스트 금지." ), }], ) raw = msg.content[0].text.strip().lower() for cat in allowed: if cat.lower() in raw: return cat return "uncategorized" def classify_keyword(keyword: str) -> str: now = time.time() cached = _category_cache.get(keyword) if cached and cached[1] > now: return cached[0] cat = _llm_classify_one(keyword) _category_cache[keyword] = (cat, now + _CACHE_TTL_SEC) return cat # ── Google Trends ───────────────────────────────────────────────────────────── def fetch_google_trends() -> List[Dict[str, Any]]: """pytrends 한국 daily trending searches. 실패 시 빈 리스트.""" try: pytrends = TrendReq(hl="ko-KR", tz=540) df = pytrends.trending_searches(pn="south_korea") except Exception as e: logger.warning("Google Trends fetch failed: %s", e) return [] items: List[Dict[str, Any]] = [] for idx, row in df.iterrows(): kw = str(row.iloc[0]).strip() if not kw: continue cat = classify_keyword(kw) rank_score = round(max(0.0, 1.0 - (idx / max(1, len(df)))), 4) items.append({ "keyword": kw, "category": cat, "source": "google_trends", "score": rank_score, "articles_count": 0, }) return items def collect_google_trends() -> int: items = fetch_google_trends() for it in items: db.add_external_trend(it) return len(items) def collect_all(categories: List[str]) -> Dict[str, int]: naver_n = collect_naver_popular_for(categories) google_n = collect_google_trends() return {"naver_popular": naver_n, "google_trends": google_n}