fix(insta-lab): trend_collector — Google Trends RSS + seed placeholder filter

(1) pytrends 4.x가 Google API 변경으로 trending_searches(pn='south_korea')
가 404 반환 → daily trending searches RSS endpoint를 requests로 직접 호출
하도록 교체. pytrends 의존성 제거.

(2) category_seeds 프롬프트 템플릿에 placeholder ('...', 'TBD' 등) 또는
2자 미만 값이 들어가면 NAVER가 400 Bad Request 반환 → _seeds_for에
_is_valid_seed 가드 추가, 모두 invalid면 DEFAULT_CATEGORY_SEEDS 폴백.

테스트 8/8 PASS (기존 6 + placeholder/fallback 2 신규).
This commit is contained in:
2026-05-17 09:21:38 +09:00
parent ad6c744f2c
commit bf5897fc85
3 changed files with 98 additions and 36 deletions

View File

@@ -1,17 +1,19 @@
"""외부 트렌드 수집 — NAVER 인기 + Google Trends + LLM 카테고리 분류. """외부 트렌드 수집 — NAVER 인기 + Google Trends RSS + LLM 카테고리 분류.
Phase B Task 3: Google Trends integration via pytrends + Anthropic Haiku 분류 캐시 (24h TTL). NAVER: 카테고리별 시드 키워드로 인기 검색 → 빈도 상위 추출.
Google Trends: pytrends 4.x가 Google API 변경으로 깨진 상태라 daily RSS endpoint 직접 호출.
LLM 분류 결과는 24h in-memory 캐시.
""" """
import json import json
import logging import logging
import re import re
import time import time
import xml.etree.ElementTree as ET
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import requests import requests
from anthropic import Anthropic from anthropic import Anthropic
from pytrends.request import TrendReq
from .config import ( from .config import (
NAVER_CLIENT_ID, NAVER_CLIENT_SECRET, DEFAULT_CATEGORY_SEEDS, NAVER_CLIENT_ID, NAVER_CLIENT_SECRET, DEFAULT_CATEGORY_SEEDS,
@@ -29,16 +31,36 @@ _NAVER_HEADERS = {
"X-Naver-Client-Secret": NAVER_CLIENT_SECRET, "X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
} }
GOOGLE_TRENDS_RSS_URL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo=KR"
_PLACEHOLDER_SEEDS = {"...", "", "tbd", "todo", "placeholder", "example"}
def _is_valid_seed(s: str) -> bool:
"""프롬프트 템플릿에 placeholder/빈 값이 들어가 NAVER에 400을 유발하는 일을 막는 가드."""
if not s:
return False
s = s.strip()
if len(s) < 2:
return False
if s.lower() in _PLACEHOLDER_SEEDS:
return False
return True
def _seeds_for(category: str) -> List[str]:
    """Return the seed keywords for *category*.

    Reads the ``category_seeds`` prompt template, expected to be a JSON
    object mapping a category name to a list of seed strings. Entries that
    are not strings or are rejected by ``_is_valid_seed`` are dropped, and
    the surviving seeds are returned with surrounding whitespace removed.

    Falls back to ``DEFAULT_CATEGORY_SEEDS`` when the template is missing,
    cannot be parsed, is not a JSON object, has no entry for *category*, or
    contains no valid seed for it.

    Returns:
        A new list of seeds; empty when the category has no defaults either.
    """
    pt = db.get_prompt_template("category_seeds")
    if pt and pt.get("template"):
        try:
            data = json.loads(pt["template"])
        except (TypeError, ValueError) as e:
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # template value that is not str/bytes.
            logger.warning("category_seeds JSON 파싱 실패 → DEFAULT 폴백: %s", e)
            data = None
        if data is not None and not isinstance(data, dict):
            logger.warning("category_seeds 템플릿이 JSON object가 아님 → DEFAULT 폴백")
        elif isinstance(data, dict) and category in data:
            raw = data[category]
            candidates = raw if isinstance(raw, list) else []
            # Strip here so the value handed to callers is the same one that
            # was validated.
            filtered = [
                s.strip()
                for s in candidates
                if isinstance(s, str) and _is_valid_seed(s)
            ]
            if filtered:
                return filtered
            logger.warning("category_seeds[%s]에 유효한 시드 없음 → DEFAULT 폴백", category)
    return list(DEFAULT_CATEGORY_SEEDS.get(category, []))
@@ -143,23 +165,38 @@ def classify_keyword(keyword: str) -> str:
# ── Google Trends ───────────────────────────────────────────────────────────── # ── Google Trends ─────────────────────────────────────────────────────────────
# pytrends 4.x가 Google API 변경(404)으로 자주 깨지므로 daily trending searches
# RSS endpoint를 직접 호출. RSS는 공식 Google Trends 서비스가 제공하며 geo
# 쿼리 파라미터(예: geo=KR)로 region 지정 가능.
def fetch_google_trends() -> List[Dict[str, Any]]: def fetch_google_trends() -> List[Dict[str, Any]]:
"""pytrends 한국 daily trending searches. 실패 시 빈 리스트.""" """Google Trends Daily RSS (한국) 직접 호출. 실패 시 빈 리스트로 graceful degrade."""
try: try:
pytrends = TrendReq(hl="ko-KR", tz=540) resp = requests.get(
df = pytrends.trending_searches(pn="south_korea") GOOGLE_TRENDS_RSS_URL,
timeout=15,
headers={"User-Agent": "Mozilla/5.0 (insta-lab trend collector)"},
)
resp.raise_for_status()
root = ET.fromstring(resp.text)
titles = [
(item.findtext("title") or "").strip()
for item in root.iter("item")
]
titles = [t for t in titles if t]
except Exception as e: except Exception as e:
logger.warning("Google Trends fetch failed: %s", e) logger.warning("Google Trends RSS fetch failed: %s", e)
return [] return []
items: List[Dict[str, Any]] = [] items: List[Dict[str, Any]] = []
for idx, row in df.iterrows(): total = max(1, len(titles))
kw = str(row.iloc[0]).strip() for idx, kw in enumerate(titles):
if not kw: try:
continue
cat = classify_keyword(kw) cat = classify_keyword(kw)
rank_score = round(max(0.0, 1.0 - (idx / max(1, len(df)))), 4) except Exception as e:
logger.warning("classify_keyword(%s) 실패: %s", kw, e)
cat = "uncategorized"
rank_score = round(max(0.0, 1.0 - (idx / total)), 4)
items.append({ items.append({
"keyword": kw, "keyword": kw,
"category": cat, "category": cat,

View File

@@ -7,4 +7,3 @@ jinja2>=3.1.4
playwright==1.48.0 playwright==1.48.0
pytest>=8.0 pytest>=8.0
pytest-asyncio>=0.24 pytest-asyncio>=0.24
pytrends>=4.9

View File

@@ -77,16 +77,20 @@ def test_classify_keyword_with_cache(monkeypatch):
assert calls["n"] == 1 assert calls["n"] == 1
def test_fetch_google_trends_parses_and_classifies(tmp_db, monkeypatch): def test_fetch_google_trends_parses_rss_and_classifies(tmp_db, monkeypatch):
class FakePyTrends: fake_rss = """<?xml version="1.0" encoding="UTF-8"?>
def __init__(self, *_a, **_kw): <rss version="2.0">
pass <channel>
<title>Daily Search Trends</title>
def trending_searches(self, pn="south_korea"): <item><title>기준금리</title></item>
import pandas as pd <item><title>BTS 컴백</title></item>
return pd.DataFrame({"0": ["기준금리", "BTS 컴백", "스트레스 관리"]}) <item><title>스트레스 관리</title></item>
</channel>
monkeypatch.setattr(trend_collector, "TrendReq", FakePyTrends) </rss>"""
fake_resp = MagicMock()
fake_resp.text = fake_rss
fake_resp.raise_for_status.return_value = None
monkeypatch.setattr(trend_collector.requests, "get", lambda *a, **kw: fake_resp)
monkeypatch.setattr(trend_collector, "classify_keyword", monkeypatch.setattr(trend_collector, "classify_keyword",
lambda kw: {"기준금리": "economy", "BTS 컴백": "celebrity", lambda kw: {"기준금리": "economy", "BTS 컴백": "celebrity",
"스트레스 관리": "psychology"}.get(kw, "uncategorized")) "스트레스 관리": "psychology"}.get(kw, "uncategorized"))
@@ -108,14 +112,36 @@ def test_collect_all_invokes_both_sources(tmp_db, monkeypatch):
assert out == {"naver_popular": 5, "google_trends": 3} assert out == {"naver_popular": 5, "google_trends": 3}
def test_fetch_google_trends_graceful_on_rss_failure(monkeypatch):
    """A failing RSS response makes fetch_google_trends return an empty list."""
    failing_response = MagicMock()
    failing_response.raise_for_status.side_effect = RuntimeError("Google returned 404")

    def _fake_get(*args, **kwargs):
        # Every HTTP call gets the same response, whose status check raises.
        return failing_response

    monkeypatch.setattr(trend_collector.requests, "get", _fake_get)

    assert trend_collector.fetch_google_trends() == []
def test_seeds_for_filters_placeholder(tmp_db, monkeypatch):
    """Placeholder, blank and too-short seeds are dropped; valid seeds are kept."""
    from app import db as db_module

    db_module.upsert_prompt_template(
        "category_seeds",
        '{"economy": ["...", "…", "", "a", "real_keyword"]}',
        "test",
    )
    out = trend_collector._seeds_for("economy")
    # "...", "…", "" and "a" (shorter than 2 characters) are filtered out,
    # leaving only the real keyword; no fallback to defaults happens here.
    assert out == ["real_keyword"]
def test_seeds_for_falls_back_when_all_invalid(tmp_db, monkeypatch):
    """When every templated seed is invalid, the config defaults are returned."""
    from app import db as db_module
    from app.config import DEFAULT_CATEGORY_SEEDS

    template_json = '{"economy": ["...", "TBD", ""]}'
    db_module.upsert_prompt_template("category_seeds", template_json, "test")

    # The result must equal the config defaults for the category.
    expected = list(DEFAULT_CATEGORY_SEEDS["economy"])
    assert trend_collector._seeds_for("economy") == expected