fix(insta-lab): trend_collector — Google Trends RSS + seed placeholder filter

(1) pytrends 4.x가 Google API 변경으로 trending_searches(pn='south_korea')
가 404 반환 → daily trending searches RSS endpoint를 requests로 직접 호출
하도록 교체. pytrends 의존성 제거.

(2) category_seeds 프롬프트 템플릿에 placeholder ('...', 'TBD' 등) 또는
2자 미만 값이 들어가면 NAVER가 400 Bad Request 반환 → _seeds_for에
_is_valid_seed 가드 추가, 모두 invalid면 DEFAULT_CATEGORY_SEEDS 폴백.

테스트 8/8 PASS (기존 6 + placeholder/fallback 2 신규).
This commit is contained in:
2026-05-17 09:21:38 +09:00
parent ad6c744f2c
commit bf5897fc85
3 changed files with 98 additions and 36 deletions

View File

@@ -1,17 +1,19 @@
"""외부 트렌드 수집 — NAVER 인기 + Google Trends + LLM 카테고리 분류.
"""외부 트렌드 수집 — NAVER 인기 + Google Trends RSS + LLM 카테고리 분류.
Phase B Task 3: Google Trends integration via pytrends + Anthropic Haiku 분류 캐시 (24h TTL).
NAVER: 카테고리별 시드 키워드로 인기 검색 → 빈도 상위 추출.
Google Trends: pytrends 4.x가 Google API 변경으로 깨진 상태라 daily RSS endpoint 직접 호출.
LLM 분류 결과는 24h in-memory 캐시.
"""
import json
import logging
import re
import time
import xml.etree.ElementTree as ET
from typing import Any, Dict, List, Optional
import requests
from anthropic import Anthropic
from pytrends.request import TrendReq
from .config import (
NAVER_CLIENT_ID, NAVER_CLIENT_SECRET, DEFAULT_CATEGORY_SEEDS,
@@ -29,16 +31,36 @@ _NAVER_HEADERS = {
"X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
}
GOOGLE_TRENDS_RSS_URL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo=KR"
_PLACEHOLDER_SEEDS = {"...", "", "tbd", "todo", "placeholder", "example"}
def _is_valid_seed(s: str) -> bool:
"""프롬프트 템플릿에 placeholder/빈 값이 들어가 NAVER에 400을 유발하는 일을 막는 가드."""
if not s:
return False
s = s.strip()
if len(s) < 2:
return False
if s.lower() in _PLACEHOLDER_SEEDS:
return False
return True
def _seeds_for(category: str) -> List[str]:
"""category_seeds 프롬프트 템플릿이 있으면 사용, 없거나 모두 invalid면 config DEFAULT 폴백."""
pt = db.get_prompt_template("category_seeds")
if pt and pt.get("template"):
try:
data = json.loads(pt["template"])
if category in data:
return list(data[category])
except Exception:
pass
filtered = [s for s in (data[category] or []) if _is_valid_seed(s)]
if filtered:
return filtered
logger.warning("category_seeds[%s]에 유효한 시드 없음 → DEFAULT 폴백", category)
except Exception as e:
logger.warning("category_seeds JSON 파싱 실패 → DEFAULT 폴백: %s", e)
return list(DEFAULT_CATEGORY_SEEDS.get(category, []))
@@ -143,23 +165,38 @@ def classify_keyword(keyword: str) -> str:
# ── Google Trends ─────────────────────────────────────────────────────────────
# pytrends 4.x가 Google API 변경(404)으로 자주 깨지므로 daily trending searches
# RSS endpoint를 직접 호출. RSS는 공식 Google Trends 서비스가 제공하며 pn=geo
# 파라미터로 region 지정 가능.
def fetch_google_trends() -> List[Dict[str, Any]]:
"""pytrends 한국 daily trending searches. 실패 시 빈 리스트."""
"""Google Trends Daily RSS (한국) 직접 호출. 실패 시 빈 리스트로 graceful degrade."""
try:
pytrends = TrendReq(hl="ko-KR", tz=540)
df = pytrends.trending_searches(pn="south_korea")
resp = requests.get(
GOOGLE_TRENDS_RSS_URL,
timeout=15,
headers={"User-Agent": "Mozilla/5.0 (insta-lab trend collector)"},
)
resp.raise_for_status()
root = ET.fromstring(resp.text)
titles = [
(item.findtext("title") or "").strip()
for item in root.iter("item")
]
titles = [t for t in titles if t]
except Exception as e:
logger.warning("Google Trends fetch failed: %s", e)
logger.warning("Google Trends RSS fetch failed: %s", e)
return []
items: List[Dict[str, Any]] = []
for idx, row in df.iterrows():
kw = str(row.iloc[0]).strip()
if not kw:
continue
cat = classify_keyword(kw)
rank_score = round(max(0.0, 1.0 - (idx / max(1, len(df)))), 4)
total = max(1, len(titles))
for idx, kw in enumerate(titles):
try:
cat = classify_keyword(kw)
except Exception as e:
logger.warning("classify_keyword(%s) 실패: %s", kw, e)
cat = "uncategorized"
rank_score = round(max(0.0, 1.0 - (idx / total)), 4)
items.append({
"keyword": kw,
"category": cat,

View File

@@ -7,4 +7,3 @@ jinja2>=3.1.4
playwright==1.48.0
pytest>=8.0
pytest-asyncio>=0.24
pytrends>=4.9

View File

@@ -77,16 +77,20 @@ def test_classify_keyword_with_cache(monkeypatch):
assert calls["n"] == 1
def test_fetch_google_trends_parses_and_classifies(tmp_db, monkeypatch):
class FakePyTrends:
def __init__(self, *_a, **_kw):
pass
def trending_searches(self, pn="south_korea"):
import pandas as pd
return pd.DataFrame({"0": ["기준금리", "BTS 컴백", "스트레스 관리"]})
monkeypatch.setattr(trend_collector, "TrendReq", FakePyTrends)
def test_fetch_google_trends_parses_rss_and_classifies(tmp_db, monkeypatch):
fake_rss = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Daily Search Trends</title>
<item><title>기준금리</title></item>
<item><title>BTS 컴백</title></item>
<item><title>스트레스 관리</title></item>
</channel>
</rss>"""
fake_resp = MagicMock()
fake_resp.text = fake_rss
fake_resp.raise_for_status.return_value = None
monkeypatch.setattr(trend_collector.requests, "get", lambda *a, **kw: fake_resp)
monkeypatch.setattr(trend_collector, "classify_keyword",
lambda kw: {"기준금리": "economy", "BTS 컴백": "celebrity",
"스트레스 관리": "psychology"}.get(kw, "uncategorized"))
@@ -108,14 +112,36 @@ def test_collect_all_invokes_both_sources(tmp_db, monkeypatch):
assert out == {"naver_popular": 5, "google_trends": 3}
def test_fetch_google_trends_graceful_on_pytrends_failure(monkeypatch):
class FakePyTrends:
def __init__(self, *_a, **_kw):
pass
def trending_searches(self, pn="south_korea"):
raise RuntimeError("rate limited")
monkeypatch.setattr(trend_collector, "TrendReq", FakePyTrends)
def test_fetch_google_trends_graceful_on_rss_failure(monkeypatch):
fake_resp = MagicMock()
fake_resp.raise_for_status.side_effect = RuntimeError("Google returned 404")
monkeypatch.setattr(trend_collector.requests, "get", lambda *a, **kw: fake_resp)
out = trend_collector.fetch_google_trends()
assert out == []
def test_seeds_for_filters_placeholder(tmp_db, monkeypatch):
"""category_seeds 템플릿에 placeholder '...'가 들어가도 DEFAULT 폴백."""
from app import db as db_module
db_module.upsert_prompt_template(
"category_seeds",
'{"economy": ["...", "", "a", "real_keyword"]}',
"test",
)
out = trend_collector._seeds_for("economy")
# '...', '…', 'a'(2자 미만)는 필터링되고 'real_keyword'만 남음
assert out == ["real_keyword"]
def test_seeds_for_falls_back_when_all_invalid(tmp_db, monkeypatch):
"""모든 시드가 invalid면 DEFAULT_CATEGORY_SEEDS 폴백."""
from app import db as db_module
db_module.upsert_prompt_template(
"category_seeds",
'{"economy": ["...", "TBD", ""]}',
"test",
)
out = trend_collector._seeds_for("economy")
# DEFAULT_CATEGORY_SEEDS["economy"] 가 반환되어야 함
from app.config import DEFAULT_CATEGORY_SEEDS
assert out == list(DEFAULT_CATEGORY_SEEDS["economy"])