From bf5897fc857bcffa2687a1930462fb6b5a39b553 Mon Sep 17 00:00:00 2001 From: gahusb Date: Sun, 17 May 2026 09:21:38 +0900 Subject: [PATCH] =?UTF-8?q?fix(insta-lab):=20trend=5Fcollector=20=E2=80=94?= =?UTF-8?q?=20Google=20Trends=20RSS=20+=20seed=20placeholder=20filter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (1) pytrends 4.x가 Google API 변경으로 trending_searches(pn='south_korea') 가 404 반환 → daily trending searches RSS endpoint를 requests로 직접 호출 하도록 교체. pytrends 의존성 제거. (2) category_seeds 프롬프트 템플릿에 placeholder ('...', 'TBD' 등) 또는 2자 미만 값이 들어가면 NAVER가 400 Bad Request 반환 → _seeds_for에 _is_valid_seed 가드 추가, 모두 invalid면 DEFAULT_CATEGORY_SEEDS 폴백. 테스트 8/8 PASS (기존 6 + placeholder/fallback 2 신규). --- insta-lab/app/trend_collector.py | 69 +++++++++++++++++++------ insta-lab/requirements.txt | 1 - insta-lab/tests/test_trend_collector.py | 64 ++++++++++++++++------- 3 files changed, 98 insertions(+), 36 deletions(-) diff --git a/insta-lab/app/trend_collector.py b/insta-lab/app/trend_collector.py index d8a2f02..7dc132b 100644 --- a/insta-lab/app/trend_collector.py +++ b/insta-lab/app/trend_collector.py @@ -1,17 +1,19 @@ -"""외부 트렌드 수집 — NAVER 인기 + Google Trends + LLM 카테고리 분류. +"""External trend collection — NAVER popular searches + Google Trends RSS + LLM category classification. -Phase B Task 3: Google Trends integration via pytrends + Anthropic Haiku 분류 캐시 (24h TTL). +NAVER: search popular items with per-category seed keywords, then extract the most frequent terms. +Google Trends: pytrends 4.x is broken by a Google API change, so the daily RSS endpoint is called directly. +LLM classification results are kept in a 24h in-memory cache.
""" import json import logging import re import time +import xml.etree.ElementTree as ET from typing import Any, Dict, List, Optional import requests from anthropic import Anthropic -from pytrends.request import TrendReq from .config import ( NAVER_CLIENT_ID, NAVER_CLIENT_SECRET, DEFAULT_CATEGORY_SEEDS, @@ -29,16 +31,36 @@ _NAVER_HEADERS = { "X-Naver-Client-Secret": NAVER_CLIENT_SECRET, } +GOOGLE_TRENDS_RSS_URL = "https://trends.google.com/trends/trendingsearches/daily/rss?geo=KR" + +_PLACEHOLDER_SEEDS = {"...", "…", "tbd", "todo", "placeholder", "example"} + + +def _is_valid_seed(s: str) -> bool: + """Guard that rejects placeholder, blank, or under-2-char seeds (they make NAVER return 400) and non-str values.""" + if not isinstance(s, str): + return False + s = s.strip() + if len(s) < 2: + return False + if s.lower() in _PLACEHOLDER_SEEDS: + return False + return True + def _seeds_for(category: str) -> List[str]: + """Return the valid, stripped category_seeds template entries for category, else the config DEFAULT seeds.""" pt = db.get_prompt_template("category_seeds") if pt and pt.get("template"): try: data = json.loads(pt["template"]) if category in data: - return list(data[category]) - except Exception: - pass + filtered = [s.strip() for s in (data[category] or []) if _is_valid_seed(s)] + if filtered: + return filtered + logger.warning("category_seeds[%s]에 유효한 시드 없음 → DEFAULT 폴백", category) + except Exception as e: + logger.warning("category_seeds JSON 파싱 실패 → DEFAULT 폴백: %s", e) return list(DEFAULT_CATEGORY_SEEDS.get(category, [])) @@ -143,23 +165,38 @@ def classify_keyword(keyword: str) -> str: # ── Google Trends ───────────────────────────────────────────────────────────── +# pytrends 4.x often breaks (404) when Google changes its API, so call the daily +# trending searches RSS endpoint directly. Google Trends itself serves the feed; +# the region is selected with the geo query parameter (geo=KR here). def fetch_google_trends() -> List[Dict[str, Any]]: - """pytrends 한국 daily trending searches. 실패 시 빈 리스트.""" + """Google Trends Daily RSS (한국) 직접 호출. 
실패 시 빈 리스트로 graceful degrade.""" try: - pytrends = TrendReq(hl="ko-KR", tz=540) - df = pytrends.trending_searches(pn="south_korea") + resp = requests.get( + GOOGLE_TRENDS_RSS_URL, + timeout=15, + headers={"User-Agent": "Mozilla/5.0 (insta-lab trend collector)"}, + ) + resp.raise_for_status() + root = ET.fromstring(resp.text) + titles = [ + (item.findtext("title") or "").strip() + for item in root.iter("item") + ] + titles = [t for t in titles if t] except Exception as e: - logger.warning("Google Trends fetch failed: %s", e) + logger.warning("Google Trends RSS fetch failed: %s", e) return [] items: List[Dict[str, Any]] = [] - for idx, row in df.iterrows(): - kw = str(row.iloc[0]).strip() - if not kw: - continue - cat = classify_keyword(kw) - rank_score = round(max(0.0, 1.0 - (idx / max(1, len(df)))), 4) + total = max(1, len(titles)) + for idx, kw in enumerate(titles): + try: + cat = classify_keyword(kw) + except Exception as e: + logger.warning("classify_keyword(%s) 실패: %s", kw, e) + cat = "uncategorized" + rank_score = round(max(0.0, 1.0 - (idx / total)), 4) items.append({ "keyword": kw, "category": cat, diff --git a/insta-lab/requirements.txt b/insta-lab/requirements.txt index a3e6da9..3d26add 100644 --- a/insta-lab/requirements.txt +++ b/insta-lab/requirements.txt @@ -7,4 +7,3 @@ jinja2>=3.1.4 playwright==1.48.0 pytest>=8.0 pytest-asyncio>=0.24 -pytrends>=4.9 diff --git a/insta-lab/tests/test_trend_collector.py b/insta-lab/tests/test_trend_collector.py index 465e729..22fab8f 100644 --- a/insta-lab/tests/test_trend_collector.py +++ b/insta-lab/tests/test_trend_collector.py @@ -77,16 +77,20 @@ def test_classify_keyword_with_cache(monkeypatch): assert calls["n"] == 1 -def test_fetch_google_trends_parses_and_classifies(tmp_db, monkeypatch): - class FakePyTrends: - def __init__(self, *_a, **_kw): - pass - - def trending_searches(self, pn="south_korea"): - import pandas as pd - return pd.DataFrame({"0": ["기준금리", "BTS 컴백", "스트레스 관리"]}) - - 
monkeypatch.setattr(trend_collector, "TrendReq", FakePyTrends) +def test_fetch_google_trends_parses_rss_and_classifies(tmp_db, monkeypatch): + fake_rss = """ +<rss version="2.0"> + <channel> + <title>Daily Search Trends</title> + <item><title>기준금리</title></item> + <item><title>BTS 컴백</title></item> + <item><title>스트레스 관리</title></item> + </channel> +</rss>""" + fake_resp = MagicMock() + fake_resp.text = fake_rss + fake_resp.raise_for_status.return_value = None + monkeypatch.setattr(trend_collector.requests, "get", lambda *a, **kw: fake_resp) monkeypatch.setattr(trend_collector, "classify_keyword", lambda kw: {"기준금리": "economy", "BTS 컴백": "celebrity", "스트레스 관리": "psychology"}.get(kw, "uncategorized")) @@ -108,14 +112,36 @@ def test_collect_all_invokes_both_sources(tmp_db, monkeypatch): assert out == {"naver_popular": 5, "google_trends": 3} -def test_fetch_google_trends_graceful_on_pytrends_failure(monkeypatch): - class FakePyTrends: - def __init__(self, *_a, **_kw): - pass - - def trending_searches(self, pn="south_korea"): - raise RuntimeError("rate limited") - - monkeypatch.setattr(trend_collector, "TrendReq", FakePyTrends) +def test_fetch_google_trends_graceful_on_rss_failure(monkeypatch): + fake_resp = MagicMock() + fake_resp.raise_for_status.side_effect = RuntimeError("Google returned 404") + monkeypatch.setattr(trend_collector.requests, "get", lambda *a, **kw: fake_resp) out = trend_collector.fetch_google_trends() assert out == [] + + +def test_seeds_for_filters_placeholder(tmp_db, monkeypatch): + """Placeholder and under-2-char seeds in the category_seeds template are dropped; valid ones are kept.""" + from app import db as db_module + db_module.upsert_prompt_template( + "category_seeds", + '{"economy": ["...", "…", "a", "real_keyword"]}', + "test", + ) + out = trend_collector._seeds_for("economy") + # '...', '…' and 'a' (under 2 chars) are filtered out; only 'real_keyword' remains + assert out == ["real_keyword"] + + +def test_seeds_for_falls_back_when_all_invalid(tmp_db, monkeypatch): + """Fall back to DEFAULT_CATEGORY_SEEDS when every seed is invalid.""" + from app import db as db_module + db_module.upsert_prompt_template( + "category_seeds", + '{"economy": ["...", "TBD", ""]}', + "test", + ) + 
out = trend_collector._seeds_for("economy") + # DEFAULT_CATEGORY_SEEDS["economy"] 가 반환되어야 함 + from app.config import DEFAULT_CATEGORY_SEEDS + assert out == list(DEFAULT_CATEGORY_SEEDS["economy"])