From 64fbbb7958c653bbf65e24a532b3e5a928e9577a Mon Sep 17 00:00:00 2001 From: gahusb Date: Sun, 17 May 2026 11:54:31 +0900 Subject: [PATCH] =?UTF-8?q?fix(insta-lab):=20replace=20Google=20Trends=20w?= =?UTF-8?q?ith=20YouTube=20Data=20API=20(Google=20API=20=ED=8F=90=EA=B8=B0?= =?UTF-8?q?=20=EB=8C=80=EC=9D=91)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Google이 비공식 trends endpoint 두 가지(/trends/.../rss + /trends/api/dailytrends) 모두 404로 폐기 (NAS에서 직접 호출 시 확정). 대안으로 YouTube Data API v3 mostPopular(regionCode=KR, 50개)로 source 교체: - source 이름: google_trends → youtube_trending - 키워드: 영상 제목 정제 (대괄호·이모지 제거, 60자 limit) - API 키: YOUTUBE_DATA_API_KEY (agent-office와 공유, .env 그대로 활용) - 키 미설정 시 graceful skip - docker-compose insta-lab에 환경변수 추가 - 테스트 9/9 pass (기존 6 + youtube 3 신규) --- docker-compose.yml | 1 + insta-lab/app/config.py | 1 + insta-lab/app/main.py | 2 +- insta-lab/app/trend_collector.py | 97 ++++++++++++++----------- insta-lab/tests/test_main_trends.py | 2 +- insta-lab/tests/test_trend_collector.py | 79 ++++++++++---------- 6 files changed, 99 insertions(+), 83 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 3f820fa..ae035e1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -100,6 +100,7 @@ services: - ANTHROPIC_MODEL_SONNET=${ANTHROPIC_MODEL_SONNET:-claude-sonnet-4-6} - NAVER_CLIENT_ID=${NAVER_CLIENT_ID:-} - NAVER_CLIENT_SECRET=${NAVER_CLIENT_SECRET:-} + - YOUTUBE_DATA_API_KEY=${YOUTUBE_DATA_API_KEY:-} - INSTA_DATA_PATH=/app/data - CARD_TEMPLATE_DIR=/app/app/templates - CORS_ALLOW_ORIGINS=${CORS_ALLOW_ORIGINS:-http://localhost:3007,http://localhost:8080} diff --git a/insta-lab/app/config.py b/insta-lab/app/config.py index 347aae2..ee6f85f 100644 --- a/insta-lab/app/config.py +++ b/insta-lab/app/config.py @@ -2,6 +2,7 @@ import os NAVER_CLIENT_ID = os.getenv("NAVER_CLIENT_ID", "") NAVER_CLIENT_SECRET = os.getenv("NAVER_CLIENT_SECRET", "") +YOUTUBE_DATA_API_KEY = 
os.getenv("YOUTUBE_DATA_API_KEY", "") ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "") ANTHROPIC_MODEL_HAIKU = os.getenv("ANTHROPIC_MODEL_HAIKU", "claude-haiku-4-5-20251001") ANTHROPIC_MODEL_SONNET = os.getenv("ANTHROPIC_MODEL_SONNET", "claude-sonnet-4-6") diff --git a/insta-lab/app/main.py b/insta-lab/app/main.py index 195eae5..5f86fae 100644 --- a/insta-lab/app/main.py +++ b/insta-lab/app/main.py @@ -265,7 +265,7 @@ async def _bg_collect_trends(task_id: str, categories: list[str]): try: db.update_task(task_id, "processing", 10, "외부 트렌드 수집 중") result = trend_collector.collect_all(categories) - msg = f"naver:{result['naver_popular']}, google:{result['google_trends']}" + msg = f"naver:{result['naver_popular']}, youtube:{result['youtube_trending']}" db.update_task(task_id, "succeeded", 100, msg, result_id=sum(result.values())) except Exception as e: logger.exception("trends collect failed") diff --git a/insta-lab/app/trend_collector.py b/insta-lab/app/trend_collector.py index caace23..87d7415 100644 --- a/insta-lab/app/trend_collector.py +++ b/insta-lab/app/trend_collector.py @@ -1,8 +1,9 @@ -"""외부 트렌드 수집 — NAVER 인기 + Google Trends + LLM 카테고리 분류. +"""외부 트렌드 수집 — NAVER 인기 + YouTube 인기 영상 + LLM 카테고리 분류. NAVER: 카테고리별 시드 키워드로 인기 검색 → 빈도 상위 추출. -Google Trends: pytrends 4.x + daily RSS endpoint 모두 폐기/404로 깨진 상태라 -'/trends/api/dailytrends' JSON API를 직접 호출 (응답 앞 `)]}'` XSSI 접두사 자름). +YouTube: Google Trends 비공식 endpoint(RSS / dailytrends JSON)가 모두 404 폐기되어 +대체로 YouTube Data API v3 (`videos.list?chart=mostPopular&regionCode=KR`) 사용. +무료 일일 quota 10000, 한국 region 지원, 인기 영상 50개 제목에서 트렌드 추출. LLM 분류 결과는 24h in-memory 캐시. """ @@ -17,7 +18,7 @@ from anthropic import Anthropic from .config import ( NAVER_CLIENT_ID, NAVER_CLIENT_SECRET, DEFAULT_CATEGORY_SEEDS, - ANTHROPIC_API_KEY, ANTHROPIC_MODEL_HAIKU, + ANTHROPIC_API_KEY, ANTHROPIC_MODEL_HAIKU, YOUTUBE_DATA_API_KEY, ) from . 
import db from .news_collector import _clean @@ -31,10 +32,17 @@ _NAVER_HEADERS = { "X-Naver-Client-Secret": NAVER_CLIENT_SECRET, } -GOOGLE_TRENDS_DAILY_URL = ( - "https://trends.google.com/trends/api/dailytrends" - "?hl=ko&tz=-540&geo=KR&ns=15" +YOUTUBE_TRENDING_URL = "https://www.googleapis.com/youtube/v3/videos" +# YouTube 제목 정제: 대괄호·이모지·과도한 길이 제거 후 카드 주제로 적합한 키워드 형태 +_TITLE_BRACKET_RE = re.compile(r"[\[【「『\(][^\]】」』\)]{0,30}[\]】」』\)]") +_EMOJI_RE = re.compile( + r"[" + r"\U0001F300-\U0001FAFF" # symbols & pictographs, etc. + r"\U00002600-\U000027BF" # misc symbols, dingbats + r"\U0001F1E6-\U0001F1FF" # regional indicator + r"]" ) +_TITLE_MAX_LEN = 60 _PLACEHOLDER_SEEDS = {"...", "…", "tbd", "todo", "placeholder", "example"} @@ -167,49 +175,52 @@ def classify_keyword(keyword: str) -> str: return cat -# ── Google Trends ───────────────────────────────────────────────────────────── -# pytrends 4.x + daily RSS endpoint(`/trends/trendingsearches/daily/rss`) 모두 -# 폐기/404 상태라 Google Trends 비공식 JSON API `/trends/api/dailytrends`를 직접 -# 호출. 응답 앞에 `)]}'` XSSI 보호 prefix가 붙어있어 잘라낸 후 JSON 파싱. -# 응답 구조: default.trendingSearchesDays[].trendingSearches[].title.query +# ── YouTube Trending ────────────────────────────────────────────────────────── +# YouTube Data API v3 videos.list?chart=mostPopular&regionCode=KR +# 한국 인기 영상 50개 제목에서 카드 주제로 적합한 키워드 추출. -_XSSI_PREFIX_RE = re.compile(r"^[\s\)\]\}',\n]+") +def _clean_yt_title(title: str) -> str: + """[공식]·【속보】·🔥 등 제거 후 60자 이내로 자른다.""" + if not title: + return "" + cleaned = _TITLE_BRACKET_RE.sub("", title) + cleaned = _EMOJI_RE.sub("", cleaned) + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return cleaned[:_TITLE_MAX_LEN] -def fetch_google_trends() -> List[Dict[str, Any]]: - """Google Trends Daily JSON API (한국) 직접 호출. 실패 시 빈 리스트.""" +def fetch_youtube_trending() -> List[Dict[str, Any]]: + """YouTube Data API v3 mostPopular (한국, 50개). 
API 키 없거나 호출 실패 시 빈 리스트.""" + if not YOUTUBE_DATA_API_KEY: + logger.info("YOUTUBE_DATA_API_KEY 미설정 — youtube_trending skip") + return [] try: resp = requests.get( - GOOGLE_TRENDS_DAILY_URL, + YOUTUBE_TRENDING_URL, + params={ + "part": "snippet", + "chart": "mostPopular", + "regionCode": "KR", + "maxResults": 50, + "key": YOUTUBE_DATA_API_KEY, + }, timeout=15, - headers={"User-Agent": "Mozilla/5.0 (insta-lab trend collector)"}, ) resp.raise_for_status() - body = _XSSI_PREFIX_RE.sub("", resp.text, count=1) - data = json.loads(body) - days = data.get("default", {}).get("trendingSearchesDays", []) or [] - raw_titles: List[str] = [] - for day in days: - for ts in day.get("trendingSearches", []) or []: - q = (ts.get("title") or {}).get("query", "") - if isinstance(q, str): - q = q.strip() - if q: - raw_titles.append(q) - # 중복 제거 (등장 순서 유지) - seen = set() - titles: List[str] = [] - for t in raw_titles: - if t not in seen: - seen.add(t) - titles.append(t) + videos = resp.json().get("items", []) or [] except Exception as e: - logger.warning("Google Trends daily fetch failed: %s", e) + logger.warning("YouTube trending fetch failed: %s", e) return [] items: List[Dict[str, Any]] = [] - total = max(1, len(titles)) - for idx, kw in enumerate(titles): + seen = set() + total = max(1, len(videos)) + for idx, v in enumerate(videos): + title = (v.get("snippet") or {}).get("title", "") + kw = _clean_yt_title(title) + if not kw or kw in seen: + continue + seen.add(kw) try: cat = classify_keyword(kw) except Exception as e: @@ -219,15 +230,15 @@ def fetch_google_trends() -> List[Dict[str, Any]]: items.append({ "keyword": kw, "category": cat, - "source": "google_trends", + "source": "youtube_trending", "score": rank_score, "articles_count": 0, }) return items -def collect_google_trends() -> int: - items = fetch_google_trends() +def collect_youtube_trending() -> int: + items = fetch_youtube_trending() for it in items: db.add_external_trend(it) return len(items) @@ -235,5 +246,5 @@ def 
collect_google_trends() -> int: def collect_all(categories: List[str]) -> Dict[str, int]: naver_n = collect_naver_popular_for(categories) - google_n = collect_google_trends() - return {"naver_popular": naver_n, "google_trends": google_n} + yt_n = collect_youtube_trending() + return {"naver_popular": naver_n, "youtube_trending": yt_n} diff --git a/insta-lab/tests/test_main_trends.py b/insta-lab/tests/test_main_trends.py index c642881..8028cf2 100644 --- a/insta-lab/tests/test_main_trends.py +++ b/insta-lab/tests/test_main_trends.py @@ -59,7 +59,7 @@ def test_collect_trends_kicks_background(client, monkeypatch): def fake_collect_all(cats): captured["called"] = True - return {"naver_popular": 3, "google_trends": 2} + return {"naver_popular": 3, "youtube_trending": 2} monkeypatch.setattr(trend_collector, "collect_all", fake_collect_all) resp = client.post("/api/insta/trends/collect", json={}) diff --git a/insta-lab/tests/test_trend_collector.py b/insta-lab/tests/test_trend_collector.py index eae7964..81e2a51 100644 --- a/insta-lab/tests/test_trend_collector.py +++ b/insta-lab/tests/test_trend_collector.py @@ -77,57 +77,60 @@ def test_classify_keyword_with_cache(monkeypatch): assert calls["n"] == 1 -def test_fetch_google_trends_parses_json_and_classifies(tmp_db, monkeypatch): - import json as _json +def test_fetch_youtube_trending_parses_and_cleans_titles(tmp_db, monkeypatch): + """YouTube Data API mostPopular 응답 → 제목 정제 + 분류.""" + monkeypatch.setattr(trend_collector, "YOUTUBE_DATA_API_KEY", "fake_key") payload = { - "default": { - "trendingSearchesDays": [ - { - "date": "20260517", - "trendingSearches": [ - {"title": {"query": "기준금리"}}, - {"title": {"query": "BTS 컴백"}}, - {"title": {"query": "스트레스 관리"}}, - # 다음 날 데이터에 중복 키워드 — 중복 제거 확인 - {"title": {"query": "기준금리"}}, - ], - } - ] - } + "items": [ + {"snippet": {"title": "[속보] 기준금리 인상 단행 🔥"}}, + {"snippet": {"title": "(공식) BTS 컴백 무대 🎤"}}, + {"snippet": {"title": "스트레스 관리 5가지 방법"}}, + # 중복 제목 — 중복 제거 확인 + {"snippet": 
{"title": "[속보] 기준금리 인상 단행 🔥"}}, + ] } fake_resp = MagicMock() - # 실제 Google 응답 형태: `)]}',\n` XSSI prefix가 앞에 붙음 - fake_resp.text = ")]}',\n" + _json.dumps(payload, ensure_ascii=False) + fake_resp.json.return_value = payload fake_resp.raise_for_status.return_value = None monkeypatch.setattr(trend_collector.requests, "get", lambda *a, **kw: fake_resp) - monkeypatch.setattr(trend_collector, "classify_keyword", - lambda kw: {"기준금리": "economy", "BTS 컴백": "celebrity", - "스트레스 관리": "psychology"}.get(kw, "uncategorized")) + monkeypatch.setattr( + trend_collector, "classify_keyword", + lambda kw: ("economy" if "금리" in kw else + "celebrity" if "BTS" in kw else + "psychology" if "스트레스" in kw else "uncategorized"), + ) - trends = trend_collector.fetch_google_trends() - by_kw = {t["keyword"]: t for t in trends} - assert set(by_kw.keys()) == {"기준금리", "BTS 컴백", "스트레스 관리"} # 중복 제거됨 - assert by_kw["기준금리"]["category"] == "economy" - assert by_kw["BTS 컴백"]["category"] == "celebrity" - assert by_kw["스트레스 관리"]["category"] == "psychology" - assert all(t["source"] == "google_trends" for t in trends) + trends = trend_collector.fetch_youtube_trending() + keywords = [t["keyword"] for t in trends] + assert "기준금리 인상 단행" in keywords # 대괄호·이모지 제거 + assert "BTS 컴백 무대" in keywords # 괄호 제거 + assert "스트레스 관리 5가지 방법" in keywords # 그대로 + assert len(trends) == 3 # 중복 제거됨 + assert all(t["source"] == "youtube_trending" for t in trends) + + +def test_fetch_youtube_trending_no_api_key_returns_empty(monkeypatch): + monkeypatch.setattr(trend_collector, "YOUTUBE_DATA_API_KEY", "") + out = trend_collector.fetch_youtube_trending() + assert out == [] + + +def test_fetch_youtube_trending_graceful_on_api_failure(monkeypatch): + monkeypatch.setattr(trend_collector, "YOUTUBE_DATA_API_KEY", "fake_key") + fake_resp = MagicMock() + fake_resp.raise_for_status.side_effect = RuntimeError("quota exceeded") + monkeypatch.setattr(trend_collector.requests, "get", lambda *a, **kw: fake_resp) + out = 
trend_collector.fetch_youtube_trending() + assert out == [] def test_collect_all_invokes_both_sources(tmp_db, monkeypatch): monkeypatch.setattr(trend_collector, "collect_naver_popular_for", lambda cats: 5) - monkeypatch.setattr(trend_collector, "collect_google_trends", + monkeypatch.setattr(trend_collector, "collect_youtube_trending", lambda: 3) out = trend_collector.collect_all(["economy"]) - assert out == {"naver_popular": 5, "google_trends": 3} - - -def test_fetch_google_trends_graceful_on_api_failure(monkeypatch): - fake_resp = MagicMock() - fake_resp.raise_for_status.side_effect = RuntimeError("Google returned 404") - monkeypatch.setattr(trend_collector.requests, "get", lambda *a, **kw: fake_resp) - out = trend_collector.fetch_google_trends() - assert out == [] + assert out == {"naver_popular": 5, "youtube_trending": 3} def test_seeds_for_filters_placeholder(tmp_db, monkeypatch):