feat(agent-office): youtube_researcher — YouTube API·pytrends·Billboard 수집

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-01 12:05:58 +09:00
parent f83b900320
commit 21666f4372
2 changed files with 144 additions and 0 deletions

View File

@@ -0,0 +1,142 @@
import asyncio
import html
import logging
import os
import re
from typing import Any, Dict, List, Optional

import httpx
# --- Environment-driven configuration -------------------------------------
# API key for the YouTube Data API v3; an empty string when the variable is unset.
YOUTUBE_DATA_API_KEY = os.environ.get("YOUTUBE_DATA_API_KEY", "")
# Base URL of the music-lab service that collected trends are posted to.
MUSIC_LAB_URL = os.environ.get("MUSIC_LAB_URL", "http://music-lab:8000")

# --- Collection targets ---------------------------------------------------
# Two-letter region codes (sent as the YouTube ``regionCode`` parameter).
TARGET_COUNTRIES = ["BR", "ID", "MX", "US", "KR"]
# Search terms of interest — presumably the keyword list handed to
# fetch_google_trends by the caller (not referenced in this module's view).
TREND_KEYWORDS = [
    "lofi music",
    "phonk",
    "ambient music",
    "chill beats",
    "study music",
]
# Value sent as ``videoCategoryId`` to restrict the YouTube chart to music.
YOUTUBE_MUSIC_CAT = "10"

# Genre label -> lower-case keywords looked for in a video's tags.
# Insertion order matters: _tags_to_genre returns the first genre that matches.
GENRE_TAGS = {
    "lo-fi": ["lofi", "lo-fi", "lo fi", "chill", "study"],
    "phonk": ["phonk", "drift", "memphis"],
    "ambient": ["ambient", "relaxing", "meditation"],
    "pop": ["pop", "kpop", "k-pop"],
    "funk": ["funk", "baile funk"],
    "latin": ["latin", "reggaeton", "sertanejo"],
}
def _tags_to_genre(
    tags: list,
    genre_tags: Optional[Dict[str, List[str]]] = None,
) -> str:
    """Classify a video's tags into the first matching genre label.

    The tags are lower-cased and joined with single spaces, and each genre's
    keywords are matched as substrings of that joined text. This means a
    compound tag such as "lofihiphop" still matches the keyword "lofi", and a
    multi-word keyword such as "baile funk" can match. Genres are tried in the
    mapping's iteration order; the first genre with any matching keyword wins.

    Args:
        tags: Tag strings for one video. ``None`` or an empty list yields
            "general"; entries that are not strings are ignored.
        genre_tags: Optional mapping of genre label to keyword list. When
            omitted, the module-level ``GENRE_TAGS`` table is used.

    Returns:
        The matching genre label, or "general" when nothing matches.
    """
    table = GENRE_TAGS if genre_tags is None else genre_tags
    # Tolerate a missing tag list and stray non-string entries instead of raising.
    haystack = " ".join(t.lower() for t in (tags or ()) if isinstance(t, str))
    if not haystack:
        return "general"
    for genre, keywords in table.items():
        if any(kw in haystack for kw in keywords):
            return genre
    return "general"
async def fetch_youtube_trending(country: str, max_results: int = 50) -> List[Dict[str, Any]]:
    """Fetch a country's most-popular music videos from the YouTube Data API v3.

    Requests ``videos`` with ``chart=mostPopular`` and
    ``videoCategoryId=YOUTUBE_MUSIC_CAT`` for the given region and converts
    each returned video into a trend row. This is a best-effort collector: a
    missing API key, a non-200 response, or any request/decoding failure
    yields an empty list instead of an exception.

    Args:
        country: Region code sent as ``regionCode`` and copied into each row.
        max_results: Requested number of videos. Clamped to 1..50, the range
            the endpoint accepts for ``maxResults``; the clamped value is also
            the denominator of the rank-based score.

    Returns:
        One dict per video with keys ``source``, ``country``, ``genre``,
        ``keyword`` (title truncated to 100 characters), ``score``
        (``1 - index / limit``, rounded to 3 places), ``rank`` (1-based) and
        ``metadata`` (``video_id``, ``view_count``, ``channel``).
    """
    if not YOUTUBE_DATA_API_KEY:
        return []

    log = logging.getLogger(__name__)
    # Out-of-range values would be rejected by the API (and 0 would divide by
    # zero in the score below), so clamp instead of silently returning nothing.
    limit = max(1, min(max_results, 50))

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.get(
                "https://www.googleapis.com/youtube/v3/videos",
                params={
                    "part": "snippet,statistics",
                    "chart": "mostPopular",
                    "regionCode": country,
                    "videoCategoryId": YOUTUBE_MUSIC_CAT,
                    "maxResults": limit,
                    "key": YOUTUBE_DATA_API_KEY,
                },
            )
        if resp.status_code != 200:
            log.warning(
                "YouTube trending request for %s returned HTTP %s",
                country, resp.status_code,
            )
            return []
        items = resp.json().get("items") or []
    except Exception as exc:
        # Log only the exception type: the request URL carries the API key
        # and must not end up in log output.
        log.warning(
            "YouTube trending request for %s failed: %s",
            country, type(exc).__name__,
        )
        return []

    results = []
    for idx, item in enumerate(items):
        snippet = item.get("snippet") or {}
        stats = item.get("statistics") or {}
        # A malformed counter on one video must not abort the whole batch.
        try:
            views = int(stats.get("viewCount", 0))
        except (TypeError, ValueError):
            views = 0
        results.append({
            "source": "youtube",
            "country": country,
            "genre": _tags_to_genre(snippet.get("tags") or []),
            "keyword": (snippet.get("title") or "")[:100],
            "score": round(1.0 - idx / limit, 3),
            "rank": idx + 1,
            "metadata": {
                "video_id": item.get("id", ""),
                "view_count": views,
                "channel": snippet.get("channelTitle", ""),
            },
        })
    return results
async def fetch_google_trends(keywords: List[str], countries: List[str]) -> List[Dict[str, Any]]:
    """Collect a Google Trends interest score per keyword via pytrends.

    pytrends is synchronous, so each keyword lookup runs in the default
    executor. At most the first five keywords are queried, with a one-second
    pause between consecutive lookups. The collector is best-effort: if
    pytrends is not installed, or a lookup fails or returns no data, that
    keyword simply contributes no rows.

    NOTE(review): ``build_payload`` is called without a ``geo`` argument, so
    the score is presumably not country-specific, yet the same value is
    emitted once per entry in ``countries``. Confirm whether per-country
    queries are wanted before relying on the ``country`` field of these rows.

    Args:
        keywords: Search terms; only ``keywords[:5]`` are used.
        countries: Country codes; each successful keyword yields one row per
            country, all carrying the same score.

    Returns:
        Rows with ``source="google_trends"``, ``country``, an empty ``genre``,
        ``keyword``, ``score`` (mean interest over ``"now 7-d"`` divided by
        100, rounded to 3 places), ``rank=None`` and empty ``metadata``.
    """
    log = logging.getLogger(__name__)
    try:
        from pytrends.request import TrendReq
    except ImportError:
        log.info("pytrends is not installed; skipping Google Trends collection")
        return []

    def _sync_fetch(kw: str) -> List[Dict[str, Any]]:
        """Blocking lookup for one keyword; returns [] on any failure."""
        try:
            pt = TrendReq(hl="en-US", tz=0)
            pt.build_payload([kw], timeframe="now 7-d")
            df = pt.interest_over_time()
            if df.empty or kw not in df.columns:
                return []
            score = round(float(df[kw].mean()) / 100.0, 3)
        except Exception as exc:
            log.warning("Google Trends lookup for %r failed: %s", kw, type(exc).__name__)
            return []
        return [
            {"source": "google_trends", "country": c, "genre": "",
             "keyword": kw, "score": score, "rank": None, "metadata": {}}
            for c in countries
        ]

    # Inside a coroutine the running loop is the one to use.
    loop = asyncio.get_running_loop()
    results: List[Dict[str, Any]] = []
    for idx, kw in enumerate(keywords[:5]):
        if idx:
            # Pause only between lookups; sleeping after the last one is wasted time.
            await asyncio.sleep(1.0)
        results.extend(await loop.run_in_executor(None, _sync_fetch, kw))
    return results
async def fetch_billboard_top20() -> List[Dict[str, Any]]:
    """Scrape the first 20 titles from the Billboard Hot 100 chart page.

    Downloads the chart HTML and extracts the text of elements whose class
    attribute starts with ``c-title``. Captures are HTML-unescaped and
    stripped, blank ones are dropped, and only then are the first 20 kept, so
    ranks are contiguous. Best-effort: a non-200 response or any
    request/parsing failure yields an empty list.

    NOTE(review): the regex matches every ``c-title`` element, which may
    include headings that are not song titles — verify against the live page.

    Returns:
        Rows with ``source="billboard"``, ``country="US"``, ``genre="pop"``,
        ``keyword`` (the title), ``score`` (``1 - index / 20``, rounded to 3
        places), ``rank`` (1-based) and empty ``metadata``.
    """
    log = logging.getLogger(__name__)
    try:
        async with httpx.AsyncClient(
            timeout=10.0,
            headers={"User-Agent": "Mozilla/5.0 (compatible; music-research-bot/1.0)"},
            follow_redirects=True,
        ) as client:
            resp = await client.get("https://www.billboard.com/charts/hot-100/")
        if resp.status_code != 200:
            log.warning("Billboard chart request returned HTTP %s", resp.status_code)
            return []
        captured = re.findall(
            r'class="c-title[^"]*"[^>]*>\s*([^<\n]{3,80})\s*<', resp.text
        )
    except Exception as exc:
        log.warning("Billboard chart request failed: %s", type(exc).__name__)
        return []

    # Decode entities such as &amp; / &#039;, then drop blanks BEFORE slicing
    # and ranking so an empty capture cannot leave a gap in the ranks.
    cleaned = (html.unescape(raw).strip() for raw in captured)
    titles = [title for title in cleaned if title][:20]
    return [
        {"source": "billboard", "country": "US", "genre": "pop",
         "keyword": title, "score": round(1.0 - idx / 20, 3),
         "rank": idx + 1, "metadata": {}}
        for idx, title in enumerate(titles)
    ]
async def push_to_music_lab(trends: List[Dict[str, Any]], report_date: str) -> bool:
    """POST collected trend rows to music-lab's ``/api/music/market/ingest``.

    Args:
        trends: Trend rows to send; serialized as the ``trends`` JSON field.
        report_date: Sent unchanged as the ``report_date`` JSON field.

    Returns:
        ``True`` when the service answers with any 2xx status, ``False`` on
        any other status or when the request itself fails. Never raises for
        request errors.
    """
    log = logging.getLogger(__name__)
    # Avoid "//api/..." when MUSIC_LAB_URL is configured with a trailing slash.
    endpoint = f"{MUSIC_LAB_URL.rstrip('/')}/api/music/market/ingest"
    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.post(
                endpoint,
                json={"trends": trends, "report_date": report_date},
            )
    except Exception as exc:
        log.warning("music-lab ingest request failed: %s", type(exc).__name__)
        return False

    # Any 2xx (e.g. 201 Created, 202 Accepted) means the push was accepted.
    if 200 <= resp.status_code < 300:
        return True
    log.warning("music-lab ingest returned HTTP %s", resp.status_code)
    return False