feat(agent-office): youtube_researcher — YouTube API·pytrends·Billboard 수집
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
142
agent-office/app/youtube_researcher.py
Normal file
142
agent-office/app/youtube_researcher.py
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
import asyncio
import html
import logging
import os
import re
from typing import List, Dict, Any

import httpx
|
||||||
|
|
||||||
|
# --- Environment-driven settings -------------------------------------------
# YouTube Data API key; the empty-string default makes
# fetch_youtube_trending return [] without issuing a request.
YOUTUBE_DATA_API_KEY = os.environ.get("YOUTUBE_DATA_API_KEY", "")
# Base URL of the music-lab service that push_to_music_lab posts to.
MUSIC_LAB_URL = os.environ.get("MUSIC_LAB_URL", "http://music-lab:8000")

# --- Collection targets -----------------------------------------------------
# Two-letter country codes; presumably supplied to the fetch_* functions by
# the caller (they are not referenced elsewhere in this module).
TARGET_COUNTRIES = [
    "BR",
    "ID",
    "MX",
    "US",
    "KR",
]
# Search phrases; presumably the default keyword set for fetch_google_trends.
TREND_KEYWORDS = [
    "lofi music",
    "phonk",
    "ambient music",
    "chill beats",
    "study music",
]
# Value sent as videoCategoryId when requesting the music chart from YouTube.
YOUTUBE_MUSIC_CAT = "10"
|
||||||
|
|
||||||
|
# Genre label -> lowercase substrings that identify the genre in video tags.
# Entry order is significant: _tags_to_genre returns the first genre whose
# keyword list matches, so earlier entries take precedence over later ones.
GENRE_TAGS: Dict[str, List[str]] = {
    "lo-fi": [
        "lofi",
        "lo-fi",
        "lo fi",
        "chill",
        "study",
    ],
    "phonk": [
        "phonk",
        "drift",
        "memphis",
    ],
    "ambient": [
        "ambient",
        "relaxing",
        "meditation",
    ],
    "pop": [
        "pop",
        "kpop",
        "k-pop",
    ],
    "funk": [
        "funk",
        "baile funk",
    ],
    "latin": [
        "latin",
        "reggaeton",
        "sertanejo",
    ],
}
|
||||||
|
|
||||||
|
|
||||||
|
def _tags_to_genre(tags: list) -> str:
    """Classify a list of tag strings into one of the GENRE_TAGS labels.

    The tags are lower-cased and joined with single spaces into one string;
    each genre's keywords are then tested as plain substrings of that string.
    Genres are tried in GENRE_TAGS iteration order and the first one with any
    matching keyword wins. Returns "general" when nothing matches (including
    for an empty tag list).
    """
    haystack = " ".join([tag.lower() for tag in tags])
    # Lazily yield matching genres so that only the first hit is evaluated.
    candidates = (
        label
        for label, needles in GENRE_TAGS.items()
        if any(needle in haystack for needle in needles)
    )
    return next(candidates, "general")
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_youtube_trending(country: str, max_results: int = 50) -> List[Dict[str, Any]]:
    """Fetch the most-popular music videos for one country from YouTube Data API v3.

    Queries ``videos.list`` with ``chart=mostPopular`` restricted to
    ``videoCategoryId=YOUTUBE_MUSIC_CAT``.

    Args:
        country: Value sent as ``regionCode`` and copied into every row.
        max_results: Desired number of videos. Clamped to 1..50, the range
            ``videos.list`` accepts for ``maxResults``.

    Returns:
        One dict per video with keys ``source``, ``country``, ``genre``,
        ``keyword`` (title truncated to 100 chars), ``score`` (1.0 for the top
        entry, decreasing linearly with chart position), ``rank`` (1-based
        chart position) and ``metadata`` (``video_id``, ``view_count``,
        ``channel``). Returns ``[]`` when no API key is configured, on a
        non-200 response, or on any request/decoding failure. This is a
        best-effort collector and never raises for upstream problems.
    """
    if not YOUTUBE_DATA_API_KEY:
        return []

    log = logging.getLogger(__name__)
    # Clamp to the API's accepted range; this also guarantees a non-zero
    # denominator for the score computed below.
    limit = max(1, min(max_results, 50))

    async with httpx.AsyncClient(timeout=10.0) as client:
        try:
            resp = await client.get(
                "https://www.googleapis.com/youtube/v3/videos",
                params={
                    "part": "snippet,statistics",
                    "chart": "mostPopular",
                    "regionCode": country,
                    "videoCategoryId": YOUTUBE_MUSIC_CAT,
                    "maxResults": limit,
                    "key": YOUTUBE_DATA_API_KEY,
                },
            )
            if resp.status_code != 200:
                log.warning(
                    "YouTube trending request for %s returned HTTP %s",
                    country, resp.status_code,
                )
                return []
            # `or []` also covers an explicit null "items" value.
            items = resp.json().get("items") or []
        except Exception:
            # Best-effort: report the failure but keep the caller running.
            log.warning("YouTube trending request for %s failed", country, exc_info=True)
            return []

    results: List[Dict[str, Any]] = []
    for position, item in enumerate(items):
        video_id = item.get("id")
        if not video_id:
            # Skip a malformed entry instead of aborting the whole collection;
            # `position` still reflects the entry's place in the chart.
            continue
        snippet = item.get("snippet") or {}
        stats = item.get("statistics") or {}
        try:
            view_count = int(stats.get("viewCount") or 0)
        except (TypeError, ValueError):
            view_count = 0
        results.append({
            "source": "youtube",
            "country": country,
            "genre": _tags_to_genre(snippet.get("tags") or []),
            "keyword": (snippet.get("title") or "")[:100],
            "score": round(1.0 - position / limit, 3),
            "rank": position + 1,
            "metadata": {
                "video_id": video_id,
                "view_count": view_count,
                "channel": snippet.get("channelTitle", ""),
            },
        })
    return results
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_google_trends(keywords: List[str], countries: List[str]) -> List[Dict[str, Any]]:
    """Collect Google Trends interest per keyword via pytrends.

    pytrends is synchronous, so each lookup runs in the default executor.
    Only the first five keywords are queried, with a one-second pause between
    consecutive requests.

    NOTE(review): ``build_payload`` is called without a ``geo`` argument, so
    each keyword yields a single score (presumably worldwide) that is copied
    verbatim to every entry of ``countries``. Confirm whether per-country
    interest is actually wanted.

    Args:
        keywords: Search terms; only ``keywords[:5]`` are used.
        countries: Country codes used to label the output rows.

    Returns:
        One dict per (keyword, country) pair with keys ``source``, ``country``,
        ``genre`` (always ``""``), ``keyword``, ``score`` (mean 7-day interest
        divided by 100), ``rank`` (always ``None``) and ``metadata`` (empty).
        Returns ``[]`` when either input is empty or pytrends is not
        installed; a keyword whose lookup fails contributes no rows.
    """
    # No keyword to query or no country to label rows with: the result can
    # only be empty, so skip the import and every network call.
    if not keywords or not countries:
        return []

    try:
        from pytrends.request import TrendReq
    except ImportError:
        return []

    def _sync_fetch(kw: str) -> List[Dict[str, Any]]:
        """Blocking lookup of one keyword; returns its rows or [] on failure."""
        try:
            pt = TrendReq(hl="en-US", tz=0)
            pt.build_payload([kw], timeframe="now 7-d")
            df = pt.interest_over_time()
            if df.empty or kw not in df.columns:
                return []
            score = round(float(df[kw].mean()) / 100.0, 3)
            return [
                {"source": "google_trends", "country": c, "genre": "",
                 "keyword": kw, "score": score, "rank": None, "metadata": {}}
                for c in countries
            ]
        except Exception:
            # Best-effort: one failing keyword must not abort the others.
            logging.getLogger(__name__).warning(
                "Google Trends lookup for %r failed", kw, exc_info=True
            )
            return []

    # We are inside a coroutine, so a running loop is guaranteed to exist.
    loop = asyncio.get_running_loop()
    results: List[Dict[str, Any]] = []
    for index, kw in enumerate(keywords[:5]):
        if index:
            # Throttle between requests only; no pointless wait after the last.
            await asyncio.sleep(1.0)
        results.extend(await loop.run_in_executor(None, _sync_fetch, kw))
    return results
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_billboard_top20() -> List[Dict[str, Any]]:
    """Scrape the Billboard Hot 100 page and return up to the first 20 titles.

    Titles are taken from the text of elements whose ``class`` attribute
    starts with ``c-title``. NOTE(review): this assumes the first such
    elements on the page are the chart entries in rank order; verify against
    the live markup, which can change without notice.

    Returns:
        Up to 20 dicts with keys ``source`` ("billboard"), ``country`` ("US"),
        ``genre`` ("pop"), ``keyword`` (the title), ``score`` (1.0 for the
        first title, decreasing by 0.05 per position), ``rank`` (1-based,
        contiguous) and ``metadata`` (empty). Returns ``[]`` on a non-200
        response or any request failure.
    """
    log = logging.getLogger(__name__)
    async with httpx.AsyncClient(
        timeout=10.0,
        headers={"User-Agent": "Mozilla/5.0 (compatible; music-research-bot/1.0)"},
        follow_redirects=True,
    ) as client:
        try:
            resp = await client.get("https://www.billboard.com/charts/hot-100/")
            if resp.status_code != 200:
                log.warning("Billboard request returned HTTP %s", resp.status_code)
                return []
            page = resp.text
        except Exception:
            # Best-effort: report the failure but keep the caller running.
            log.warning("Billboard request failed", exc_info=True)
            return []

    captured = re.findall(
        r'class="c-title[^"]*"[^>]*>\s*([^<\n]{3,80})\s*<', page
    )
    # Decode HTML entities (e.g. "&amp;") and drop whitespace-only captures
    # BEFORE slicing and numbering, so blanks neither use up one of the 20
    # slots nor leave gaps in the rank sequence.
    cleaned = (html.unescape(raw).strip() for raw in captured)
    titles = [title for title in cleaned if title][:20]
    return [
        {"source": "billboard", "country": "US", "genre": "pop",
         "keyword": title, "score": round(1.0 - i / 20, 3),
         "rank": i + 1, "metadata": {}}
        for i, title in enumerate(titles)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
async def push_to_music_lab(trends: List[Dict[str, Any]], report_date: str) -> bool:
    """Push collected trends to music-lab's /api/music/market/ingest endpoint.

    Sends ``{"trends": trends, "report_date": report_date}`` as a JSON POST to
    ``MUSIC_LAB_URL`` with a 15-second timeout.

    Returns:
        True only when the service answers with HTTP 200; False for any other
        status code or when sending the request raises.
    """
    async with httpx.AsyncClient(timeout=15.0) as session:
        try:
            endpoint = f"{MUSIC_LAB_URL}/api/music/market/ingest"
            payload = {"trends": trends, "report_date": report_date}
            reply = await session.post(endpoint, json=payload)
        except Exception:
            return False
        else:
            delivered = reply.status_code == 200
            return delivered
|
||||||
@@ -3,3 +3,5 @@ uvicorn[standard]==0.30.6
|
|||||||
apscheduler==3.10.4
|
apscheduler==3.10.4
|
||||||
websockets>=12.0
|
websockets>=12.0
|
||||||
httpx>=0.27
|
httpx>=0.27
|
||||||
|
google-api-python-client>=2.100.0
|
||||||
|
pytrends>=4.9.2
|
||||||
|
|||||||
Reference in New Issue
Block a user