"""NAVER News Search API integration — daily collection via per-category seed keywords."""
import html
import logging
import re
from typing import Any, Dict, List, Optional

import requests

from .config import NAVER_CLIENT_ID, NAVER_CLIENT_SECRET, NEWS_PER_CATEGORY
from . import db

logger = logging.getLogger(__name__)

NEWS_URL = "https://openapi.naver.com/v1/search/news.json"
_HEADERS = {
    "X-Naver-Client-Id": NAVER_CLIENT_ID,
    "X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
}

# Strips HTML tags (e.g. the <b>...</b> keyword highlighting NAVER injects).
_TAG_RE = re.compile(r"<[^>]+>")


def _clean(text: str) -> str:
    """Strip HTML tags, unescape HTML entities, and trim whitespace.

    Returns "" for falsy input (None, empty string).
    """
    if not text:
        return ""
    no_tag = _TAG_RE.sub("", text)
    return html.unescape(no_tag).strip()


def search_news(keyword: str, display: int = 30, sort: str = "date") -> List[Dict[str, Any]]:
    """Single call to the NAVER news.json endpoint.

    Args:
        keyword: search query string.
        display: number of results to request; clamped to the API's valid
            1-100 range (values outside it cause a 400 from NAVER).
        sort: "date" (newest first) or "sim" (relevance).

    Returns:
        List of {title, link, summary, pub_date} dicts with HTML stripped
        from the text fields.

    Raises:
        requests.HTTPError: on a non-2xx response (via raise_for_status).
    """
    # NAVER rejects display < 1 or > 100; clamp so a large NEWS_PER_CATEGORY
    # or a short seed-keyword list cannot produce an invalid request.
    display = max(1, min(display, 100))
    resp = requests.get(
        NEWS_URL,
        headers=_HEADERS,
        params={"query": keyword, "display": display, "sort": sort},
        timeout=10,
    )
    resp.raise_for_status()
    data = resp.json()
    return [
        {
            "title": _clean(item.get("title", "")),
            # Prefer the NAVER link; fall back to the publisher's original URL.
            "link": item.get("link") or item.get("originallink", ""),
            "summary": _clean(item.get("description", "")),
            "pub_date": item.get("pubDate", ""),
        }
        for item in data.get("items", [])
    ]


def collect_for_category(category: str, seed_keywords: List[str], per_keyword: Optional[int] = None) -> int:
    """Search each seed keyword for a category and insert results into the DB.

    Duplicate links within this run are skipped in memory (`seen_links`);
    the DB's UNIQUE(link) constraint additionally guards against duplicates
    from earlier runs.

    Args:
        category: category label stored with each article.
        seed_keywords: queries to run; when `per_keyword` is None the
            per-category budget (NEWS_PER_CATEGORY) is split evenly across
            them, with a floor of 1 per keyword.
        per_keyword: explicit per-keyword result count override.

    Returns:
        Number of unique-link insert attempts made this run. (The DB may
        still reject some of these as duplicates of earlier runs.)
    """
    if per_keyword is not None:
        per_kw = per_keyword
    else:
        # Split the category budget evenly; inner max() guards an empty
        # keyword list, outer max() guarantees at least one result each.
        per_kw = max(1, NEWS_PER_CATEGORY // max(1, len(seed_keywords)))

    seen_links: set = set()
    attempted = 0
    for kw in seed_keywords:
        try:
            items = search_news(kw, display=per_kw)
        except Exception as e:
            # Best-effort collection: one failing keyword (network error,
            # API quota, bad response) must not abort the whole run.
            logger.warning("search_news failed kw=%s err=%s", kw, e)
            continue
        for item in items:
            link = item["link"]
            if not link or link in seen_links:
                continue
            seen_links.add(link)
            db.add_news_article({
                "category": category,
                "title": item["title"],
                "link": link,
                "summary": item["summary"],
                "pub_date": item["pub_date"],
            })
            attempted += 1
    return attempted