feat(insta-lab): news_collector with NAVER news.json + dedupe
This commit is contained in:
82
insta-lab/app/news_collector.py
Normal file
82
insta-lab/app/news_collector.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""NAVER 뉴스 검색 API 연동 — 카테고리별 시드 키워드로 일일 수집."""
|
||||
|
||||
import html
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from .config import NAVER_CLIENT_ID, NAVER_CLIENT_SECRET, NEWS_PER_CATEGORY
|
||||
from . import db
|
||||
|
||||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# URL that search_news() queries (NAVER news search, JSON response).
NEWS_URL = "https://openapi.naver.com/v1/search/news.json"
# Request headers passed to requests.get() in search_news(); both values are
# imported from the package's config module.
_HEADERS = {
    "X-Naver-Client-Id": NAVER_CLIENT_ID,
    "X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
}
|
||||
_TAG_RE = re.compile(r"<[^>]+>")
|
||||
|
||||
|
||||
def _clean(text: str) -> str:
|
||||
if not text:
|
||||
return ""
|
||||
no_tag = _TAG_RE.sub("", text)
|
||||
return html.unescape(no_tag).strip()
|
||||
|
||||
|
||||
# Range of "display" values the NAVER search API documents as valid; values
# outside it make the API reject the request.
_DISPLAY_MIN, _DISPLAY_MAX = 1, 100


def search_news(keyword: str, display: int = 30, sort: str = "date") -> List[Dict[str, Any]]:
    """Run a single query against the NAVER news search endpoint.

    Args:
        keyword: Search term, sent as the ``query`` parameter.
        display: Requested number of results. Clamped into the 1-100 range
            so an oversized (or non-positive) value does not fail the call.
        sort: Sort order, passed through unchanged (``"date"`` by default).

    Returns:
        One dict per result item with the keys ``title``, ``link``,
        ``summary`` and ``pub_date``. ``title`` and ``summary`` are run
        through ``_clean``; ``link`` and ``pub_date`` are always strings
        or empty.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body cannot be decoded as JSON.
    """
    bounded_display = max(_DISPLAY_MIN, min(display, _DISPLAY_MAX))
    resp = requests.get(
        NEWS_URL,
        headers=_HEADERS,
        params={"query": keyword, "display": bounded_display, "sort": sort},
        timeout=10,
    )
    resp.raise_for_status()
    data = resp.json()
    # "items" may be missing or null; treat both as an empty result set.
    items = data.get("items") or []
    return [
        {
            "title": _clean(item.get("title", "")),
            # Fall back to "originallink" when "link" is missing or empty,
            # and never hand a None link to callers.
            "link": item.get("link") or item.get("originallink") or "",
            "summary": _clean(item.get("description", "")),
            "pub_date": item.get("pubDate") or "",
        }
        for item in items
    ]
|
||||
|
||||
|
||||
def collect_for_category(category: str,
                         seed_keywords: List[str],
                         per_keyword: Optional[int] = None) -> int:
    """Search each seed keyword and pass the results to the DB layer.

    Args:
        category: Category label stored with every article.
        seed_keywords: Keywords to query, one ``search_news`` call each.
        per_keyword: Results to request per keyword. When ``None``, the
            value is ``NEWS_PER_CATEGORY`` divided evenly across the
            keywords, with a floor of 1.

    Returns:
        The number of articles handed to ``db.add_news_article``. A link
        seen earlier in the same call is skipped and not counted; empty
        links are skipped too.

    A keyword whose search raises is logged at warning level and skipped,
    so one failing query does not abort the rest.

    NOTE(review): the original docstring says a UNIQUE(link) constraint in
    the database rejects links that are already stored, which is why the
    return value is an "attempted" count rather than an "inserted" count.
    Confirm against the db module.
    """
    if per_keyword is None:
        keyword_count = max(1, len(seed_keywords))
        quota = max(1, NEWS_PER_CATEGORY // keyword_count)
    else:
        quota = per_keyword

    # Links already forwarded during this call; its size is the return value.
    handled_links = set()
    for keyword in seed_keywords:
        try:
            results = search_news(keyword, display=quota)
        except Exception as exc:
            logger.warning("search_news failed kw=%s err=%s", keyword, exc)
            continue
        for article in results:
            url = article["link"]
            if url and url not in handled_links:
                handled_links.add(url)
                db.add_news_article({
                    "category": category,
                    "title": article["title"],
                    "link": url,
                    "summary": article["summary"],
                    "pub_date": article["pub_date"],
                })
    return len(handled_links)
|
||||
Reference in New Issue
Block a user