diff --git a/insta-lab/app/news_collector.py b/insta-lab/app/news_collector.py
new file mode 100644
--- /dev/null
+++ b/insta-lab/app/news_collector.py
@@ -0,0 +1,118 @@
+"""NAVER news search API integration: daily collection from per-category seed keywords."""
+
+import html
+import logging
+import re
+from typing import Any, Dict, List, Optional
+
+import requests
+
+from .config import NAVER_CLIENT_ID, NAVER_CLIENT_SECRET, NEWS_PER_CATEGORY
+from . import db
+
+logger = logging.getLogger(__name__)
+
+NEWS_URL = "https://openapi.naver.com/v1/search/news.json"
+_HEADERS = {
+    "X-Naver-Client-Id": NAVER_CLIENT_ID,
+    "X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
+}
+_TAG_RE = re.compile(r"<[^>]+>")
+# The NAVER search API only accepts `display` values between 1 and 100.
+_MIN_DISPLAY = 1
+_MAX_DISPLAY = 100
+
+
+def _clean(text: str) -> str:
+    """Strip markup tags, decode HTML entities, and trim surrounding whitespace.
+
+    Falsy input (``None`` or an empty string) yields ``""``.
+    """
+    if not text:
+        return ""
+    no_tag = _TAG_RE.sub("", text)
+    return html.unescape(no_tag).strip()
+
+
+def search_news(keyword: str, display: int = 30, sort: str = "date") -> List[Dict[str, Any]]:
+    """Run a single NAVER ``news.json`` search.
+
+    Args:
+        keyword: Search query.
+        display: Number of results to request; clamped to the API's 1-100 range.
+        sort: Sort order passed through to the API.
+
+    Returns:
+        A list of dicts with the keys ``title``, ``link``, ``summary`` and ``pub_date``.
+
+    Raises:
+        requests.HTTPError: If the API responds with an error status.
+    """
+    # Clamp rather than let an out-of-range value fail the whole request.
+    display = max(_MIN_DISPLAY, min(display, _MAX_DISPLAY))
+    resp = requests.get(
+        NEWS_URL,
+        headers=_HEADERS,
+        params={"query": keyword, "display": display, "sort": sort},
+        timeout=10,
+    )
+    resp.raise_for_status()
+    data = resp.json()
+    return [
+        {
+            "title": _clean(item.get("title", "")),
+            # Use `link`; fall back to `originallink` when it is missing or empty.
+            "link": item.get("link") or item.get("originallink", ""),
+            "summary": _clean(item.get("description", "")),
+            "pub_date": item.get("pubDate", ""),
+        }
+        # `or []` also covers a payload whose "items" value is null.
+        for item in data.get("items") or []
+    ]
+
+
+def collect_for_category(category: str,
+                         seed_keywords: List[str],
+                         per_keyword: Optional[int] = None) -> int:
+    """Search each seed keyword of a category and insert the results into the DB.
+
+    Links already handled during this call are skipped. Links that already exist in
+    the DB are left to its UNIQUE(link) constraint, so the return value counts
+    insert attempts rather than rows actually stored.
+
+    Args:
+        category: Category stored with every inserted article.
+        seed_keywords: Keywords to search, one request each.
+        per_keyword: Results to request per keyword. When ``None``,
+            ``NEWS_PER_CATEGORY`` is split evenly across the keywords (minimum 1).
+
+    Returns:
+        The number of articles passed to ``db.add_news_article``.
+    """
+    if per_keyword is not None:
+        per_kw = per_keyword
+    else:
+        per_kw = max(1, NEWS_PER_CATEGORY // max(1, len(seed_keywords)))
+    seen_links = set()
+    attempted = 0
+    for kw in seed_keywords:
+        try:
+            items = search_news(kw, display=per_kw)
+        except Exception as e:
+            # Best effort: one failing keyword must not abort the remaining ones.
+            logger.warning("search_news failed kw=%s err=%s", kw, e)
+            continue
+        for item in items:
+            link = item["link"]
+            if not link or link in seen_links:
+                continue
+            seen_links.add(link)
+            db.add_news_article({
+                "category": category,
+                "title": item["title"],
+                "link": link,
+                "summary": item["summary"],
+                "pub_date": item["pub_date"],
+            })
+            attempted += 1
+    return attempted
diff --git a/insta-lab/tests/test_news_collector.py b/insta-lab/tests/test_news_collector.py
new file mode 100644
--- /dev/null
+++ b/insta-lab/tests/test_news_collector.py
@@ -0,0 +1,95 @@
+import gc
+import os
+import tempfile
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from app import db as db_module
+from app import news_collector
+
+
+@pytest.fixture
+def tmp_db(monkeypatch):
+    """Point ``db_module.DB_PATH`` at a fresh temp file, init the DB, then remove it."""
+    fd, path = tempfile.mkstemp(suffix=".db")
+    os.close(fd)
+    monkeypatch.setattr(db_module, "DB_PATH", path)
+    db_module.init_db()
+    yield path
+    # Close all SQLite WAL files before removal (needed on Windows)
+    gc.collect()
+    for ext in ("", "-wal", "-shm"):
+        try:
+            os.remove(path + ext)
+        except FileNotFoundError:
+            pass
+
+
+SAMPLE_RESPONSE = {
+    "items": [
+        {
+            "title": "금리 인상 단행",
+            "originallink": "https://news.example.com/1",
+            "link": "https://n.news.naver.com/article/1",
+            "description": "한국은행이 기준금리를 25bp 올렸다.",
+            "pubDate": "Fri, 15 May 2026 08:00:00 +0900",
+        },
+        {
+            "title": "환율 급등",
+            "originallink": "https://news.example.com/2",
+            "link": "https://n.news.naver.com/article/2",
+            "description": "원달러 환율이 1400원을 돌파했다.",
+            "pubDate": "Fri, 15 May 2026 09:00:00 +0900",
+        },
+    ],
+}
+
+
+def _fake_response(payload):
+    """Return a mock response whose ``json()`` yields *payload* and never raises."""
+    resp = MagicMock()
+    resp.json.return_value = payload
+    resp.raise_for_status.return_value = None
+    return resp
+
+
+def test_strip_html_and_decode_entities():
+    # The input must really contain tags and entities for this test to mean anything.
+    out = news_collector._clean('<b>&quot;테스트&quot;</b> &amp; 아이템 ')
+    assert out == '"테스트" & 아이템'
+
+
+def test_search_news_parses_items(tmp_db):
+    fake_resp = _fake_response(SAMPLE_RESPONSE)
+    with patch.object(news_collector.requests, "get", return_value=fake_resp):
+        items = news_collector.search_news("금리", display=10)
+    assert len(items) == 2
+    assert items[0]["title"] == "금리 인상 단행"
+    assert items[0]["summary"].startswith("한국은행")
+
+
+def test_collect_for_category_inserts(tmp_db):
+    fake_resp = _fake_response(SAMPLE_RESPONSE)
+    with patch.object(news_collector.requests, "get", return_value=fake_resp):
+        attempted = news_collector.collect_for_category(
+            "economy", seed_keywords=["금리"], per_keyword=10)
+    assert attempted == 2
+    rows = db_module.list_news_articles(category="economy", days=7)
+    assert {r["link"] for r in rows} == {
+        "https://n.news.naver.com/article/1",
+        "https://n.news.naver.com/article/2",
+    }
+
+
+def test_collect_dedupes_existing(tmp_db):
+    db_module.add_news_article({
+        "category": "economy", "title": "기존",
+        "link": "https://n.news.naver.com/article/1", "summary": ""
+    })
+    fake_resp = _fake_response(SAMPLE_RESPONSE)
+    with patch.object(news_collector.requests, "get", return_value=fake_resp):
+        news_collector.collect_for_category("economy", seed_keywords=["금리"])
+    rows = db_module.list_news_articles(category="economy", days=7)
+    # 1 pre-existing + 1 newly added (the other link); UNIQUE link blocks duplicate insert
+    assert len(rows) == 2