# Bound-parameter budget for a single krx_master lookup. SQLite builds older
# than 3.32 allow only 999 host parameters per statement by default, so a
# large ticker universe is queried in chunks rather than one huge IN (...).
_MAX_SQL_PARAMS = 900


def _load_company_names(
    conn: sqlite3.Connection,
    tickers: List[str],
) -> Dict[str, str]:
    """Return ``{ticker: company name}`` from krx_master for *tickers*."""
    names: Dict[str, str] = {}
    for start in range(0, len(tickers), _MAX_SQL_PARAMS):
        chunk = tickers[start:start + _MAX_SQL_PARAMS]
        placeholders = ",".join("?" * len(chunk))
        rows = conn.execute(
            f"SELECT ticker, name FROM krx_master WHERE ticker IN ({placeholders})",
            chunk,
        ).fetchall()
        # Company names shorter than two characters are left out: a
        # one-character substring match is too likely to be a false positive.
        names.update(
            (row[0], row[1]) for row in rows if row[1] and len(row[1]) >= 2
        )
    return names


def gather_articles_for_tickers(
    conn: sqlite3.Connection,
    tickers: List[str],
    asof: dt.date,
    *,
    window_days: int = 1,
    max_per_ticker: int = 5,
) -> Tuple[Dict[str, List[Dict[str, Any]]], Dict[str, int]]:
    """Map recent rows of the articles table to tickers by company name.

    An article belongs to a ticker when the ticker's ``krx_master.name``
    occurs as a substring of the article's ``title + " " + summary``.
    Articles are scanned newest first (``crawled_at DESC``), so each ticker
    keeps its ``max_per_ticker`` most recently crawled matches.

    Args:
        conn: Connection to a database holding ``krx_master`` and ``articles``.
        tickers: Tickers to collect news for. Duplicates are looked up once.
        asof: Reference date; the lower bound is ``asof - window_days``.
        window_days: Size of the look-back window in days.
        max_per_ticker: Maximum number of articles kept per ticker. A value
            of zero or less keeps none.

    Returns:
        (
            {ticker: [{"title": str, "summary": str, "press": str, "pub_date": str}, ...]},
            {"total_articles": int, "matched_pairs": int, "hit_tickers": int},
        )
        Every requested ticker is present in the first dict, with an empty
        list when nothing matched. ``total_articles`` counts all rows inside
        the window, ``matched_pairs`` the (ticker, article) pairs actually
        stored, and ``hit_tickers`` the tickers with at least one article.

    NOTE(review): the query only applies a lower bound on ``crawled_at``;
    rows crawled after ``asof`` are included as well. Confirm this is the
    intended behavior for historical (backfill) runs.
    """
    out: Dict[str, List[Dict[str, Any]]] = {t: [] for t in tickers}
    stats = {"total_articles": 0, "matched_pairs": 0, "hit_tickers": 0}

    if not tickers:
        return out, stats

    # ISO strings compare lexicographically, so a date-only cutoff works
    # against "YYYY-MM-DD..." timestamps stored as text.
    cutoff = (asof - dt.timedelta(days=window_days)).isoformat()

    # list(out) is the de-duplicated ticker list in first-seen order.
    name_map = _load_company_names(conn, list(out))

    articles = conn.execute(
        "SELECT title, summary, press, pub_date, crawled_at "
        "FROM articles WHERE crawled_at >= ? ORDER BY crawled_at DESC",
        (cutoff,),
    ).fetchall()
    stats["total_articles"] = len(articles)

    # Tickers that can still accept articles. A ticker is dropped as soon as
    # it reaches the cap so later articles skip its substring check.
    pending = dict(name_map) if max_per_ticker > 0 else {}

    for row in articles:
        if not pending:
            break  # every ticker is saturated; nothing left to match
        title = (row[0] or "").strip()
        summary = (row[1] or "").strip()
        haystack = f"{title} {summary}"

        saturated = []
        for ticker, name in pending.items():
            if name not in haystack:
                continue
            bucket = out[ticker]
            bucket.append({
                "title": title,
                "summary": summary,
                "press": row[2] or "",
                "pub_date": row[3] or "",
            })
            stats["matched_pairs"] += 1
            if len(bucket) >= max_per_ticker:
                saturated.append(ticker)
        for ticker in saturated:
            del pending[ticker]

    stats["hit_tickers"] = sum(1 for arts in out.values() if arts)
    return out, stats
# Reference date shared by every test; seeded articles default to this day.
ASOF = dt.date(2026, 5, 14)


@pytest.fixture
def conn():
    """Yield an in-memory SQLite connection with the screener schema applied."""
    db = sqlite3.connect(":memory:")
    db.row_factory = sqlite3.Row
    ensure_screener_schema(db)
    yield db
    db.close()


def _seed_master(conn, ticker, name):
    """Insert one KOSPI row into krx_master for *ticker* / *name*."""
    sql = (
        "INSERT INTO krx_master (ticker, name, market, market_cap, updated_at) "
        "VALUES (?, ?, 'KOSPI', 1000000000, datetime('now'))"
    )
    conn.execute(sql, (ticker, name))


def _seed_article(conn, title, summary="", crawled_at="2026-05-14T07:30:00"):
    """Insert one article row; the unique hash is derived from the title."""
    import hashlib
    digest = hashlib.md5(f"{title}|x".encode()).hexdigest()
    sql = (
        "INSERT INTO articles (hash, title, summary, link, press, pub_date, crawled_at) "
        "VALUES (?, ?, ?, '', '', '2026-05-14', ?)"
    )
    conn.execute(sql, (digest, title, summary, crawled_at))


def _gather(conn, tickers):
    """Call the function under test with the arguments every test uses."""
    return articles_source.gather_articles_for_tickers(
        conn, tickers, ASOF, window_days=1, max_per_ticker=5,
    )


def test_single_ticker_match_in_title(conn):
    _seed_master(conn, "005930", "삼성전자")
    _seed_article(conn, "삼성전자, HBM 양산 가시화")
    conn.commit()

    matched, stats = _gather(conn, ["005930"])

    hits = matched["005930"]
    assert len(hits) == 1
    assert hits[0]["title"] == "삼성전자, HBM 양산 가시화"
    assert stats["matched_pairs"] == 1
    assert stats["hit_tickers"] == 1


def test_single_ticker_match_in_summary(conn):
    _seed_master(conn, "005930", "삼성전자")
    _seed_article(conn, "메모리 시장 회복세", summary="삼성전자가 1분기 어닝 서프라이즈")
    conn.commit()

    matched, _ = _gather(conn, ["005930"])

    assert len(matched["005930"]) == 1


def test_multi_ticker_match(conn):
    _seed_master(conn, "005930", "삼성전자")
    _seed_master(conn, "000660", "SK하이닉스")
    _seed_article(conn, "삼성전자와 SK하이닉스, 메모리 양산 경쟁")
    conn.commit()

    matched, stats = _gather(conn, ["005930", "000660"])

    # A single article naming both companies is attached to both tickers.
    assert len(matched["005930"]) == 1
    assert len(matched["000660"]) == 1
    assert stats["matched_pairs"] == 2
    assert stats["hit_tickers"] == 2


def test_no_match_returns_empty_list(conn):
    _seed_master(conn, "005930", "삼성전자")
    _seed_article(conn, "엔비디아 실적 발표", summary="AI 칩 수요 견조")
    conn.commit()

    matched, stats = _gather(conn, ["005930"])

    assert matched["005930"] == []
    assert stats["matched_pairs"] == 0
    assert stats["hit_tickers"] == 0


def test_max_per_ticker_caps_results(conn):
    _seed_master(conn, "005930", "삼성전자")
    # Six matching articles, one more than the cap of five.
    for i in range(6):
        _seed_article(conn, f"삼성전자 뉴스 #{i}", crawled_at=f"2026-05-14T0{i}:00:00")
    conn.commit()

    matched, _ = _gather(conn, ["005930"])

    assert len(matched["005930"]) == 5


def test_window_days_filters_old_articles(conn):
    _seed_master(conn, "005930", "삼성전자")
    _seed_article(conn, "삼성전자 최신 뉴스", crawled_at="2026-05-14T07:00:00")
    _seed_article(conn, "삼성전자 오래된 뉴스", crawled_at="2026-05-01T07:00:00")
    conn.commit()

    matched, _ = _gather(conn, ["005930"])

    # Only the article crawled inside the one-day window survives.
    hits = matched["005930"]
    assert len(hits) == 1
    assert "최신" in hits[0]["title"]