feat(ai_news): articles_source module (substring ticker matching)
This commit is contained in:
70
stock-lab/app/screener/ai_news/articles_source.py
Normal file
70
stock-lab/app/screener/ai_news/articles_source.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
"""기존 articles 테이블에서 종목별 뉴스 매핑."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import datetime as dt
|
||||||
|
import logging
|
||||||
|
import sqlite3
|
||||||
|
from typing import Any, Dict, List, Tuple
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def gather_articles_for_tickers(
    conn: sqlite3.Connection,
    tickers: List[str],
    asof: dt.date,
    *,
    window_days: int = 1,
    max_per_ticker: int = 5,
) -> Tuple[Dict[str, List[Dict[str, Any]]], Dict[str, int]]:
    """Collect recent articles per ticker by company-name substring matching.

    Company names are read from ``krx_master`` and searched for with a plain,
    case-sensitive ``in`` test inside each article's ``title + " " + summary``.

    Args:
        conn: SQLite connection exposing ``krx_master(ticker, name)`` and
            ``articles(title, summary, press, pub_date, crawled_at)``.
        tickers: Tickers to collect news for. Every ticker gets a key in the
            result, even when nothing matched or it is unknown to
            ``krx_master``.
        asof: Reference date of the look-back window.
        window_days: Articles whose ``crawled_at`` is greater than or equal
            to the ISO-formatted date ``asof - window_days`` are scanned.
            NOTE: no upper bound is applied, so rows crawled after ``asof``
            are scanned as well.
        max_per_ticker: Maximum number of articles kept per ticker. Articles
            are visited newest ``crawled_at`` first, so the newest ones win.

    Returns:
        (
            {ticker: [{"title": str, "summary": str, "press": str, "pub_date": str}, ...]},
            {"total_articles": int, "matched_pairs": int, "hit_tickers": int},
        )
    """
    out: Dict[str, List[Dict[str, Any]]] = {t: [] for t in tickers}
    stats = {"total_articles": 0, "matched_pairs": 0, "hit_tickers": 0}
    if not tickers:
        return out, stats

    cutoff = (asof - dt.timedelta(days=window_days)).isoformat()

    # Resolve ticker -> company name in bounded batches. SQLite limits the
    # number of host parameters per statement (999 on older builds), so one
    # giant ``IN (...)`` list can fail for a large ticker universe.
    batch_size = 500
    unique_tickers = list(out)  # de-duplicated, original order preserved
    name_map: Dict[str, str] = {}
    for start in range(0, len(unique_tickers), batch_size):
        batch = unique_tickers[start:start + batch_size]
        placeholders = ",".join("?" * len(batch))
        name_rows = conn.execute(
            f"SELECT ticker, name FROM krx_master WHERE ticker IN ({placeholders})",
            batch,
        ).fetchall()
        for r in name_rows:
            # Company names shorter than two characters are skipped because
            # they produce too many false-positive substring hits.
            if r[1] and len(r[1]) >= 2:
                name_map[r[0]] = r[1]

    articles = conn.execute(
        "SELECT title, summary, press, pub_date, crawled_at "
        "FROM articles WHERE crawled_at >= ? ORDER BY crawled_at DESC",
        (cutoff,),
    ).fetchall()
    stats["total_articles"] = len(articles)

    # Tickers that can still accept articles. A ticker is dropped as soon as
    # it reaches the cap, and the scan stops once nothing is left to fill.
    pending: Dict[str, str] = dict(name_map) if max_per_ticker > 0 else {}

    for a in articles:
        if not pending:
            break
        title = (a[0] or "").strip()
        summary = (a[1] or "").strip()
        haystack = title + " " + summary
        # Materialize the hits first so ``pending`` can be mutated safely.
        hits = [ticker for ticker, name in pending.items() if name in haystack]
        for ticker in hits:
            bucket = out[ticker]
            bucket.append({
                "title": title,
                "summary": summary,
                "press": a[2] or "",
                "pub_date": a[3] or "",
            })
            stats["matched_pairs"] += 1
            if len(bucket) >= max_per_ticker:
                del pending[ticker]

    stats["hit_tickers"] = sum(1 for arts in out.values() if arts)
    return out, stats
|
||||||
@@ -115,6 +115,22 @@ CREATE TABLE IF NOT EXISTS screener_results (
|
|||||||
);
|
);
|
||||||
CREATE INDEX IF NOT EXISTS idx_results_run_rank ON screener_results(run_id, rank);
|
CREATE INDEX IF NOT EXISTS idx_results_run_rank ON screener_results(run_id, rank);
|
||||||
|
|
||||||
|
-- articles table (raw domestic/overseas news articles).
-- The main app.db.init_db() also creates it, but it is guaranteed here
-- idempotently so that test environments and standalone screener contexts
-- (e.g. ai_news.articles_source) can reference it as well.
CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    hash TEXT UNIQUE NOT NULL,
    category TEXT DEFAULT 'domestic',
    title TEXT NOT NULL,
    link TEXT,
    summary TEXT,
    press TEXT,
    pub_date TEXT,
    crawled_at TEXT
);
CREATE INDEX IF NOT EXISTS idx_articles_crawled ON articles(crawled_at DESC);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS news_sentiment (
|
CREATE TABLE IF NOT EXISTS news_sentiment (
|
||||||
ticker TEXT NOT NULL,
|
ticker TEXT NOT NULL,
|
||||||
date TEXT NOT NULL,
|
date TEXT NOT NULL,
|
||||||
|
|||||||
108
stock-lab/tests/test_ai_news_articles_source.py
Normal file
108
stock-lab/tests/test_ai_news_articles_source.py
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
import datetime as dt
|
||||||
|
import sqlite3
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.screener.ai_news import articles_source
|
||||||
|
from app.screener.schema import ensure_screener_schema
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def conn():
    """Yield an in-memory SQLite connection with the screener schema applied.

    Rows come back as ``sqlite3.Row``; the connection is closed on teardown.
    """
    db = sqlite3.connect(":memory:")
    db.row_factory = sqlite3.Row
    ensure_screener_schema(db)
    yield db
    db.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _seed_master(conn, ticker, name):
    """Insert one ``krx_master`` row for *ticker* / *name*.

    Market is fixed to KOSPI, market cap to 1,000,000,000, and ``updated_at``
    to the database's current time. The caller is responsible for committing.
    """
    insert_sql = (
        "INSERT INTO krx_master (ticker, name, market, market_cap, updated_at) "
        "VALUES (?, ?, 'KOSPI', 1000000000, datetime('now'))"
    )
    params = (ticker, name)
    conn.execute(insert_sql, params)
|
||||||
|
|
||||||
|
|
||||||
|
def _seed_article(conn, title, summary="", crawled_at="2026-05-14T07:30:00"):
    """Insert one ``articles`` row for use in a test.

    The unique ``hash`` column is the MD5 hex digest of ``"<title>|x"``, so
    two seeded articles must not share a title. ``link`` and ``press`` are
    empty strings and ``pub_date`` is fixed to 2026-05-14. The caller is
    responsible for committing.
    """
    from hashlib import md5

    digest = md5(f"{title}|x".encode()).hexdigest()
    row = (digest, title, summary, crawled_at)
    conn.execute(
        "INSERT INTO articles (hash, title, summary, link, press, pub_date, crawled_at) "
        "VALUES (?, ?, ?, '', '', '2026-05-14', ?)",
        row,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Reference "as of" date passed to every gather_articles_for_tickers call in
# this module; _seed_article's default crawled_at falls on this same day.
ASOF = dt.date(2026, 5, 14)
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_ticker_match_in_title(conn):
    """A company name appearing in the title maps the article to its ticker."""
    headline = "삼성전자, HBM 양산 가시화"
    _seed_master(conn, "005930", "삼성전자")
    _seed_article(conn, headline)
    conn.commit()

    matched, counters = articles_source.gather_articles_for_tickers(
        conn, ["005930"], ASOF, window_days=1, max_per_ticker=5,
    )

    samsung_news = matched["005930"]
    assert len(samsung_news) == 1
    assert samsung_news[0]["title"] == headline
    assert counters["matched_pairs"] == 1
    assert counters["hit_tickers"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_ticker_match_in_summary(conn):
    """A company name found only in the summary still counts as a match."""
    _seed_master(conn, "005930", "삼성전자")
    _seed_article(
        conn,
        "메모리 시장 회복세",
        summary="삼성전자가 1분기 어닝 서프라이즈",
    )
    conn.commit()

    result = articles_source.gather_articles_for_tickers(
        conn, ["005930"], ASOF, window_days=1, max_per_ticker=5,
    )
    matched = result[0]

    assert len(matched["005930"]) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_multi_ticker_match(conn):
    """One article naming two companies is attributed to both tickers."""
    companies = (("005930", "삼성전자"), ("000660", "SK하이닉스"))
    for code, company in companies:
        _seed_master(conn, code, company)
    _seed_article(conn, "삼성전자와 SK하이닉스, 메모리 양산 경쟁")
    conn.commit()

    matched, counters = articles_source.gather_articles_for_tickers(
        conn, ["005930", "000660"], ASOF, window_days=1, max_per_ticker=5,
    )

    assert len(matched["005930"]) == 1
    assert len(matched["000660"]) == 1
    assert counters["matched_pairs"] == 2
    assert counters["hit_tickers"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_match_returns_empty_list(conn):
    """A ticker whose name appears in no article gets an empty list."""
    _seed_master(conn, "005930", "삼성전자")
    _seed_article(conn, "엔비디아 실적 발표", summary="AI 칩 수요 견조")
    conn.commit()

    matched, counters = articles_source.gather_articles_for_tickers(
        conn, ["005930"], ASOF, window_days=1, max_per_ticker=5,
    )

    assert matched["005930"] == []
    assert counters["matched_pairs"] == 0
    assert counters["hit_tickers"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_max_per_ticker_caps_results(conn):
    """Six matching articles are trimmed down to ``max_per_ticker`` (5)."""
    _seed_master(conn, "005930", "삼성전자")
    for hour in range(6):
        _seed_article(
            conn,
            f"삼성전자 뉴스 #{hour}",
            crawled_at=f"2026-05-14T0{hour}:00:00",
        )
    conn.commit()

    result = articles_source.gather_articles_for_tickers(
        conn, ["005930"], ASOF, window_days=1, max_per_ticker=5,
    )
    matched = result[0]

    assert len(matched["005930"]) == 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_window_days_filters_old_articles(conn):
    """Articles crawled before the look-back window are ignored."""
    _seed_master(conn, "005930", "삼성전자")
    seeded = (
        ("삼성전자 최신 뉴스", "2026-05-14T07:00:00"),
        ("삼성전자 오래된 뉴스", "2026-05-01T07:00:00"),
    )
    for headline, crawl_time in seeded:
        _seed_article(conn, headline, crawled_at=crawl_time)
    conn.commit()

    result = articles_source.gather_articles_for_tickers(
        conn, ["005930"], ASOF, window_days=1, max_per_ticker=5,
    )
    kept = result[0]["005930"]

    assert len(kept) == 1
    assert "최신" in kept[0]["title"]
|
||||||
Reference in New Issue
Block a user