71 lines
2.2 KiB
Python
71 lines
2.2 KiB
Python
"""기존 articles 테이블에서 종목별 뉴스 매핑."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import datetime as dt
|
|
import logging
|
|
import sqlite3
|
|
from typing import Any, Dict, List, Tuple
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
def gather_articles_for_tickers(
    conn: sqlite3.Connection,
    tickers: List[str],
    asof: dt.date,
    *,
    window_days: int = 1,
    max_per_ticker: int = 5,
) -> Tuple[Dict[str, List[Dict[str, Any]]], Dict[str, int]]:
    """Return per-ticker news found by company-name substring matching.

    Company names are read from ``krx_master``; an article from ``articles``
    is attached to a ticker when that ticker's name occurs in the article's
    ``title + " " + summary`` text. Articles are scanned newest-first
    (``crawled_at DESC``), so each ticker keeps its most recently crawled
    matches.

    Args:
        conn: Open SQLite connection exposing ``krx_master(ticker, name)`` and
            ``articles(title, summary, press, pub_date, crawled_at)``.
        tickers: Tickers to collect news for. Every ticker appears as a key in
            the result, even when nothing matched.
        asof: Reference date; the window starts ``window_days`` before it.
        window_days: Number of days before ``asof`` used as the lower bound on
            ``crawled_at``.
        max_per_ticker: Maximum number of articles kept per ticker.

    Returns:
        A ``(news, stats)`` tuple::

            (
                {ticker: [{"title": str, "summary": str,
                           "press": str, "pub_date": str}, ...]},
                {"total_articles": int, "matched_pairs": int,
                 "hit_tickers": int},
            )

        ``total_articles`` is the number of articles inside the window,
        ``matched_pairs`` counts the (ticker, article) pairs actually stored
        (i.e. after the per-ticker cap), and ``hit_tickers`` is the number of
        tickers with at least one stored article.
    """
    out: Dict[str, List[Dict[str, Any]]] = {t: [] for t in tickers}
    stats = {"total_articles": 0, "matched_pairs": 0, "hit_tickers": 0}

    if not tickers:
        return out, stats

    # The cutoff is compared against crawled_at as text.
    # NOTE(review): assumes crawled_at is stored as ISO-8601 text - confirm.
    cutoff = (asof - dt.timedelta(days=window_days)).isoformat()

    # `out` already holds the de-duplicated tickers in caller order.
    name_map = _fetch_ticker_names(conn, list(out))

    # NOTE(review): there is no upper bound on crawled_at, so articles crawled
    # after `asof` are included as well - confirm this is intended.
    articles = conn.execute(
        "SELECT title, summary, press, pub_date, crawled_at "
        "FROM articles WHERE crawled_at >= ? ORDER BY crawled_at DESC",
        (cutoff,),
    ).fetchall()
    stats["total_articles"] = len(articles)

    # Tickers that can still accept articles. A ticker is dropped as soon as
    # it reaches the cap so later articles are not substring-tested against
    # it, and the scan stops once no ticker is left.
    pending: Dict[str, str] = dict(name_map) if max_per_ticker > 0 else {}

    for row in articles:
        if not pending:
            break
        title = (row[0] or "").strip()
        summary = (row[1] or "").strip()
        haystack = title + " " + summary

        filled: List[str] = []
        for ticker, name in pending.items():
            if name not in haystack:
                continue
            bucket = out[ticker]
            bucket.append({
                "title": title,
                "summary": summary,
                "press": row[2] or "",
                "pub_date": row[3] or "",
            })
            stats["matched_pairs"] += 1
            if len(bucket) >= max_per_ticker:
                filled.append(ticker)

        # Removal is deferred because `pending` is being iterated above.
        for ticker in filled:
            del pending[ticker]

    stats["hit_tickers"] = sum(1 for arts in out.values() if arts)
    return out, stats


def _fetch_ticker_names(
    conn: sqlite3.Connection,
    tickers: List[str],
    chunk_size: int = 500,
) -> Dict[str, str]:
    """Return ``{ticker: name}`` from ``krx_master`` for the given tickers.

    The lookup is issued in chunks of ``chunk_size`` because SQLite limits the
    number of bound parameters per statement (999 by default before SQLite
    3.32.0); a single ``IN (...)`` over a large ticker universe would raise
    ``sqlite3.OperationalError``.
    """
    names: Dict[str, str] = {}
    for start in range(0, len(tickers), chunk_size):
        chunk = tickers[start:start + chunk_size]
        placeholders = ",".join("?" * len(chunk))
        rows = conn.execute(
            f"SELECT ticker, name FROM krx_master WHERE ticker IN ({placeholders})",
            chunk,
        ).fetchall()
        for r in rows:
            # Company names shorter than 2 characters are excluded because
            # of the false-positive risk in substring matching.
            if r[1] and len(r[1]) >= 2:
                names[r[0]] = r[1]
    return names
|