""" 뉴스 스냅샷 인프라 (v3.2) 목적: - 수집한 뉴스를 SQLite에 타임스탬프와 함께 영구 저장 - 사후 감성 신호 재검증 (LLM 재호출 / 모델 비교) 가능하게 - 백테스트에서 '그 시점에 실제로 알 수 있던 뉴스'만 사용 스키마: news_snapshots( id INTEGER PK, captured_at TEXT, # ISO8601 (KST) — 수집 시점 query TEXT, # 수집 쿼리 (예: '주식 시장', '삼성전자') ticker TEXT, # 종목 코드 (종목 뉴스일 때, else NULL) title TEXT, url TEXT UNIQUE, pub_date TEXT, # RSS pubDate 원본 source TEXT DEFAULT 'google_news' ) sentiment_scores( # 야간 배치로 사후 생성 news_id INTEGER PK, scored_at TEXT, model TEXT, sentiment REAL, # -1.0 ~ 1.0 confidence REAL, raw_json TEXT, FOREIGN KEY (news_id) REFERENCES news_snapshots(id) ) 순수 I/O 모듈 — 네트워크 의존성 없음 → unit 테스트 가능. """ import os import sqlite3 from datetime import datetime, timezone, timedelta from typing import Iterable, List, Optional, Dict KST = timezone(timedelta(hours=9)) class NewsSnapshotStore: """ SQLite 기반 뉴스 스냅샷 저장소. 사용 예: store = NewsSnapshotStore("data/news_snapshots.db") store.save_many(items, query="삼성전자", ticker="005930") rows = store.query_between(start, end, ticker="005930") """ def __init__(self, db_path: str): self.db_path = db_path os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True) self._init_schema() # ────────────────────────────────────────────── # 스키마 # ────────────────────────────────────────────── def _connect(self) -> sqlite3.Connection: conn = sqlite3.connect(self.db_path) conn.row_factory = sqlite3.Row return conn def _init_schema(self): with self._connect() as conn: conn.executescript(""" CREATE TABLE IF NOT EXISTS news_snapshots ( id INTEGER PRIMARY KEY AUTOINCREMENT, captured_at TEXT NOT NULL, query TEXT NOT NULL, ticker TEXT, title TEXT NOT NULL, url TEXT NOT NULL UNIQUE, pub_date TEXT, source TEXT DEFAULT 'google_news' ); CREATE INDEX IF NOT EXISTS idx_news_captured ON news_snapshots(captured_at); CREATE INDEX IF NOT EXISTS idx_news_ticker ON news_snapshots(ticker, captured_at); CREATE TABLE IF NOT EXISTS sentiment_scores ( news_id INTEGER PRIMARY KEY, scored_at TEXT NOT NULL, model TEXT NOT NULL, sentiment REAL NOT NULL, confidence REAL NOT NULL, raw_json TEXT, FOREIGN KEY (news_id) REFERENCES news_snapshots(id) ); """) # ────────────────────────────────────────────── # 쓰기 # ────────────────────────────────────────────── def save_many(self, items: Iterable[Dict], query: str, ticker: Optional[str] = None, captured_at: Optional[datetime] = None) -> int: """ 뉴스 다건 저장. URL 기준 중복 자동 무시. Args: items: [{"title": str, "url": str, "pub_date": str?}, ...] Returns: 실제로 삽입된 행 수 """ if captured_at is None: captured_at = datetime.now(KST) ts = captured_at.isoformat() rows = [] for it in items: title = (it.get("title") or "").strip() url = (it.get("url") or "").strip() if not title or not url: continue rows.append((ts, query, ticker, title, url, it.get("pub_date"))) if not rows: return 0 with self._connect() as conn: before = conn.total_changes conn.executemany( "INSERT OR IGNORE INTO news_snapshots " "(captured_at, query, ticker, title, url, pub_date) " "VALUES (?, ?, ?, ?, ?, ?)", rows, ) inserted = conn.total_changes - before return inserted def save_sentiment(self, news_id: int, model: str, sentiment: float, confidence: float, raw_json: str = "", scored_at: Optional[datetime] = None) -> None: if scored_at is None: scored_at = datetime.now(KST) with self._connect() as conn: conn.execute( "INSERT OR REPLACE INTO sentiment_scores " "(news_id, scored_at, model, sentiment, confidence, raw_json) " "VALUES (?, ?, ?, ?, ?, ?)", (news_id, scored_at.isoformat(), model, float(sentiment), float(confidence), raw_json), ) # ────────────────────────────────────────────── # 읽기 # ────────────────────────────────────────────── def query_between(self, start: datetime, end: datetime, ticker: Optional[str] = None, query: Optional[str] = None) -> List[sqlite3.Row]: """특정 기간 내 수집된 뉴스 조회.""" sql = "SELECT * FROM news_snapshots WHERE captured_at >= ? AND captured_at < ?" args = [start.isoformat(), end.isoformat()] if ticker is not None: sql += " AND ticker = ?" args.append(ticker) if query is not None: sql += " AND query = ?" args.append(query) sql += " ORDER BY captured_at ASC" with self._connect() as conn: return list(conn.execute(sql, args)) def pending_sentiment(self, limit: int = 100) -> List[sqlite3.Row]: """아직 감성 점수가 없는 뉴스 반환 (야간 배치용).""" with self._connect() as conn: return list(conn.execute( """SELECT n.* FROM news_snapshots n LEFT JOIN sentiment_scores s ON s.news_id = n.id WHERE s.news_id IS NULL ORDER BY n.captured_at DESC LIMIT ?""", (limit,) )) def stats(self) -> Dict: """DB 통계 (row 수, 감성 커버리지).""" with self._connect() as conn: total = conn.execute("SELECT COUNT(*) FROM news_snapshots").fetchone()[0] scored = conn.execute("SELECT COUNT(*) FROM sentiment_scores").fetchone()[0] return { "total_news": total, "scored": scored, "pending": total - scored, "coverage_pct": (scored / total * 100) if total else 0.0, }