Atomic mv of root V1 assets (main_server.py + modules/ + data/ + tests/ + entry scripts + docs + logs) into the signal_v1/ subdirectory. Updates load_dotenv() to load web-ai/.env explicitly via Path. Adds web-ai/CLAUDE.md (workspace guide) and web-ai/start.bat (signal_v1 entry wrapper). Prepares for signal_v2/ Phase 2. Tests: signal_v1/tests/unit baseline preserved (no regression). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
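A minimal sketch of the explicit .env loading described above (the relative depth from the moved entry point up to web-ai/.env is an assumption about the new layout):

    # e.g. in signal_v1/main_server.py, resolving web-ai/.env explicitly
    from pathlib import Path
    from dotenv import load_dotenv

    load_dotenv(dotenv_path=Path(__file__).resolve().parent.parent / ".env")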
190 lines
7.5 KiB
Python
"""
|
|
뉴스 스냅샷 인프라 (v3.2)
|
|
|
|
목적:
|
|
- 수집한 뉴스를 SQLite에 타임스탬프와 함께 영구 저장
|
|
- 사후 감성 신호 재검증 (LLM 재호출 / 모델 비교) 가능하게
|
|
- 백테스트에서 '그 시점에 실제로 알 수 있던 뉴스'만 사용
|
|
|
|
스키마:
|
|
news_snapshots(
|
|
id INTEGER PK,
|
|
captured_at TEXT, # ISO8601 (KST) — 수집 시점
|
|
query TEXT, # 수집 쿼리 (예: '주식 시장', '삼성전자')
|
|
ticker TEXT, # 종목 코드 (종목 뉴스일 때, else NULL)
|
|
title TEXT,
|
|
url TEXT UNIQUE,
|
|
pub_date TEXT, # RSS pubDate 원본
|
|
source TEXT DEFAULT 'google_news'
|
|
)
|
|
sentiment_scores( # 야간 배치로 사후 생성
|
|
news_id INTEGER PK,
|
|
scored_at TEXT,
|
|
model TEXT,
|
|
sentiment REAL, # -1.0 ~ 1.0
|
|
confidence REAL,
|
|
raw_json TEXT,
|
|
FOREIGN KEY (news_id) REFERENCES news_snapshots(id)
|
|
)
|
|
|
|
순수 I/O 모듈 — 네트워크 의존성 없음 → unit 테스트 가능.
|
|
"""
|
|

import os
import sqlite3
from datetime import datetime, timezone, timedelta
from typing import Iterable, List, Optional, Dict

KST = timezone(timedelta(hours=9))  # Asia/Seoul (UTC+9)


class NewsSnapshotStore:
    """
    SQLite-backed news snapshot store.

    Usage:
        store = NewsSnapshotStore("data/news_snapshots.db")
        store.save_many(items, query="삼성전자", ticker="005930")
        rows = store.query_between(start, end, ticker="005930")
    """

    def __init__(self, db_path: str):
        self.db_path = db_path
        # Make sure the parent directory exists before SQLite creates the file.
        os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
        self._init_schema()

    # ──────────────────────────────────────────────
    # Schema
    # ──────────────────────────────────────────────
    def _connect(self) -> sqlite3.Connection:
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row  # rows addressable by column name
        return conn

    def _init_schema(self):
        with self._connect() as conn:
            conn.executescript("""
                CREATE TABLE IF NOT EXISTS news_snapshots (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    captured_at TEXT NOT NULL,
                    query TEXT NOT NULL,
                    ticker TEXT,
                    title TEXT NOT NULL,
                    url TEXT NOT NULL UNIQUE,
                    pub_date TEXT,
                    source TEXT DEFAULT 'google_news'
                );
                CREATE INDEX IF NOT EXISTS idx_news_captured
                    ON news_snapshots(captured_at);
                CREATE INDEX IF NOT EXISTS idx_news_ticker
                    ON news_snapshots(ticker, captured_at);

                CREATE TABLE IF NOT EXISTS sentiment_scores (
                    news_id INTEGER PRIMARY KEY,
                    scored_at TEXT NOT NULL,
                    model TEXT NOT NULL,
                    sentiment REAL NOT NULL,
                    confidence REAL NOT NULL,
                    raw_json TEXT,
                    FOREIGN KEY (news_id) REFERENCES news_snapshots(id)
                );
            """)
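
    # Note: SQLite does not enforce FOREIGN KEY constraints unless the
    # foreign_keys pragma is enabled per connection; the constraint above is
    # declarative only as written. Enabling enforcement (a possible change,
    # not current behavior) would be one line in _connect():
    #
    #     conn.execute("PRAGMA foreign_keys = ON")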

    # ──────────────────────────────────────────────
    # Writes
    # ──────────────────────────────────────────────
    def save_many(self, items: Iterable[Dict], query: str,
                  ticker: Optional[str] = None,
                  captured_at: Optional[datetime] = None) -> int:
        """
        Save a batch of news items; duplicates are skipped automatically by URL.

        Args:
            items: [{"title": str, "url": str, "pub_date": str?}, ...]

        Returns:
            Number of rows actually inserted.
        """
        if captured_at is None:
            captured_at = datetime.now(KST)
        ts = captured_at.isoformat()

        rows = []
        for it in items:
            title = (it.get("title") or "").strip()
            url = (it.get("url") or "").strip()
            if not title or not url:
                continue  # skip malformed items
            rows.append((ts, query, ticker, title, url, it.get("pub_date")))

        if not rows:
            return 0

        with self._connect() as conn:
            # total_changes counts modified rows, so its delta is the number
            # of rows that survived INSERT OR IGNORE deduplication.
            before = conn.total_changes
            conn.executemany(
                "INSERT OR IGNORE INTO news_snapshots "
                "(captured_at, query, ticker, title, url, pub_date) "
                "VALUES (?, ?, ?, ?, ?, ?)",
                rows,
            )
            inserted = conn.total_changes - before
            return inserted

    def save_sentiment(self, news_id: int, model: str,
                       sentiment: float, confidence: float,
                       raw_json: str = "",
                       scored_at: Optional[datetime] = None) -> None:
        """Upsert one sentiment score per news row (news_id is the PK)."""
        if scored_at is None:
            scored_at = datetime.now(KST)
        with self._connect() as conn:
            conn.execute(
                "INSERT OR REPLACE INTO sentiment_scores "
                "(news_id, scored_at, model, sentiment, confidence, raw_json) "
                "VALUES (?, ?, ?, ?, ?, ?)",
                (news_id, scored_at.isoformat(), model,
                 float(sentiment), float(confidence), raw_json),
            )

    # ──────────────────────────────────────────────
    # Reads
    # ──────────────────────────────────────────────
    def query_between(self, start: datetime, end: datetime,
                      ticker: Optional[str] = None,
                      query: Optional[str] = None) -> List[sqlite3.Row]:
        """Return news captured within [start, end), optionally filtered."""
        sql = "SELECT * FROM news_snapshots WHERE captured_at >= ? AND captured_at < ?"
        args = [start.isoformat(), end.isoformat()]
        if ticker is not None:
            sql += " AND ticker = ?"
            args.append(ticker)
        if query is not None:
            sql += " AND query = ?"
            args.append(query)
        sql += " ORDER BY captured_at ASC"
        with self._connect() as conn:
            return list(conn.execute(sql, args))

    def pending_sentiment(self, limit: int = 100) -> List[sqlite3.Row]:
        """Return news that has no sentiment score yet (for the nightly batch)."""
        with self._connect() as conn:
            return list(conn.execute(
                """SELECT n.* FROM news_snapshots n
                   LEFT JOIN sentiment_scores s ON s.news_id = n.id
                   WHERE s.news_id IS NULL
                   ORDER BY n.captured_at DESC
                   LIMIT ?""",
                (limit,)
            ))

    def stats(self) -> Dict:
        """DB statistics (row counts, sentiment coverage)."""
        with self._connect() as conn:
            total = conn.execute("SELECT COUNT(*) FROM news_snapshots").fetchone()[0]
            scored = conn.execute("SELECT COUNT(*) FROM sentiment_scores").fetchone()[0]
            return {
                "total_news": total,
                "scored": scored,
                "pending": total - scored,
                "coverage_pct": (scored / total * 100) if total else 0.0,
            }
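

# ──────────────────────────────────────────────
# Usage sketch (illustrative only; the sample item and path below are
# hypothetical, not part of the module API)
# ──────────────────────────────────────────────
if __name__ == "__main__":
    store = NewsSnapshotStore("data/news_snapshots.db")
    inserted = store.save_many(
        [{"title": "샘플 뉴스", "url": "https://example.com/news/1",
          "pub_date": "Mon, 01 Jan 2024 00:00:00 +0900"}],
        query="삼성전자",
        ticker="005930",
    )
    print(f"inserted={inserted}", store.stats())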