refactor: web-ai V1 assets → signal_v1/ (graduation prep)

Atomic mv of root V1 assets (main_server.py + modules/ + data/ + tests/ + entry scripts + docs + logs) into signal_v1/ subdirectory. load_dotenv() updated to load web-ai/.env explicitly via Path. Adds web-ai/CLAUDE.md (workspace guide) and web-ai/start.bat (signal_v1 entry wrapper). Prepares for signal_v2/ Phase 2. Tests: signal_v1/tests/unit baseline preserved (no regression). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 03:00:11 +09:00
parent 42b91d03cf
commit 7ea1a21487
39 changed files with 722 additions and 691 deletions
--- a/signal_v1/modules/services/news_snapshot.py
+++ b/signal_v1/modules/services/news_snapshot.py
@@ -0,0 +1,189 @@
+"""
+뉴스 스냅샷 인프라 (v3.2)
+
+목적:
+  - 수집한 뉴스를 SQLite에 타임스탬프와 함께 영구 저장
+  - 사후 감성 신호 재검증 (LLM 재호출 / 모델 비교) 가능하게
+  - 백테스트에서 '그 시점에 실제로 알 수 있던 뉴스'만 사용
+
+스키마:
+  news_snapshots(
+    id INTEGER PK,
+    captured_at TEXT,      # ISO8601 (KST) — 수집 시점
+    query TEXT,            # 수집 쿼리 (예: '주식 시장', '삼성전자')
+    ticker TEXT,           # 종목 코드 (종목 뉴스일 때, else NULL)
+    title TEXT,
+    url TEXT UNIQUE,
+    pub_date TEXT,         # RSS pubDate 원본
+    source TEXT DEFAULT 'google_news'
+  )
+  sentiment_scores(         # 야간 배치로 사후 생성
+    news_id INTEGER PK,
+    scored_at TEXT,
+    model TEXT,
+    sentiment REAL,        # -1.0 ~ 1.0
+    confidence REAL,
+    raw_json TEXT,
+    FOREIGN KEY (news_id) REFERENCES news_snapshots(id)
+  )
+
+순수 I/O 모듈 — 네트워크 의존성 없음 → unit 테스트 가능.
+"""
+import os
+import sqlite3
+from contextlib import closing
+from datetime import datetime, timezone, timedelta
+from typing import Iterable, List, Optional, Dict
+
+# Korea Standard Time as a fixed UTC+9 offset; used as the default timezone
+# when a timestamp is generated by this module.
+KST = timezone(timedelta(hours=9))
+
+
+class NewsSnapshotStore:
+    """
+    SQLite 기반 뉴스 스냅샷 저장소.
+
+    사용 예:
+        store = NewsSnapshotStore("data/news_snapshots.db")
+        store.save_many(items, query="삼성전자", ticker="005930")
+        rows = store.query_between(start, end, ticker="005930")
+    """
+
+    def __init__(self, db_path: str):
+        self.db_path = db_path
+        os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
+        self._init_schema()
+
+    # ──────────────────────────────────────────────
+    # 스키마
+    # ──────────────────────────────────────────────
+    def _connect(self) -> sqlite3.Connection:
+        conn = sqlite3.connect(self.db_path)
+        conn.row_factory = sqlite3.Row
+        return conn
+
+    def _init_schema(self):
+        with self._connect() as conn:
+            conn.executescript("""
+                CREATE TABLE IF NOT EXISTS news_snapshots (
+                    id          INTEGER PRIMARY KEY AUTOINCREMENT,
+                    captured_at TEXT NOT NULL,
+                    query       TEXT NOT NULL,
+                    ticker      TEXT,
+                    title       TEXT NOT NULL,
+                    url         TEXT NOT NULL UNIQUE,
+                    pub_date    TEXT,
+                    source      TEXT DEFAULT 'google_news'
+                );
+                CREATE INDEX IF NOT EXISTS idx_news_captured
+                    ON news_snapshots(captured_at);
+                CREATE INDEX IF NOT EXISTS idx_news_ticker
+                    ON news_snapshots(ticker, captured_at);
+
+                CREATE TABLE IF NOT EXISTS sentiment_scores (
+                    news_id     INTEGER PRIMARY KEY,
+                    scored_at   TEXT NOT NULL,
+                    model       TEXT NOT NULL,
+                    sentiment   REAL NOT NULL,
+                    confidence  REAL NOT NULL,
+                    raw_json    TEXT,
+                    FOREIGN KEY (news_id) REFERENCES news_snapshots(id)
+                );
+            """)
+
+    # ──────────────────────────────────────────────
+    # 쓰기
+    # ──────────────────────────────────────────────
+    def save_many(self, items: Iterable[Dict], query: str,
+                  ticker: Optional[str] = None,
+                  captured_at: Optional[datetime] = None) -> int:
+        """
+        뉴스 다건 저장. URL 기준 중복 자동 무시.
+
+        Args:
+            items: [{"title": str, "url": str, "pub_date": str?}, ...]
+
+        Returns:
+            실제로 삽입된 행 수
+        """
+        if captured_at is None:
+            captured_at = datetime.now(KST)
+        ts = captured_at.isoformat()
+
+        rows = []
+        for it in items:
+            title = (it.get("title") or "").strip()
+            url = (it.get("url") or "").strip()
+            if not title or not url:
+                continue
+            rows.append((ts, query, ticker, title, url, it.get("pub_date")))
+
+        if not rows:
+            return 0
+
+        with self._connect() as conn:
+            before = conn.total_changes
+            conn.executemany(
+                "INSERT OR IGNORE INTO news_snapshots "
+                "(captured_at, query, ticker, title, url, pub_date) "
+                "VALUES (?, ?, ?, ?, ?, ?)",
+                rows,
+            )
+            inserted = conn.total_changes - before
+        return inserted
+
+    def save_sentiment(self, news_id: int, model: str,
+                       sentiment: float, confidence: float,
+                       raw_json: str = "",
+                       scored_at: Optional[datetime] = None) -> None:
+        if scored_at is None:
+            scored_at = datetime.now(KST)
+        with self._connect() as conn:
+            conn.execute(
+                "INSERT OR REPLACE INTO sentiment_scores "
+                "(news_id, scored_at, model, sentiment, confidence, raw_json) "
+                "VALUES (?, ?, ?, ?, ?, ?)",
+                (news_id, scored_at.isoformat(), model,
+                 float(sentiment), float(confidence), raw_json),
+            )
+
+    # ──────────────────────────────────────────────
+    # 읽기
+    # ──────────────────────────────────────────────
+    def query_between(self, start: datetime, end: datetime,
+                      ticker: Optional[str] = None,
+                      query: Optional[str] = None) -> List[sqlite3.Row]:
+        """특정 기간 내 수집된 뉴스 조회."""
+        sql = "SELECT * FROM news_snapshots WHERE captured_at >= ? AND captured_at < ?"
+        args = [start.isoformat(), end.isoformat()]
+        if ticker is not None:
+            sql += " AND ticker = ?"
+            args.append(ticker)
+        if query is not None:
+            sql += " AND query = ?"
+            args.append(query)
+        sql += " ORDER BY captured_at ASC"
+        with self._connect() as conn:
+            return list(conn.execute(sql, args))
+
+    def pending_sentiment(self, limit: int = 100) -> List[sqlite3.Row]:
+        """아직 감성 점수가 없는 뉴스 반환 (야간 배치용)."""
+        with self._connect() as conn:
+            return list(conn.execute(
+                """SELECT n.* FROM news_snapshots n
+                   LEFT JOIN sentiment_scores s ON s.news_id = n.id
+                   WHERE s.news_id IS NULL
+                   ORDER BY n.captured_at DESC
+                   LIMIT ?""",
+                (limit,)
+            ))
+
+    def stats(self) -> Dict:
+        """DB 통계 (row 수, 감성 커버리지)."""
+        with self._connect() as conn:
+            total = conn.execute("SELECT COUNT(*) FROM news_snapshots").fetchone()[0]
+            scored = conn.execute("SELECT COUNT(*) FROM sentiment_scores").fetchone()[0]
+            return {
+                "total_news": total,
+                "scored": scored,
+                "pending": total - scored,
+                "coverage_pct": (scored / total * 100) if total else 0.0,
+            }