Atomic mv of root V1 assets (main_server.py + modules/ + data/ + tests/ + entry scripts + docs + logs) into the signal_v1/ subdirectory. Updates load_dotenv() to load web-ai/.env explicitly via Path. Adds web-ai/CLAUDE.md (workspace guide) and web-ai/start.bat (signal_v1 entry wrapper). Prepares for signal_v2/ Phase 2. Tests: signal_v1/tests/unit baseline preserved (no regression). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
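A minimal sketch of the explicit .env loading described above (the relative depth from the moved entry point up to web-ai/.env is an assumption about the new layout):

    # e.g. in signal_v1/main_server.py, resolving web-ai/.env explicitly
    from pathlib import Path
    from dotenv import load_dotenv

    load_dotenv(dotenv_path=Path(__file__).resolve().parent.parent / ".env")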
190 lines
7.5 KiB
Python
"""
|
|
뉴스 스냅샷 인프라 (v3.2)
|
|
|
|
목적:
|
|
- 수집한 뉴스를 SQLite에 타임스탬프와 함께 영구 저장
|
|
- 사후 감성 신호 재검증 (LLM 재호출 / 모델 비교) 가능하게
|
|
- 백테스트에서 '그 시점에 실제로 알 수 있던 뉴스'만 사용
|
|
|
|
스키마:
|
|
news_snapshots(
|
|
id INTEGER PK,
|
|
captured_at TEXT, # ISO8601 (KST) — 수집 시점
|
|
query TEXT, # 수집 쿼리 (예: '주식 시장', '삼성전자')
|
|
ticker TEXT, # 종목 코드 (종목 뉴스일 때, else NULL)
|
|
title TEXT,
|
|
url TEXT UNIQUE,
|
|
pub_date TEXT, # RSS pubDate 원본
|
|
source TEXT DEFAULT 'google_news'
|
|
)
|
|
sentiment_scores( # 야간 배치로 사후 생성
|
|
news_id INTEGER PK,
|
|
scored_at TEXT,
|
|
model TEXT,
|
|
sentiment REAL, # -1.0 ~ 1.0
|
|
confidence REAL,
|
|
raw_json TEXT,
|
|
FOREIGN KEY (news_id) REFERENCES news_snapshots(id)
|
|
)
|
|
|
|
순수 I/O 모듈 — 네트워크 의존성 없음 → unit 테스트 가능.
|
|
"""
|
|

import os
import sqlite3
from datetime import datetime, timezone, timedelta
from typing import Iterable, List, Optional, Dict

KST = timezone(timedelta(hours=9))  # Asia/Seoul (UTC+9)


class NewsSnapshotStore:
    """
    SQLite-backed news snapshot store.

    Usage:
        store = NewsSnapshotStore("data/news_snapshots.db")
        store.save_many(items, query="삼성전자", ticker="005930")
        rows = store.query_between(start, end, ticker="005930")
    """

    def __init__(self, db_path: str):
        self.db_path = db_path
        # Make sure the parent directory exists before SQLite creates the file.
        os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
        self._init_schema()

    # ──────────────────────────────────────────────
    # Schema
    # ──────────────────────────────────────────────
    def _connect(self) -> sqlite3.Connection:
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row  # rows addressable by column name
        return conn

    def _init_schema(self):
        with self._connect() as conn:
            conn.executescript("""
                CREATE TABLE IF NOT EXISTS news_snapshots (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    captured_at TEXT NOT NULL,
                    query TEXT NOT NULL,
                    ticker TEXT,
                    title TEXT NOT NULL,
                    url TEXT NOT NULL UNIQUE,
                    pub_date TEXT,
                    source TEXT DEFAULT 'google_news'
                );
                CREATE INDEX IF NOT EXISTS idx_news_captured
                    ON news_snapshots(captured_at);
                CREATE INDEX IF NOT EXISTS idx_news_ticker
                    ON news_snapshots(ticker, captured_at);

                CREATE TABLE IF NOT EXISTS sentiment_scores (
                    news_id INTEGER PRIMARY KEY,
                    scored_at TEXT NOT NULL,
                    model TEXT NOT NULL,
                    sentiment REAL NOT NULL,
                    confidence REAL NOT NULL,
                    raw_json TEXT,
                    FOREIGN KEY (news_id) REFERENCES news_snapshots(id)
                );
            """)
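
    # Note: SQLite does not enforce FOREIGN KEY constraints unless the
    # foreign_keys pragma is enabled per connection; the constraint above is
    # declarative only as written. Enabling enforcement (a possible change,
    # not current behavior) would be one line in _connect():
    #
    #     conn.execute("PRAGMA foreign_keys = ON")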

    # ──────────────────────────────────────────────
    # Writes
    # ──────────────────────────────────────────────
    def save_many(self, items: Iterable[Dict], query: str,
                  ticker: Optional[str] = None,
                  captured_at: Optional[datetime] = None) -> int:
        """
        Save a batch of news items; duplicates are skipped automatically by URL.

        Args:
            items: [{"title": str, "url": str, "pub_date": str?}, ...]

        Returns:
            Number of rows actually inserted.
        """
        if captured_at is None:
            captured_at = datetime.now(KST)
        ts = captured_at.isoformat()

        rows = []
        for it in items:
            title = (it.get("title") or "").strip()
            url = (it.get("url") or "").strip()
            if not title or not url:
                continue  # skip malformed items
            rows.append((ts, query, ticker, title, url, it.get("pub_date")))

        if not rows:
            return 0

        with self._connect() as conn:
            # total_changes counts modified rows, so its delta is the number
            # of rows that survived INSERT OR IGNORE deduplication.
            before = conn.total_changes
            conn.executemany(
                "INSERT OR IGNORE INTO news_snapshots "
                "(captured_at, query, ticker, title, url, pub_date) "
                "VALUES (?, ?, ?, ?, ?, ?)",
                rows,
            )
            inserted = conn.total_changes - before
            return inserted

    def save_sentiment(self, news_id: int, model: str,
                       sentiment: float, confidence: float,
                       raw_json: str = "",
                       scored_at: Optional[datetime] = None) -> None:
        """Upsert one sentiment score per news row (news_id is the PK)."""
        if scored_at is None:
            scored_at = datetime.now(KST)
        with self._connect() as conn:
            conn.execute(
                "INSERT OR REPLACE INTO sentiment_scores "
                "(news_id, scored_at, model, sentiment, confidence, raw_json) "
                "VALUES (?, ?, ?, ?, ?, ?)",
                (news_id, scored_at.isoformat(), model,
                 float(sentiment), float(confidence), raw_json),
            )

    # ──────────────────────────────────────────────
    # Reads
    # ──────────────────────────────────────────────
    def query_between(self, start: datetime, end: datetime,
                      ticker: Optional[str] = None,
                      query: Optional[str] = None) -> List[sqlite3.Row]:
        """Return news captured within [start, end), optionally filtered."""
        sql = "SELECT * FROM news_snapshots WHERE captured_at >= ? AND captured_at < ?"
        args = [start.isoformat(), end.isoformat()]
        if ticker is not None:
            sql += " AND ticker = ?"
            args.append(ticker)
        if query is not None:
            sql += " AND query = ?"
            args.append(query)
        sql += " ORDER BY captured_at ASC"
        with self._connect() as conn:
            return list(conn.execute(sql, args))

    def pending_sentiment(self, limit: int = 100) -> List[sqlite3.Row]:
        """Return news that has no sentiment score yet (for the nightly batch)."""
        with self._connect() as conn:
            return list(conn.execute(
                """SELECT n.* FROM news_snapshots n
                   LEFT JOIN sentiment_scores s ON s.news_id = n.id
                   WHERE s.news_id IS NULL
                   ORDER BY n.captured_at DESC
                   LIMIT ?""",
                (limit,)
            ))

    def stats(self) -> Dict:
        """DB statistics (row counts, sentiment coverage)."""
        with self._connect() as conn:
            total = conn.execute("SELECT COUNT(*) FROM news_snapshots").fetchone()[0]
            scored = conn.execute("SELECT COUNT(*) FROM sentiment_scores").fetchone()[0]
            return {
                "total_news": total,
                "scored": scored,
                "pending": total - scored,
                "coverage_pct": (scored / total * 100) if total else 0.0,
            }
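

# ──────────────────────────────────────────────
# Usage sketch (illustrative only; the sample item and path below are
# hypothetical, not part of the module API)
# ──────────────────────────────────────────────
if __name__ == "__main__":
    store = NewsSnapshotStore("data/news_snapshots.db")
    inserted = store.save_many(
        [{"title": "샘플 뉴스", "url": "https://example.com/news/1",
          "pub_date": "Mon, 01 Jan 2024 00:00:00 +0900"}],
        query="삼성전자",
        ticker="005930",
    )
    print(f"inserted={inserted}", store.stats())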