refactor: web-ai V1 assets → signal_v1/ (graduation prep)
Atomic mv of root V1 assets (main_server.py + modules/ + data/ + tests/ + entry scripts + docs + logs) into signal_v1/ subdirectory. load_dotenv() updated to load web-ai/.env explicitly via Path. Adds web-ai/CLAUDE.md (workspace guide) and web-ai/start.bat (signal_v1 entry wrapper). Prepares for signal_v2/ Phase 2. Tests: signal_v1/tests/unit baseline preserved (no regression). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
189
signal_v1/modules/services/news_snapshot.py
Normal file
189
signal_v1/modules/services/news_snapshot.py
Normal file
@@ -0,0 +1,189 @@
|
||||
"""
|
||||
뉴스 스냅샷 인프라 (v3.2)
|
||||
|
||||
목적:
|
||||
- 수집한 뉴스를 SQLite에 타임스탬프와 함께 영구 저장
|
||||
- 사후 감성 신호 재검증 (LLM 재호출 / 모델 비교) 가능하게
|
||||
- 백테스트에서 '그 시점에 실제로 알 수 있던 뉴스'만 사용
|
||||
|
||||
스키마:
|
||||
news_snapshots(
|
||||
id INTEGER PK,
|
||||
captured_at TEXT, # ISO8601 (KST) — 수집 시점
|
||||
query TEXT, # 수집 쿼리 (예: '주식 시장', '삼성전자')
|
||||
ticker TEXT, # 종목 코드 (종목 뉴스일 때, else NULL)
|
||||
title TEXT,
|
||||
url TEXT UNIQUE,
|
||||
pub_date TEXT, # RSS pubDate 원본
|
||||
source TEXT DEFAULT 'google_news'
|
||||
)
|
||||
sentiment_scores( # 야간 배치로 사후 생성
|
||||
news_id INTEGER PK,
|
||||
scored_at TEXT,
|
||||
model TEXT,
|
||||
sentiment REAL, # -1.0 ~ 1.0
|
||||
confidence REAL,
|
||||
raw_json TEXT,
|
||||
FOREIGN KEY (news_id) REFERENCES news_snapshots(id)
|
||||
)
|
||||
|
||||
순수 I/O 모듈 — 네트워크 의존성 없음 → unit 테스트 가능.
|
||||
"""
|
||||
import os
|
||||
import sqlite3
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from typing import Iterable, List, Optional, Dict
|
||||
|
||||
KST = timezone(timedelta(hours=9))
|
||||
|
||||
|
||||
class NewsSnapshotStore:
|
||||
"""
|
||||
SQLite 기반 뉴스 스냅샷 저장소.
|
||||
|
||||
사용 예:
|
||||
store = NewsSnapshotStore("data/news_snapshots.db")
|
||||
store.save_many(items, query="삼성전자", ticker="005930")
|
||||
rows = store.query_between(start, end, ticker="005930")
|
||||
"""
|
||||
|
||||
def __init__(self, db_path: str):
|
||||
self.db_path = db_path
|
||||
os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
|
||||
self._init_schema()
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# 스키마
|
||||
# ──────────────────────────────────────────────
|
||||
def _connect(self) -> sqlite3.Connection:
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
conn.row_factory = sqlite3.Row
|
||||
return conn
|
||||
|
||||
def _init_schema(self):
|
||||
with self._connect() as conn:
|
||||
conn.executescript("""
|
||||
CREATE TABLE IF NOT EXISTS news_snapshots (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
captured_at TEXT NOT NULL,
|
||||
query TEXT NOT NULL,
|
||||
ticker TEXT,
|
||||
title TEXT NOT NULL,
|
||||
url TEXT NOT NULL UNIQUE,
|
||||
pub_date TEXT,
|
||||
source TEXT DEFAULT 'google_news'
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_news_captured
|
||||
ON news_snapshots(captured_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_news_ticker
|
||||
ON news_snapshots(ticker, captured_at);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS sentiment_scores (
|
||||
news_id INTEGER PRIMARY KEY,
|
||||
scored_at TEXT NOT NULL,
|
||||
model TEXT NOT NULL,
|
||||
sentiment REAL NOT NULL,
|
||||
confidence REAL NOT NULL,
|
||||
raw_json TEXT,
|
||||
FOREIGN KEY (news_id) REFERENCES news_snapshots(id)
|
||||
);
|
||||
""")
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# 쓰기
|
||||
# ──────────────────────────────────────────────
|
||||
def save_many(self, items: Iterable[Dict], query: str,
|
||||
ticker: Optional[str] = None,
|
||||
captured_at: Optional[datetime] = None) -> int:
|
||||
"""
|
||||
뉴스 다건 저장. URL 기준 중복 자동 무시.
|
||||
|
||||
Args:
|
||||
items: [{"title": str, "url": str, "pub_date": str?}, ...]
|
||||
|
||||
Returns:
|
||||
실제로 삽입된 행 수
|
||||
"""
|
||||
if captured_at is None:
|
||||
captured_at = datetime.now(KST)
|
||||
ts = captured_at.isoformat()
|
||||
|
||||
rows = []
|
||||
for it in items:
|
||||
title = (it.get("title") or "").strip()
|
||||
url = (it.get("url") or "").strip()
|
||||
if not title or not url:
|
||||
continue
|
||||
rows.append((ts, query, ticker, title, url, it.get("pub_date")))
|
||||
|
||||
if not rows:
|
||||
return 0
|
||||
|
||||
with self._connect() as conn:
|
||||
before = conn.total_changes
|
||||
conn.executemany(
|
||||
"INSERT OR IGNORE INTO news_snapshots "
|
||||
"(captured_at, query, ticker, title, url, pub_date) "
|
||||
"VALUES (?, ?, ?, ?, ?, ?)",
|
||||
rows,
|
||||
)
|
||||
inserted = conn.total_changes - before
|
||||
return inserted
|
||||
|
||||
def save_sentiment(self, news_id: int, model: str,
|
||||
sentiment: float, confidence: float,
|
||||
raw_json: str = "",
|
||||
scored_at: Optional[datetime] = None) -> None:
|
||||
if scored_at is None:
|
||||
scored_at = datetime.now(KST)
|
||||
with self._connect() as conn:
|
||||
conn.execute(
|
||||
"INSERT OR REPLACE INTO sentiment_scores "
|
||||
"(news_id, scored_at, model, sentiment, confidence, raw_json) "
|
||||
"VALUES (?, ?, ?, ?, ?, ?)",
|
||||
(news_id, scored_at.isoformat(), model,
|
||||
float(sentiment), float(confidence), raw_json),
|
||||
)
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# 읽기
|
||||
# ──────────────────────────────────────────────
|
||||
def query_between(self, start: datetime, end: datetime,
|
||||
ticker: Optional[str] = None,
|
||||
query: Optional[str] = None) -> List[sqlite3.Row]:
|
||||
"""특정 기간 내 수집된 뉴스 조회."""
|
||||
sql = "SELECT * FROM news_snapshots WHERE captured_at >= ? AND captured_at < ?"
|
||||
args = [start.isoformat(), end.isoformat()]
|
||||
if ticker is not None:
|
||||
sql += " AND ticker = ?"
|
||||
args.append(ticker)
|
||||
if query is not None:
|
||||
sql += " AND query = ?"
|
||||
args.append(query)
|
||||
sql += " ORDER BY captured_at ASC"
|
||||
with self._connect() as conn:
|
||||
return list(conn.execute(sql, args))
|
||||
|
||||
def pending_sentiment(self, limit: int = 100) -> List[sqlite3.Row]:
|
||||
"""아직 감성 점수가 없는 뉴스 반환 (야간 배치용)."""
|
||||
with self._connect() as conn:
|
||||
return list(conn.execute(
|
||||
"""SELECT n.* FROM news_snapshots n
|
||||
LEFT JOIN sentiment_scores s ON s.news_id = n.id
|
||||
WHERE s.news_id IS NULL
|
||||
ORDER BY n.captured_at DESC
|
||||
LIMIT ?""",
|
||||
(limit,)
|
||||
))
|
||||
|
||||
def stats(self) -> Dict:
|
||||
"""DB 통계 (row 수, 감성 커버리지)."""
|
||||
with self._connect() as conn:
|
||||
total = conn.execute("SELECT COUNT(*) FROM news_snapshots").fetchone()[0]
|
||||
scored = conn.execute("SELECT COUNT(*) FROM sentiment_scores").fetchone()[0]
|
||||
return {
|
||||
"total_news": total,
|
||||
"scored": scored,
|
||||
"pending": total - scored,
|
||||
"coverage_pct": (scored / total * 100) if total else 0.0,
|
||||
}
|
||||
Reference in New Issue
Block a user