refactor: web-ai V1 assets → signal_v1/ (graduation prep)

Atomic mv of root V1 assets (main_server.py + modules/ + data/ +
tests/ + entry scripts + docs + logs) into signal_v1/ subdirectory.
load_dotenv() updated to load web-ai/.env explicitly via Path.

Adds web-ai/CLAUDE.md (workspace guide) and web-ai/start.bat
(signal_v1 entry wrapper). Prepares for signal_v2/ Phase 2.

Tests: signal_v1/tests/unit baseline preserved (no regression).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-16 03:00:11 +09:00
parent 42b91d03cf
commit 7ea1a21487
39 changed files with 722 additions and 691 deletions

View File

@@ -0,0 +1,189 @@
"""
News snapshot infrastructure (v3.2)

Purpose:
  - Persist collected news in SQLite together with a capture timestamp
  - Allow sentiment signals to be re-validated after the fact
    (LLM re-invocation / model comparison)
  - Let backtests use only "news that could actually have been known
    at that point in time"

Schema:
  news_snapshots(
    id INTEGER PK,
    captured_at TEXT,    # ISO8601 (KST) — time of collection
    query TEXT,          # collection query (e.g. '주식 시장' "stock market",
                         #   '삼성전자' "Samsung Electronics")
    ticker TEXT,         # stock code (for per-stock news, else NULL)
    title TEXT,
    url TEXT UNIQUE,
    pub_date TEXT,       # raw RSS pubDate
    source TEXT DEFAULT 'google_news'
  )
  sentiment_scores(      # produced afterwards by a nightly batch
    news_id INTEGER PK,
    scored_at TEXT,
    model TEXT,
    sentiment REAL,      # -1.0 ~ 1.0
    confidence REAL,
    raw_json TEXT,
    FOREIGN KEY (news_id) REFERENCES news_snapshots(id)
  )

Pure I/O module — no network dependency, so it can be unit tested.
"""
import os
import sqlite3
from contextlib import closing
from datetime import datetime, timezone, timedelta
from typing import Iterable, List, Optional, Dict
# Korea Standard Time (UTC+09:00): the zone stored timestamps are expressed in.
KST = timezone(timedelta(hours=9))


class NewsSnapshotStore:
    """
    SQLite-backed store for news snapshots and their sentiment scores.

    Every operation opens its own short-lived connection and closes it when
    done, so ``db_path`` must name a file on disk (``":memory:"`` would give
    each call a fresh, empty database).

    Timestamps are stored as ISO 8601 text and range queries compare that
    text directly. To keep the comparison meaningful, timezone-aware
    datetimes are converted to KST before being written or used as query
    bounds. Naive datetimes are passed through unchanged and are assumed to
    already be KST wall-clock time.

    Example:
        store = NewsSnapshotStore("data/news_snapshots.db")
        store.save_many(items, query="삼성전자", ticker="005930")
        rows = store.query_between(start, end, ticker="005930")
    """

    def __init__(self, db_path: str):
        """
        Remember ``db_path``, create its parent directory if missing, and
        make sure the tables and indexes exist.
        """
        self.db_path = db_path
        # dirname is "" for a bare filename; fall back to the current dir.
        os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)
        self._init_schema()

    # ──────────────────────────────────────────────
    # Schema / connection helpers
    # ──────────────────────────────────────────────
    def _connect(self) -> sqlite3.Connection:
        """
        Open a new connection whose rows support access by column name.

        The caller owns the connection and is responsible for closing it.
        """
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        return conn

    @staticmethod
    def _iso(moment: datetime) -> str:
        """
        Render ``moment`` as the ISO 8601 text used in the database.

        Aware datetimes are shifted to KST first so that values captured in
        different zones sort correctly as plain strings. Naive datetimes
        carry no offset to convert from and are rendered as-is.
        """
        if moment.tzinfo is not None and moment.utcoffset() is not None:
            moment = moment.astimezone(KST)
        return moment.isoformat()

    def _init_schema(self):
        """Create the tables and indexes if they do not exist yet."""
        # `with conn` only commits/rolls back; `closing` releases the handle.
        with closing(self._connect()) as conn, conn:
            conn.executescript("""
                CREATE TABLE IF NOT EXISTS news_snapshots (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    captured_at TEXT NOT NULL,
                    query TEXT NOT NULL,
                    ticker TEXT,
                    title TEXT NOT NULL,
                    url TEXT NOT NULL UNIQUE,
                    pub_date TEXT,
                    source TEXT DEFAULT 'google_news'
                );
                CREATE INDEX IF NOT EXISTS idx_news_captured
                    ON news_snapshots(captured_at);
                CREATE INDEX IF NOT EXISTS idx_news_ticker
                    ON news_snapshots(ticker, captured_at);
                CREATE TABLE IF NOT EXISTS sentiment_scores (
                    news_id INTEGER PRIMARY KEY,
                    scored_at TEXT NOT NULL,
                    model TEXT NOT NULL,
                    sentiment REAL NOT NULL,
                    confidence REAL NOT NULL,
                    raw_json TEXT,
                    FOREIGN KEY (news_id) REFERENCES news_snapshots(id)
                );
            """)

    # ──────────────────────────────────────────────
    # Writes
    # ──────────────────────────────────────────────
    def save_many(self, items: Iterable[Dict], query: str,
                  ticker: Optional[str] = None,
                  captured_at: Optional[datetime] = None) -> int:
        """
        Store several news items; rows whose URL already exists are ignored.

        Args:
            items: [{"title": str, "url": str, "pub_date": str?}, ...].
                Entries whose title or url is missing or blank are skipped.
            query: Collection query recorded with every row.
            ticker: Stock code recorded with every row, or None.
            captured_at: Capture time shared by all rows; defaults to the
                current time in KST.

        Returns:
            Number of rows actually inserted.
        """
        if captured_at is None:
            captured_at = datetime.now(KST)
        ts = self._iso(captured_at)

        rows = []
        for item in items:
            title = (item.get("title") or "").strip()
            url = (item.get("url") or "").strip()
            if not title or not url:
                continue
            rows.append((ts, query, ticker, title, url, item.get("pub_date")))
        if not rows:
            return 0

        with closing(self._connect()) as conn, conn:
            # Rows dropped by OR IGNORE do not bump total_changes, so the
            # delta is exactly the number of new rows.
            before = conn.total_changes
            conn.executemany(
                "INSERT OR IGNORE INTO news_snapshots "
                "(captured_at, query, ticker, title, url, pub_date) "
                "VALUES (?, ?, ?, ?, ?, ?)",
                rows,
            )
            inserted = conn.total_changes - before
        return inserted

    def save_sentiment(self, news_id: int, model: str,
                       sentiment: float, confidence: float,
                       raw_json: str = "",
                       scored_at: Optional[datetime] = None) -> None:
        """
        Insert the sentiment score for ``news_id``, replacing any existing one.

        Args:
            news_id: ``news_snapshots.id`` the score belongs to.
            model: Name of the model that produced the score.
            sentiment: Sentiment value, stored as a float.
            confidence: Confidence value, stored as a float.
            raw_json: Raw model output to keep alongside the score.
            scored_at: Scoring time; defaults to the current time in KST.
        """
        if scored_at is None:
            scored_at = datetime.now(KST)
        with closing(self._connect()) as conn, conn:
            conn.execute(
                "INSERT OR REPLACE INTO sentiment_scores "
                "(news_id, scored_at, model, sentiment, confidence, raw_json) "
                "VALUES (?, ?, ?, ?, ?, ?)",
                (news_id, self._iso(scored_at), model,
                 float(sentiment), float(confidence), raw_json),
            )

    # ──────────────────────────────────────────────
    # Reads
    # ──────────────────────────────────────────────
    def query_between(self, start: datetime, end: datetime,
                      ticker: Optional[str] = None,
                      query: Optional[str] = None) -> List[sqlite3.Row]:
        """
        Return news captured in the half-open interval ``[start, end)``.

        Args:
            start: Inclusive lower bound on ``captured_at``.
            end: Exclusive upper bound on ``captured_at``.
            ticker: If given, only rows with this exact ticker.
            query: If given, only rows with this exact collection query.

        Returns:
            Matching rows ordered by ``captured_at`` ascending.
        """
        sql = "SELECT * FROM news_snapshots WHERE captured_at >= ? AND captured_at < ?"
        # Bounds get the same KST normalisation as stored values.
        args = [self._iso(start), self._iso(end)]
        if ticker is not None:
            sql += " AND ticker = ?"
            args.append(ticker)
        if query is not None:
            sql += " AND query = ?"
            args.append(query)
        sql += " ORDER BY captured_at ASC"
        with closing(self._connect()) as conn:
            return list(conn.execute(sql, args))

    def pending_sentiment(self, limit: int = 100) -> List[sqlite3.Row]:
        """
        Return up to ``limit`` news rows that have no sentiment score yet,
        most recently captured first (intended for the nightly batch).
        """
        with closing(self._connect()) as conn:
            return list(conn.execute(
                """SELECT n.* FROM news_snapshots n
                   LEFT JOIN sentiment_scores s ON s.news_id = n.id
                   WHERE s.news_id IS NULL
                   ORDER BY n.captured_at DESC
                   LIMIT ?""",
                (limit,)
            ))

    def stats(self) -> Dict:
        """
        Return row counts and sentiment coverage.

        Returns:
            Dict with ``total_news``, ``scored``, ``pending``
            (``total_news - scored``) and ``coverage_pct`` (0.0 when the
            store holds no news).
        """
        with closing(self._connect()) as conn:
            total = conn.execute("SELECT COUNT(*) FROM news_snapshots").fetchone()[0]
            scored = conn.execute("SELECT COUNT(*) FROM sentiment_scores").fetchone()[0]
        return {
            "total_news": total,
            "scored": scored,
            "pending": total - scored,
            "coverage_pct": (scored / total * 100) if total else 0.0,
        }