"""AI news sentiment validation — Spearman IC vs forward returns. 핵심 metric: 일자별 score_raw 와 다음 N일 forward return 의 Spearman 상관. 4주+ 누적 후 IC mean > 0.05 면 weight 활성화 가치 있음. """ from __future__ import annotations import datetime as dt import sqlite3 from typing import Any, Dict, List, Optional import pandas as pd def _spearman(a: pd.Series, b: pd.Series) -> Optional[float]: """Spearman rank correlation. None if insufficient/degenerate data.""" if len(a) < 5 or len(b) < 5: return None if a.std(ddof=0) == 0 or b.std(ddof=0) == 0: return None return float(a.rank().corr(b.rank())) def compute_ic( conn: sqlite3.Connection, *, days: int = 30, horizon: int = 1, min_news_count: int = 1, asof_today: Optional[dt.date] = None, ) -> Dict[str, Any]: """Compute daily Spearman IC of ai_news.score_raw vs forward return. Returns: { "horizon_days": int, "min_news_count": int, "window_days": int, "ic_count": int, # 유효 일수 "ic_mean": float | None, "ic_std": float | None, "ic_per_day": [{"date": "YYYY-MM-DD", "ic": float, "n": int}, ...], "verdict": "skip" | "weak" | "strong", } verdict: - skip: ic_count < 10 - weak: ic_mean in [-0.05, 0.05] - strong: |ic_mean| > 0.05 """ asof_today = asof_today or dt.date.today() cutoff = (asof_today - dt.timedelta(days=days)).isoformat() sentiment = pd.read_sql_query( "SELECT ticker, date, score_raw, news_count " "FROM news_sentiment WHERE date >= ? AND news_count >= ? ORDER BY date", conn, params=(cutoff, min_news_count), ) if sentiment.empty: return _empty_result(days, horizon, min_news_count) # forward return 조회: 각 (ticker, date) 에 대해 close[date+horizon] / close[date] - 1 prices = pd.read_sql_query( "SELECT ticker, date, close FROM krx_daily_prices " "WHERE date >= ? ORDER BY ticker, date", conn, params=(cutoff,), ) if prices.empty: return _empty_result(days, horizon, min_news_count) prices = prices.sort_values(["ticker", "date"]) prices["fwd_close"] = prices.groupby("ticker", group_keys=False)["close"].shift(-horizon) prices["fwd_ret"] = prices["fwd_close"] / prices["close"] - 1.0 merged = sentiment.merge( prices[["ticker", "date", "fwd_ret"]], on=["ticker", "date"], how="inner" ) merged = merged.dropna(subset=["fwd_ret"]) if merged.empty: return _empty_result(days, horizon, min_news_count) ic_rows: List[Dict[str, Any]] = [] for date, grp in merged.groupby("date"): ic = _spearman(grp["score_raw"], grp["fwd_ret"]) if ic is not None: ic_rows.append({"date": date, "ic": ic, "n": int(len(grp))}) if not ic_rows: return _empty_result(days, horizon, min_news_count) ic_series = pd.Series([r["ic"] for r in ic_rows], dtype=float) ic_mean = float(ic_series.mean()) ic_std = float(ic_series.std(ddof=0)) if len(ic_series) > 1 else 0.0 if len(ic_rows) < 10: verdict = "skip" elif abs(ic_mean) > 0.05: verdict = "strong" else: verdict = "weak" return { "horizon_days": horizon, "min_news_count": min_news_count, "window_days": days, "ic_count": len(ic_rows), "ic_mean": round(ic_mean, 4), "ic_std": round(ic_std, 4), "ic_per_day": ic_rows, "verdict": verdict, } def _empty_result(days: int, horizon: int, min_news_count: int) -> Dict[str, Any]: return { "horizon_days": horizon, "min_news_count": min_news_count, "window_days": days, "ic_count": 0, "ic_mean": None, "ic_std": None, "ic_per_day": [], "verdict": "skip", }