refactor: web-ai V1 assets → signal_v1/ (graduation prep)

Atomic mv of root V1 assets (main_server.py + modules/ + data/ + tests/ + entry scripts + docs + logs) into signal_v1/ subdirectory. load_dotenv() updated to load web-ai/.env explicitly via Path. Adds web-ai/CLAUDE.md (workspace guide) and web-ai/start.bat (signal_v1 entry wrapper). Prepares for signal_v2/ Phase 2. Tests: signal_v1/tests/unit baseline preserved (no regression). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 03:00:11 +09:00
parent 42b91d03cf
commit 7ea1a21487
39 changed files with 722 additions and 691 deletions
--- a/signal_v1/modules/services/llm_client.py
+++ b/signal_v1/modules/services/llm_client.py
@@ -0,0 +1,199 @@
+"""
+통합 LLM 클라이언트 — Gemini 2.5 Flash (Primary) + Ollama (Fallback)
+
+설계 원칙:
+  - OllamaManager.request_inference(prompt) 와 동일한 인터페이스 유지
+    → process.py, ai_council.py 코드 변경 최소화
+  - Gemini 실패(네트워크, Rate Limit) 시 자동으로 로컬 Ollama 폴백
+  - 15 RPM 제한 준수를 위한 자동 스로틀링
+  - VRAM 충돌 없음 (외부 API 호출이므로 LSTM 학습과 간섭 없음)
+
+Rate Limit (Gemini 2.5 Flash 무료 티어):
+  - 15 RPM, 1,500 RPD (봇 필요량 ~240/일 → 여유 6배)
+
+추가 패키지 불필요:
+  - requests (이미 설치됨) 기반 REST API 직접 호출
+"""
+
+import time
+import requests
+import json
+
+from modules.config import Config
+
+
+class GeminiLLMClient:
+    """
+    Gemini API 클라이언트
+
+    사용법:
+        client = GeminiLLMClient()
+        result = client.request_inference(prompt)  # str | None
+    """
+
+    _GENERATE_URL = (
+        "https://generativelanguage.googleapis.com/v1beta/models"
+        "/{model}:generateContent?key={key}"
+    )
+    # 15 RPM → 최소 4초 간격 (여유 0.1초 추가)
+    _MIN_INTERVAL = 4.1
+    # 클래스 변수: 같은 프로세스 내 재생성 시에도 마지막 호출 시각 유지
+    # (워커 OOM 재시작 후 싱글톤 교체 시에도 스로틀 유효)
+    _class_last_call_ts: float = 0.0
+
+    def __init__(self):
+        self.api_key = Config.GEMINI_API_KEY
+        self.model   = Config.GEMINI_MODEL
+        self._ollama = None          # Ollama 폴백 (lazy init)
+        self._use_gemini = bool(self.api_key)
+
+        if self._use_gemini:
+            print(f"✅ [LLMClient] Primary: Gemini {self.model}")
+        else:
+            print("⚠️  [LLMClient] GEMINI_API_KEY 미설정 → Ollama 전용 모드")
+
+    # ── 내부 헬퍼 ────────────────────────────────────────────────────────────
+
+    def _throttle(self):
+        """15 RPM 제한 준수 — 최소 호출 간격 강제 대기 (클래스 공유 타임스탬프)"""
+        elapsed = time.time() - GeminiLLMClient._class_last_call_ts
+        if elapsed < self._MIN_INTERVAL:
+            time.sleep(self._MIN_INTERVAL - elapsed)
+
+    def _call_gemini(self, prompt: str) -> str | None:
+        """
+        Gemini REST API 단일 호출
+
+        설정:
+          - systemInstruction: JSON 전용 응답 강제
+          - thinkingBudget=0: 내부 추론 비활성 (속도 1.5초 / 토큰 절약)
+          - maxOutputTokens=512: 200은 thinking 소모로 잘리므로 여유 확보
+        """
+        self._throttle()
+
+        url = self._GENERATE_URL.format(model=self.model, key=self.api_key)
+        payload = {
+            "system_instruction": {
+                "parts": [{"text": (
+                    "You are a Korean stock market analyst. "
+                    "Respond with valid JSON only. "
+                    "No markdown, no code blocks, no explanations."
+                )}]
+            },
+            "contents": [{"parts": [{"text": prompt}]}],
+            "generationConfig": {
+                "maxOutputTokens": 512,   # 200→512 (thinking 비활성 후 실제 응답 공간 확보)
+                "temperature": 0.1,       # 결정론적 출력
+                "thinkingConfig": {"thinkingBudget": 0},  # 내부 추론 끔 (속도↑, 토큰↓)
+            },
+        }
+
+        try:
+            resp = requests.post(url, json=payload, timeout=30)
+            GeminiLLMClient._class_last_call_ts = time.time()
+
+            # Rate Limit 초과
+            if resp.status_code == 429:
+                print("[LLMClient] Gemini Rate Limit (429) → Ollama 폴백")
+                return None
+
+            resp.raise_for_status()
+            data = resp.json()
+
+            # thinking 파트 제외, 실제 텍스트 파트만 결합
+            candidate = data.get("candidates", [{}])[0]
+            parts = candidate.get("content", {}).get("parts", [])
+            text = "".join(
+                p.get("text", "") for p in parts
+                if "text" in p and not p.get("thought")
+            ).strip()
+
+            return text if text else None
+
+        except requests.exceptions.Timeout:
+            print("[LLMClient] Gemini Timeout (30s) → Ollama 폴백")
+            return None
+        except Exception as e:
+            print(f"[LLMClient] Gemini Error: {e} → Ollama 폴백")
+            return None
+
+    def _get_ollama(self):
+        """Ollama 폴백 인스턴스 (lazy init — 필요할 때만 로드)"""
+        if self._ollama is None:
+            from modules.services.ollama import OllamaManager
+            self._ollama = OllamaManager()
+            # Ollama 실행 여부 사전 확인 (WinError 10061 조기 감지)
+            try:
+                requests.get(
+                    f"{Config.OLLAMA_API_URL}/api/tags",
+                    timeout=3,
+                )
+            except Exception:
+                print(
+                    f"❌ [LLMClient] Ollama 미실행 (localhost:11434 연결 거부) — "
+                    f"`ollama serve` 명령으로 Ollama를 시작하세요."
+                )
+        return self._ollama
+
+    # ── 공개 인터페이스 ───────────────────────────────────────────────────────
+
+    def request_inference(self, prompt: str, context_data=None) -> str | None:
+        """
+        LLM 추론 요청 — OllamaManager.request_inference()와 동일한 시그니처
+
+        순서:
+          1) GEMINI_API_KEY 있음 → Gemini API 호출
+          2) Gemini 실패(에러/타임아웃/Rate Limit) → Ollama 로컬 폴백
+          3) GEMINI_API_KEY 없음 → 바로 Ollama 사용
+        """
+        if self._use_gemini:
+            result = self._call_gemini(prompt)
+            if result is not None:
+                return result
+            # Gemini 실패 → Ollama 폴백
+            print("[LLMClient] Ollama 폴백 시도 중...")
+
+        return self._get_ollama().request_inference(prompt, context_data)
+
+    # ── OllamaManager 호환 메서드 (ai_council, evaluator 등에서 사용) ─────────
+
+    def check_vram(self) -> float:
+        """VRAM 사용량 반환 (Ollama 측 정보, Gemini 호출 시엔 무관)"""
+        if self._ollama:
+            return self._ollama.check_vram()
+        return 0.0
+
+    def get_gpu_status(self) -> dict:
+        """GPU 상태 반환 (OllamaManager 호환)"""
+        return self._get_ollama().get_gpu_status()
+
+    def unload_model(self):
+        """Ollama 모델 언로드 (LSTM 학습 전 호출용, Gemini는 무작동)"""
+        if self._ollama:
+            try:
+                requests.post(
+                    f"{Config.OLLAMA_API_URL}/api/generate",
+                    json={"model": Config.OLLAMA_MODEL, "keep_alive": 0},
+                    timeout=5,
+                )
+            except Exception:
+                pass
+
+
+# ── 워커 프로세스 전역 싱글톤 ─────────────────────────────────────────────────
+
+_llm_client: GeminiLLMClient | None = None
+
+
+def get_llm_client() -> GeminiLLMClient:
+    """
+    워커 프로세스 내 GeminiLLMClient 싱글톤 반환
+
+    process.py에서 기존 get_ollama() 대신 이 함수를 사용:
+        ollama = get_llm_client()
+        result = ollama.request_inference(prompt)
+    """
+    global _llm_client
+    if _llm_client is None:
+        _llm_client = GeminiLLMClient()
+    return _llm_client