refactor: web-ai V1 assets → signal_v1/ (graduation prep)
Atomic mv of root V1 assets (main_server.py + modules/ + data/ + tests/ + entry scripts + docs + logs) into signal_v1/ subdirectory.

- load_dotenv() updated to load web-ai/.env explicitly via Path.
- Adds web-ai/CLAUDE.md (workspace guide) and web-ai/start.bat (signal_v1 entry wrapper).
- Prepares for signal_v2/ Phase 2.

Tests: signal_v1/tests/unit baseline preserved (no regression).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
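The .env change above amounts to roughly the following pattern (a minimal sketch; the exact entry script and the one-level-up depth from signal_v1/ to web-ai/ are assumptions, since the edited file is not shown in this diff):

    from pathlib import Path
    from dotenv import load_dotenv

    # Resolve web-ai/.env relative to this file instead of the current working
    # directory, since the entry script now lives one level deeper in signal_v1/.
    ENV_PATH = Path(__file__).resolve().parent.parent / ".env"
    load_dotenv(dotenv_path=ENV_PATH)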
signal_v1/modules/services/ollama.py (new file, 136 lines)
@@ -0,0 +1,136 @@
import requests
import json
import psutil
try:
    import pynvml
except ImportError:
    pynvml = None

from modules.config import Config

class OllamaManager:
    """
    Wrapper around the Ollama API for session management and memory-leak prevention
    - Monitors GPU VRAM usage
    - Manages memory via the keep_alive parameter
    """
    def __init__(self, model_name=None, base_url=None):
        self.model_name = model_name or Config.OLLAMA_MODEL
        self.base_url = base_url or Config.OLLAMA_API_URL
        self.generate_url = f"{self.base_url}/api/generate"

        self.gpu_available = False
        try:
            if pynvml:
                pynvml.nvmlInit()
                self.handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # GPU index 0 (5070 Ti)
                self.gpu_available = True
                print("✅ [OllamaManager] NVIDIA GPU Monitoring On")
            else:
                print("⚠️ [OllamaManager] 'nvidia-ml-py' not installed. GPU monitoring disabled.")
        except Exception as e:
            print(f"⚠️ [OllamaManager] GPU Init Failed: {e}")

    def check_vram(self):
        """Return the current GPU VRAM usage in GB."""
        if not self.gpu_available:
            return 0.0
        try:
            info = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            used_gb = info.used / 1024**3
            return used_gb
        except Exception:
            return 0.0

    def get_gpu_status(self):
        """Return overall GPU status (temperature, memory, utilization, name)."""
        if not self.gpu_available:
            return {"name": "N/A", "temp": 0, "vram_used": 0, "vram_total": 0, "load": 0}

        try:
            # GPU name
            name = pynvml.nvmlDeviceGetName(self.handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')

            # Temperature
            temp = pynvml.nvmlDeviceGetTemperature(self.handle, pynvml.NVML_TEMPERATURE_GPU)
            # Memory
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            vram_used = mem_info.used / 1024**3
            vram_total = mem_info.total / 1024**3
            # Utilization
            util = pynvml.nvmlDeviceGetUtilizationRates(self.handle)
            load = util.gpu

            return {
                "name": name,
                "temp": temp,
                "vram_used": round(vram_used, 1),
                "vram_total": round(vram_total, 1),
                "load": load
            }
        except Exception as e:
            print(f"⚠️ GPU Status Check Failed: {e}")
            return {"name": "N/A", "temp": 0, "vram_used": 0, "vram_total": 0, "load": 0}

    def is_training_active(self):
        """Check whether LSTM training is in progress (avoids GPU memory contention)."""
        try:
            import torch
            if torch.cuda.is_available():
                # Infer training activity from VRAM usage
                vram = self.check_vram()
                return vram > Config.VRAM_WARNING_THRESHOLD
        except Exception:
            pass
        return False

    def request_inference(self, prompt, context_data=None):
        """
        Send an inference request to Ollama
        - Waits while LSTM training is active (avoids GPU memory contention)
        """
        # If LSTM training is active, wait up to 60 seconds (12 x 5s)
        import time as _time
        for _ in range(12):
            if not self.is_training_active():
                break
            print("[Ollama] Waiting for LSTM training to finish...")
            _time.sleep(5)

        vram = self.check_vram()
        if vram > Config.VRAM_WARNING_THRESHOLD:
            print(f"[OllamaManager] High VRAM Usage ({vram:.1f}GB). Requesting unload.")
            try:
                # keep_alive=0 asks Ollama to unload the model immediately
                requests.post(self.generate_url,
                              json={"model": self.model_name, "keep_alive": 0}, timeout=5)
            except Exception as e:
                print(f"Warning: Failed to unload model: {e}")

        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "format": "json",
            "options": {
                "num_ctx": Config.OLLAMA_NUM_CTX,  # 4096 (2x faster)
                "num_predict": Config.OLLAMA_NUM_PREDICT,  # response token limit
                "temperature": 0.1,  # more deterministic (stable JSON parsing)
                "num_gpu": 1,
                "num_thread": Config.OLLAMA_NUM_THREAD  # from Config (default 8)
            },
            "keep_alive": "5m"  # keep loaded for 5 minutes (limits idle VRAM usage)
        }

        try:
            response = requests.post(self.generate_url, json=payload, timeout=90)  # reduced from 180s to 90s
            response.raise_for_status()
            return response.json().get('response')
        except requests.exceptions.Timeout:
            print(f"❌ Inference Timeout (90s): {self.model_name}")
            return None
        except Exception as e:
            print(f"❌ Inference Error: {e}")
            return None
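Minimal usage sketch for the new module (not part of this commit; assumes signal_v1/ is on sys.path so that modules.services.ollama resolves, and that Config defines the referenced OLLAMA_* and VRAM_WARNING_THRESHOLD values):

    from modules.services.ollama import OllamaManager

    manager = OllamaManager()  # defaults to Config.OLLAMA_MODEL / Config.OLLAMA_API_URL
    status = manager.get_gpu_status()
    print(f"GPU: {status['name']} | {status['vram_used']}/{status['vram_total']} GB | load {status['load']}%")

    raw = manager.request_inference("Summarize the latest signal as JSON.")
    if raw is not None:
        import json
        result = json.loads(raw)  # the payload requests format="json", so the response should parse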