import time

import requests
import json
import psutil

try:
    import pynvml
except ImportError:
    pynvml = None

from modules.config import Config


class OllamaManager:
    """
    Wrapper for managing Ollama API sessions and preventing memory leaks.
    - Monitors GPU VRAM usage
    - Manages memory via the keep_alive parameter
    """

    def __init__(self, model_name=None, base_url=None):
        self.model_name = model_name or Config.OLLAMA_MODEL
        self.base_url = base_url or Config.OLLAMA_API_URL
        self.generate_url = f"{self.base_url}/api/generate"
        self.gpu_available = False
        try:
            if pynvml:
                pynvml.nvmlInit()
                self.handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # GPU 0 (5070 Ti)
                self.gpu_available = True
                print("✅ [OllamaManager] NVIDIA GPU Monitoring On")
            else:
                print("⚠️ [OllamaManager] 'nvidia-ml-py' not installed. GPU monitoring disabled.")
        except Exception as e:
            print(f"⚠️ [OllamaManager] GPU Init Failed: {e}")

    def check_vram(self):
        """Return current GPU VRAM usage in GB."""
        if not self.gpu_available:
            return 0.0
        try:
            info = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            used_gb = info.used / 1024**3
            return used_gb
        except Exception:
            return 0.0

    def get_gpu_status(self):
        """Return overall GPU status (name, temperature, memory, utilization)."""
        if not self.gpu_available:
            return {"name": "N/A", "temp": 0, "vram_used": 0, "vram_total": 0, "load": 0}
        try:
            # GPU name
            name = pynvml.nvmlDeviceGetName(self.handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')

            # Temperature
            temp = pynvml.nvmlDeviceGetTemperature(self.handle, pynvml.NVML_TEMPERATURE_GPU)

            # Memory
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            vram_used = mem_info.used / 1024**3
            vram_total = mem_info.total / 1024**3

            # Utilization
            util = pynvml.nvmlDeviceGetUtilizationRates(self.handle)
            load = util.gpu

            return {
                "name": name,
                "temp": temp,
                "vram_used": round(vram_used, 1),
                "vram_total": round(vram_total, 1),
                "load": load
            }
        except Exception as e:
            print(f"⚠️ GPU Status Check Failed: {e}")
            return {"name": "N/A", "temp": 0, "vram_used": 0, "vram_total": 0, "load": 0}

    def is_training_active(self):
        """Check whether LSTM training is running (to avoid GPU memory contention)."""
        try:
            import torch
            if torch.cuda.is_available():
                # Estimate training activity from current VRAM usage
                vram = self.check_vram()
                return vram > Config.VRAM_WARNING_THRESHOLD
        except Exception:
            pass
        return False

    def request_inference(self, prompt, context_data=None):
        """
        Send an inference request to Ollama.
        Waits while LSTM training is active (to avoid GPU memory contention).
        """
        # Wait up to 60 seconds (12 x 5s) for LSTM training to finish
        for _ in range(12):
            if not self.is_training_active():
                break
            print("[Ollama] Waiting for LSTM training to finish...")
            time.sleep(5)

        vram = self.check_vram()
        if vram > Config.VRAM_WARNING_THRESHOLD:
            print(f"[OllamaManager] High VRAM Usage ({vram:.1f}GB). Requesting unload.")
            try:
                # Set keep_alive=0 to unload the model immediately
                requests.post(self.generate_url,
                              json={"model": self.model_name, "keep_alive": 0},
                              timeout=5)
            except Exception as e:
                print(f"Warning: Failed to unload model: {e}")

        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "format": "json",
            "options": {
                "num_ctx": Config.OLLAMA_NUM_CTX,          # 4096 (roughly 2x faster)
                "num_predict": Config.OLLAMA_NUM_PREDICT,  # Cap on response tokens
                "temperature": 0.1,                        # More deterministic (stable JSON parsing)
                "num_gpu": 1,
                "num_thread": Config.OLLAMA_NUM_THREAD     # From Config (default 8)
            },
            "keep_alive": "5m"  # Keep model loaded for 5 minutes (limits idle VRAM usage)
        }

        try:
            response = requests.post(self.generate_url, json=payload, timeout=90)  # reduced from 180s to 90s
            response.raise_for_status()
            return response.json().get('response')
        except requests.exceptions.Timeout:
            print(f"❌ Inference Timeout (90s): {self.model_name}")
            return None
        except Exception as e:
            print(f"❌ Inference Error: {e}")
            return None
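

# Minimal usage sketch (illustrative only, not part of the production flow):
# assumes Config defines OLLAMA_MODEL, OLLAMA_API_URL, OLLAMA_NUM_CTX,
# OLLAMA_NUM_PREDICT, OLLAMA_NUM_THREAD, and VRAM_WARNING_THRESHOLD, and that
# an Ollama server is reachable at the configured URL.
if __name__ == "__main__":
    manager = OllamaManager()
    print("GPU status:", manager.get_gpu_status())
    # The prompt is a hypothetical example; request_inference returns the raw
    # JSON-formatted response string, or None on timeout/error.
    result = manager.request_inference('Respond with a JSON object: {"status": "ok"}')
    print("Model response:", result)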