Files
gahusb 7ea1a21487 refactor: web-ai V1 assets → signal_v1/ (graduation prep)
Atomic mv of root V1 assets (main_server.py + modules/ + data/ +
tests/ + entry scripts + docs + logs) into signal_v1/ subdirectory.
load_dotenv() updated to load web-ai/.env explicitly via Path.

Adds web-ai/CLAUDE.md (workspace guide) and web-ai/start.bat
(signal_v1 entry wrapper). Prepares for signal_v2/ Phase 2.

Tests: signal_v1/tests/unit baseline preserved (no regression).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 03:00:11 +09:00

137 lines
5.0 KiB
Python

import requests
import json
import psutil
try:
import pynvml
except ImportError:
pynvml = None
from modules.config import Config
class OllamaManager:
    """Wrapper around the Ollama HTTP API with GPU-memory awareness.

    Responsibilities:
    - Monitor GPU VRAM usage via NVML (when ``pynvml`` is installed).
    - Avoid VRAM contention with LSTM training by waiting / requesting
      an explicit model unload before inference.
    - Manage model residency through the Ollama ``keep_alive`` parameter.
    """

    # Single shared fallback used whenever no GPU is available or an
    # NVML query fails (previously duplicated in two places).
    _NO_GPU_STATUS = {"name": "N/A", "temp": 0, "vram_used": 0, "vram_total": 0, "load": 0}

    def __init__(self, model_name=None, base_url=None):
        """Initialize API endpoints and best-effort NVML monitoring.

        Args:
            model_name: Ollama model tag; defaults to ``Config.OLLAMA_MODEL``.
            base_url:   API root URL; defaults to ``Config.OLLAMA_API_URL``.
        """
        self.model_name = model_name or Config.OLLAMA_MODEL
        self.base_url = base_url or Config.OLLAMA_API_URL
        self.generate_url = f"{self.base_url}/api/generate"
        self.gpu_available = False
        try:
            if pynvml:
                pynvml.nvmlInit()
                # GPU index 0 (the 5070 Ti) is assumed to host inference.
                self.handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                self.gpu_available = True
                print("✅ [OllamaManager] NVIDIA GPU Monitoring On")
            else:
                print("⚠️ [OllamaManager] 'nvidia-ml-py' not installed. GPU monitoring disabled.")
        except Exception as e:
            # Monitoring is optional: never let NVML failures break startup.
            print(f"⚠️ [OllamaManager] GPU Init Failed: {e}")

    def check_vram(self):
        """Return current GPU VRAM usage in GiB (0.0 when unavailable)."""
        if not self.gpu_available:
            return 0.0
        try:
            info = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            return info.used / 1024**3
        except Exception:
            # Best-effort: a failed NVML query is treated as zero usage.
            return 0.0

    def get_gpu_status(self):
        """Return a GPU summary dict.

        Keys: ``name`` (device name), ``temp`` (°C), ``vram_used`` /
        ``vram_total`` (GiB, rounded to 1 decimal), ``load`` (utilization %).
        Falls back to the zeroed ``_NO_GPU_STATUS`` shape on any failure.
        """
        if not self.gpu_available:
            return dict(self._NO_GPU_STATUS)
        try:
            name = pynvml.nvmlDeviceGetName(self.handle)
            if isinstance(name, bytes):
                # Older NVML bindings return bytes rather than str.
                name = name.decode('utf-8')
            temp = pynvml.nvmlDeviceGetTemperature(self.handle, pynvml.NVML_TEMPERATURE_GPU)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            util = pynvml.nvmlDeviceGetUtilizationRates(self.handle)
            return {
                "name": name,
                "temp": temp,
                "vram_used": round(mem_info.used / 1024**3, 1),
                "vram_total": round(mem_info.total / 1024**3, 1),
                "load": util.gpu,
            }
        except Exception as e:
            print(f"⚠️ GPU Status Check Failed: {e}")
            return dict(self._NO_GPU_STATUS)

    def is_training_active(self):
        """Heuristically detect LSTM training to avoid GPU memory clashes.

        NOTE(review): this flags *any* VRAM consumer above
        ``Config.VRAM_WARNING_THRESHOLD`` as "training" — including a
        resident Ollama model itself. Confirm the threshold semantics.
        """
        try:
            import torch  # local import: torch may be absent in some envs
            if torch.cuda.is_available():
                return self.check_vram() > Config.VRAM_WARNING_THRESHOLD
        except Exception:
            # Best-effort check; any failure is treated as "not training".
            pass
        return False

    def _wait_for_training_slot(self, attempts=12, interval=5):
        """Block while LSTM training appears active (max attempts*interval s)."""
        import time as _time
        for _ in range(attempts):
            if not self.is_training_active():
                return
            print("[Ollama] Waiting for LSTM training to finish...")
            _time.sleep(interval)

    def _unload_if_vram_high(self):
        """Ask Ollama to evict the model when VRAM crosses the threshold."""
        vram = self.check_vram()
        if vram > Config.VRAM_WARNING_THRESHOLD:
            print(f"[OllamaManager] High VRAM Usage ({vram:.1f}GB). Requesting unload.")
            try:
                # keep_alive=0 tells Ollama to unload the model immediately.
                requests.post(self.generate_url,
                              json={"model": self.model_name, "keep_alive": 0}, timeout=5)
            except Exception as e:
                print(f"Warning: Failed to unload model: {e}")

    def request_inference(self, prompt, context_data=None):
        """Send a blocking JSON-mode inference request to Ollama.

        Waits for LSTM training to finish (up to 60 s), sheds VRAM if the
        warning threshold is exceeded, then POSTs the prompt.

        Args:
            prompt:       the full prompt text to send.
            context_data: unused; kept for interface compatibility.

        Returns:
            The raw ``response`` string from Ollama, or ``None`` on
            timeout / any request error.
        """
        self._wait_for_training_slot()
        self._unload_if_vram_high()
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "format": "json",
            "options": {
                "num_ctx": Config.OLLAMA_NUM_CTX,          # 4096 (≈2x faster)
                "num_predict": Config.OLLAMA_NUM_PREDICT,  # cap response tokens
                "temperature": 0.1,                        # near-deterministic for stable JSON parsing
                "num_gpu": 1,
                "num_thread": Config.OLLAMA_NUM_THREAD     # from Config (default 8)
            },
            "keep_alive": "5m"  # keep resident 5 min, then free VRAM
        }
        try:
            response = requests.post(self.generate_url, json=payload, timeout=90)  # 180s → 90s
            response.raise_for_status()
            return response.json().get('response')
        except requests.exceptions.Timeout:
            print(f"❌ Inference Timeout (90s): {self.model_name}")
            return None
        except Exception as e:
            print(f"❌ Inference Error: {e}")
            return None