import time
from datetime import datetime

import psutil


class SystemMonitor:
    """Periodic host health check (CPU, RAM, GPU) that reports via a messenger.

    Attributes:
        messenger: Object exposing ``send_message(str)``; may be falsy, in
            which case alerts are computed but not sent.
        ollama_monitor: Object exposing ``get_gpu_status()``; may be falsy,
            in which case the GPU check is skipped.
        last_health_check: ``datetime`` of the last check that actually ran.
    """

    # Tunable thresholds; subclasses or instances may override them.
    CHECK_INTERVAL_SECONDS = 300   # minimum spacing between real checks
    CPU_ALERT_PERCENT = 90         # system-wide CPU usage that triggers an alert
    CPU_RECHECK_DELAY_SECONDS = 3  # pause before the confirming CPU sample
    CPU_TOP_PROCESS_COUNT = 3      # how many processes are listed in the alert
    CPU_TOP_SUM_MIN_PERCENT = 30.0  # below this the CPU reading is distrusted
    RAM_ALERT_PERCENT = 90
    GPU_ALERT_TEMP = 80            # compared against the 'temp' value; reported as °C

    def __init__(self, messenger, ollama_manager):
        """Store collaborators and start the throttle window.

        Args:
            messenger: Alert sink with a ``send_message(str)`` method, or None.
            ollama_manager: Source of GPU status (``get_gpu_status()``), or None.
        """
        self.messenger = messenger
        self.ollama_monitor = ollama_manager
        # Starting the clock here means the first real check happens no
        # sooner than CHECK_INTERVAL_SECONDS after construction.
        self.last_health_check = datetime.now()

    def check_health(self):
        """Check system health and send an alert; runs at most every 5 minutes.

        Calls made before the interval has elapsed return immediately.
        Otherwise CPU, RAM and GPU are checked in that order, and every
        triggered alert is sent as one combined message.

        Note: the CPU check blocks for at least one second, and for about
        five seconds when the first sample is above the threshold.
        """
        now = datetime.now()
        elapsed = (now - self.last_health_check).total_seconds()
        if elapsed < self.CHECK_INTERVAL_SECONDS:
            return
        self.last_health_check = now

        candidates = (self._check_cpu(), self._check_ram(), self._check_gpu())
        alerts = [alert for alert in candidates if alert]
        if not alerts:
            return

        msg = "⚠️ **[System Health Alert]**\n" + "\n".join(alerts)
        if self.messenger:
            self.messenger.send_message(msg)

    def _check_cpu(self):
        """Return a CPU overload alert string, or None if no alert is due."""
        # First sample, measured over a one-second window.
        cpu_usage = psutil.cpu_percent(interval=1)
        if cpu_usage <= self.CPU_ALERT_PERCENT:
            return None

        # Could be a transient spike: wait, then sample again to confirm.
        time.sleep(self.CPU_RECHECK_DELAY_SECONDS)
        cpu_usage_2nd = psutil.cpu_percent(interval=1)
        if cpu_usage_2nd <= self.CPU_ALERT_PERCENT:
            return None

        # Overload confirmed: find the processes responsible.
        top_processes = self._top_cpu_processes(self.CPU_TOP_PROCESS_COUNT)

        # Sanity check: if the heaviest processes together account for very
        # little CPU, treat the system-wide reading as a measurement error.
        total_top_cpu = sum(p['cpu_percent'] for p in top_processes)
        if total_top_cpu < self.CPU_TOP_SUM_MIN_PERCENT:
            print(f"⚠️ [Monitor] Ignored CPU Alert: usage={cpu_usage_2nd}% but top3_sum={total_top_cpu}%")
            return None

        top_3_str = "".join(
            f"\n- {p['name']} ({p['cpu_percent']}%)" for p in top_processes
        )
        return f"🔥 **[CPU Overload]** Usage: `{cpu_usage_2nd}%`\n**Top Processes:**{top_3_str}"

    def _top_cpu_processes(self, count):
        """Return info dicts of the ``count`` busiest processes, busiest first."""
        # NOTE(review): per-process cpu_percent is read without a priming
        # pass; per the psutil docs the first reading for a process is 0.0,
        # so on first sighting these values can under-report. Confirm whether
        # a two-pass measurement is wanted here.
        processes = []
        for proc in psutil.process_iter(['pid', 'name', 'cpu_percent']):
            try:
                info = proc.info
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Defensive: kept from the original best-effort handling.
                continue
            # Skip the Windows idle pseudo-process.
            if info['name'] == 'System Idle Process':
                continue
            # process_iter stores None for attributes it was denied access
            # to; normalise so sorting and summing cannot raise TypeError.
            if info['cpu_percent'] is None:
                info = {**info, 'cpu_percent': 0.0}
            processes.append(info)

        # Descending by CPU usage.
        processes.sort(key=lambda p: p['cpu_percent'], reverse=True)
        return processes[:count]

    def _check_ram(self):
        """Return a RAM usage alert string, or None if usage is acceptable."""
        ram = psutil.virtual_memory()
        if ram.percent <= self.RAM_ALERT_PERCENT:
            return None
        return f"💾 **[RAM High]** Usage: `{ram.percent}%` (Free: {ram.available / 1024**3:.1f}GB)"

    def _check_gpu(self):
        """Return a GPU overheat alert string, or None if no alert is due."""
        if not self.ollama_monitor:
            return None
        gpu_status = self.ollama_monitor.get_gpu_status()
        # A missing status or a None temperature means "no reading"; it must
        # not raise and discard CPU/RAM alerts already collected.
        if not gpu_status:
            return None
        temp = gpu_status.get('temp') or 0
        if temp <= self.GPU_ALERT_TEMP:
            return None
        return f"♨️ **[GPU Overheat]** Temp: `{temp}°C`"