Files
ai-trade/signal_v1/modules/utils/monitor.py
gahusb 7ea1a21487 refactor: web-ai V1 assets → signal_v1/ (graduation prep)
Atomic mv of root V1 assets (main_server.py + modules/ + data/ +
tests/ + entry scripts + docs + logs) into signal_v1/ subdirectory.
load_dotenv() updated to load web-ai/.env explicitly via Path.

Adds web-ai/CLAUDE.md (workspace guide) and web-ai/start.bat
(signal_v1 entry wrapper). Prepares for signal_v2/ Phase 2.

Tests: signal_v1/tests/unit baseline preserved (no regression).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 03:00:11 +09:00

111 lines
4.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import psutil
from datetime import datetime
from modules.config import Config
class SystemMonitor:
    """Periodic host health check (CPU, RAM, GPU) with a CPU circuit breaker.

    ``check_health`` samples the machine at most once every
    ``_HEALTH_CHECK_INTERVAL_SEC`` seconds. When CPU usage exceeds
    ``Config.CPU_CIRCUIT_BREAKER_THRESHOLD`` for
    ``Config.CPU_CIRCUIT_BREAKER_CONSECUTIVE`` checks in a row, the breaker
    opens; ``is_cpu_critical`` exposes that state to callers. Alerts are
    currently printed to the console only.
    """

    # Minimum spacing between two real health checks, in seconds (3 minutes).
    _HEALTH_CHECK_INTERVAL_SEC = 180
    # Minimum time the breaker stays open before it may close again, in
    # seconds. The OPEN alert text mentions "5 minutes"; keep them in sync.
    _CIRCUIT_RECOVERY_SEC = 300
    # RAM usage (percent) above which an alert is raised.
    _RAM_ALERT_PERCENT = 90
    # GPU temperature above which an alert is raised (reported as degrees C).
    _GPU_ALERT_TEMP_C = 80
    # Pseudo-processes that must not show up in the "top processes" list.
    _IDLE_PROCESS_NAMES = ('System Idle Process', 'Idle')

    def __init__(self, messenger, ollama_manager):
        """Store collaborators and start with a closed circuit breaker.

        Args:
            messenger: Notification client. Currently unused because the
                Telegram alert path in ``check_health`` is commented out.
            ollama_manager: Object providing ``get_gpu_status()``; a falsy
                value disables the GPU check.
        """
        self.messenger = messenger
        self.ollama_monitor = ollama_manager
        # The clock starts at construction time, so the first real check
        # only happens one full interval after start-up.
        self.last_health_check = datetime.now()

        # CPU circuit breaker state
        self._cpu_overload_count = 0      # consecutive overloaded checks
        self._circuit_open = False        # whether the breaker is tripped
        self._circuit_open_since = None   # datetime the breaker opened

    def is_cpu_critical(self):
        """Return True while the CPU circuit breaker is open.

        The original docstring states that callers skip the analysis cycle
        when this returns True.
        """
        return self._circuit_open

    def reset_circuit(self):
        """Manually reset the circuit breaker and the overload counter."""
        if self._circuit_open:
            print("[Monitor] CPU Circuit Breaker RESET")
        self._circuit_open = False
        self._cpu_overload_count = 0
        self._circuit_open_since = None

    def check_health(self):
        """Check CPU, RAM and GPU and print any resulting alerts.

        Calls made less than ``_HEALTH_CHECK_INTERVAL_SEC`` seconds after the
        previous real check return immediately. A real check blocks for about
        one second while CPU usage is sampled.
        """
        now = datetime.now()
        elapsed = (now - self.last_health_check).total_seconds()
        if elapsed < self._HEALTH_CHECK_INTERVAL_SEC:
            return
        self.last_health_check = now

        alerts = []
        alerts.extend(self._check_cpu(now))
        alerts.extend(self._check_ram())
        alerts.extend(self._check_gpu())

        # Alert delivery: console only (Telegram is disabled).
        for alert in alerts:
            print(f"[Monitor] {alert}")

        # [Disabled] Telegram alert - re-enable when needed
        # msg = "🔔 <b>[System Health Alert]</b>\n" + "\n".join(alerts)
        # if self.messenger:
        #     self.messenger.send_message(msg)

    def _check_cpu(self, now):
        """Sample CPU usage, update the circuit breaker, return CPU alerts."""
        alerts = []
        cpu_usage = psutil.cpu_percent(interval=1)  # 1 s sample for accuracy

        if cpu_usage > Config.CPU_CIRCUIT_BREAKER_THRESHOLD:
            self._cpu_overload_count += 1

            # NOTE: psutil documents that the first per-process cpu_percent
            # sample is a meaningless 0.0, so the first listing may be flat.
            top_3_str = self._format_top_processes(
                proc.info
                for proc in psutil.process_iter(['pid', 'name', 'cpu_percent'])
            )

            if self._cpu_overload_count >= Config.CPU_CIRCUIT_BREAKER_CONSECUTIVE:
                # Trip the breaker once; while it is already open, stay
                # silent instead of repeating the alert on every check.
                if not self._circuit_open:
                    self._circuit_open = True
                    self._circuit_open_since = now
                    alerts.append(
                        f"🔴 [CPU Circuit Breaker OPEN] {cpu_usage}% × {self._cpu_overload_count}회 연속\n"
                        f"⛔ 분석 사이클 일시 중단 (5분 후 자동 복구)\nTop Processes:{top_3_str}"
                    )
                    print(f"[Monitor] CPU Circuit Breaker OPEN! CPU={cpu_usage}%")
            else:
                alerts.append(
                    f"⚠️ [CPU Overload] Usage: {cpu_usage}% ({self._cpu_overload_count}회)\nTop Processes:{top_3_str}"
                )
            return alerts

        # CPU is back to normal -> reset the consecutive-overload counter.
        if self._cpu_overload_count > 0:
            print(f"[Monitor] CPU 정상화 ({cpu_usage}%). 카운터 리셋.")
        self._cpu_overload_count = 0

        # Close the breaker once it has been open long enough.
        if self._circuit_open and self._circuit_open_since:
            open_for = (now - self._circuit_open_since).total_seconds()
            if open_for >= self._CIRCUIT_RECOVERY_SEC:
                self._circuit_open = False
                self._circuit_open_since = None
                alerts.append("✅ [CPU Circuit Breaker CLOSED] 시스템 안정화. 분석 재개.")
                print("[Monitor] CPU Circuit Breaker CLOSED. 분석 재개.")
        return alerts

    def _check_ram(self):
        """Return a RAM alert when usage exceeds ``_RAM_ALERT_PERCENT``."""
        ram = psutil.virtual_memory()
        if ram.percent > self._RAM_ALERT_PERCENT:
            return [f"⚠️ [RAM High] Usage: {ram.percent}% (Free: {ram.available / 1024**3:.1f}GB)"]
        return []

    def _check_gpu(self):
        """Return a GPU alert when the reported temperature is too high."""
        if not self.ollama_monitor:
            return []
        # NOTE(review): get_gpu_status() is assumed to return a dict-like
        # object; a missing or None reading is treated as 0 so that an
        # unavailable sensor cannot crash the health check.
        gpu_status = self.ollama_monitor.get_gpu_status() or {}
        temp = gpu_status.get('temp', 0) or 0
        if temp > self._GPU_ALERT_TEMP_C:
            return [f"🔥 [GPU Overheat] Temp: {temp}°C"]
        return []

    @classmethod
    def _format_top_processes(cls, process_infos, limit=3):
        """Render the busiest processes as ``"\\n- name (cpu%)"`` lines.

        Args:
            process_infos: Iterable of dicts with ``name`` and
                ``cpu_percent`` keys (the shape of psutil's ``proc.info``).
            limit: Maximum number of processes to include.

        Returns:
            The concatenated lines, or an empty string when nothing is left
            after filtering out the idle pseudo-processes.
        """
        # psutil substitutes None for attributes it was denied access to;
        # count those as 0.0 so sorting never compares None with a float.
        ranked = sorted(
            (
                info for info in process_infos
                if info['name'] not in cls._IDLE_PROCESS_NAMES
            ),
            key=lambda info: info['cpu_percent'] or 0.0,
            reverse=True,
        )
        return "".join(
            f"\n- {info['name']} ({info['cpu_percent'] or 0.0}%)"
            for info in ranked[:limit]
        )