464 lines
17 KiB
Python
464 lines
17 KiB
Python
import os
|
|
import time
|
|
import torch
|
|
import torch.nn as nn
|
|
import numpy as np
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
from modules.config import Config
|
|
|
|
# Enable cuDNN benchmark mode (auto-selects the best kernels for fixed input sizes)
|
|
torch.backends.cudnn.benchmark = True
|
|
|
|
|
|
class Attention(nn.Module):
    """Softmax-weighted pooling of a sequence of hidden states along dim 1.

    A single linear layer scores every position, the scores are normalised
    with a softmax over dim 1, and the inputs are summed using those weights.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Maps each hidden vector to one scalar score.
        self.attn = nn.Linear(hidden_size, 1)

    def forward(self, lstm_output):
        """Pool ``lstm_output`` into one context vector per batch element.

        Args:
            lstm_output: Tensor whose last dimension is ``hidden_size``;
                expected to be (batch, seq_len, hidden_size) as produced by a
                ``batch_first`` LSTM.

        Returns:
            Tuple ``(context, attn_weights)``: the weighted sum over dim 1 and
            the softmax weights that produced it.
        """
        scores = self.attn(lstm_output)
        attn_weights = scores.softmax(dim=1)
        context = (lstm_output * attn_weights).sum(dim=1)
        return context, attn_weights
|
|
|
|
|
|
class AdvancedLSTM(nn.Module):
    """Stacked LSTM followed by attention pooling and a small MLP head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state; the head narrows it to
            ``hidden_size // 2`` and ``hidden_size // 4`` before the output.
        num_layers: Number of stacked LSTM layers.
        output_size: Width of the final linear layer.
        dropout: Dropout probability used between LSTM layers and inside the
            head.
    """

    def __init__(self, input_size=1, hidden_size=512, num_layers=4, output_size=1, dropout=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # nn.LSTM applies dropout only between stacked layers; a non-zero
        # value with a single layer has no effect and makes PyTorch warn.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=lstm_dropout)
        self.attention = Attention(hidden_size)
        # Layer order (and therefore state_dict indices) must stay stable so
        # that previously saved checkpoints keep loading.
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, hidden_size // 4),
            nn.ReLU(),
            nn.Linear(hidden_size // 4, output_size),
        )

    def forward(self, x):
        """Run the network on a batch-first input.

        Args:
            x: Input tensor; dim 0 is used as the batch size and the LSTM is
                configured with ``batch_first=True``.

        Returns:
            Output of the MLP head applied to the attention-pooled LSTM states.
        """
        # Allocate the zero initial states directly on the input's device and
        # dtype instead of creating them on the CPU and copying every call.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        lstm_out, _ = self.lstm(x, (h0, c0))
        context, _ = self.attention(lstm_out)
        return self.fc(context)
|
|
|
|
|
|
def _unload_ollama():
    """Ask the Ollama server to unload its model before LSTM training.

    Sends a generate request with ``keep_alive=0`` so the GPU memory held by
    the model can be reclaimed. This is best-effort: any failure (``requests``
    not installed, server unreachable, HTTP error status) is reported on
    stdout and otherwise ignored so training can still proceed.
    """
    try:
        import requests

        url = f"{Config.OLLAMA_API_URL}/api/generate"
        response = requests.post(
            url,
            json={"model": Config.OLLAMA_MODEL, "keep_alive": 0},
            timeout=5,
        )
        # Only claim success when the server actually accepted the request.
        response.raise_for_status()
    except Exception as e:
        print(f"[AI] Ollama unload skipped: {e}")
        return

    print("[AI] Ollama model unloaded (GPU memory freed)")
    time.sleep(1)  # wait for the memory to be released
|
|
|
|
|
|
def _preload_ollama():
    """Ask the Ollama server to load its model again after LSTM training.

    Sends an empty-prompt generate request with ``keep_alive="10m"``. This is
    best-effort: any failure (``requests`` not installed, server unreachable,
    timeout, HTTP error status) is reported on stdout and otherwise ignored.
    """
    try:
        import requests

        url = f"{Config.OLLAMA_API_URL}/api/generate"
        response = requests.post(
            url,
            json={
                "model": Config.OLLAMA_MODEL,
                "prompt": "",
                "keep_alive": "10m",
            },
            timeout=10,
        )
        response.raise_for_status()
    except Exception as e:
        # Never let a reload problem break the caller; just make it visible.
        print(f"[AI] Ollama preload skipped: {e}")
|
|
|
|
|
|
def _log_gpu_memory(tag=""):
|
|
"""GPU 메모리 사용량 로깅"""
|
|
if torch.cuda.is_available():
|
|
allocated = torch.cuda.memory_allocated(0) / 1024**3
|
|
reserved = torch.cuda.memory_reserved(0) / 1024**3
|
|
print(f"[AI GPU {tag}] Allocated: {allocated:.2f}GB / Reserved: {reserved:.2f}GB")
|
|
|
|
|
|
class PricePredictor:
    """Deep-learning stock price predictor (GPU-optimised).

    - Keeps the whole training set resident on the compute device to minimise
      CPU<->GPU transfers.
    - Unloads / reloads the Ollama model around training to free GPU memory.
    - Early stopping plus mixed precision (FP16) where the GPU supports it.
    - Per-ticker model checkpoints.
    """

    def __init__(self):
        """Build the model, pick the compute device and set hyper-parameters."""
        self.scaler = MinMaxScaler(feature_range=(0, 1))

        self.hidden_size = 512
        self.num_layers = 4

        self.model = AdvancedLSTM(input_size=1, hidden_size=self.hidden_size,
                                  num_layers=self.num_layers, dropout=0.3)
        self.criterion = nn.MSELoss()

        # CUDA setup: start from CPU defaults and upgrade when a GPU is usable.
        self.device = torch.device('cpu')
        self.use_amp = False

        if torch.cuda.is_available():
            try:
                gpu_name = torch.cuda.get_device_name(0)
                vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3

                self.device = torch.device('cuda')
                self.model.to(self.device)

                # Mixed precision (compute capability >= 7.0: Volta or newer)
                if torch.cuda.get_device_capability(0)[0] >= 7:
                    self.use_amp = True

                # Warm-up: run one forward pass so CUDA kernels get set up now.
                dummy = torch.zeros(1, 60, 1, device=self.device)
                with torch.no_grad():
                    _ = self.model(dummy)
                torch.cuda.synchronize()

                print(f"[AI] GPU Mode: {gpu_name} ({vram_gb:.1f}GB)"
                      f" | FP16={'ON' if self.use_amp else 'OFF'}"
                      f" | cuDNN Benchmark=ON")
                _log_gpu_memory("init")

            except Exception as e:
                print(f"[AI] GPU Init Failed ({e}), falling back to CPU")
                self.device = torch.device('cpu')
                # AMP may already have been switched on above; it must not
                # stay enabled once we are back on the CPU, otherwise a CUDA
                # GradScaler/autocast would be used with a CPU model.
                self.use_amp = False
                self.model.to(self.device)
        else:
            print("[AI] No CUDA GPU detected. Running on CPU.")

        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=0.001, weight_decay=1e-4)
        # [v2.0] Learning-rate scheduler (halves lr when val_loss plateaus)
        self.lr_scheduler = self._new_lr_scheduler()
        self.scaler_amp = torch.amp.GradScaler('cuda') if self.use_amp else None

        self.batch_size = 64
        self.max_epochs = 200
        self.seq_length = 60
        self.patience = 15
        # [v2.0] Gradient clipping threshold (guards against exploding gradients)
        self.max_grad_norm = 1.0

        self.training_status = {
            "is_training": False,
            "loss": 0.0,
            "current_ticker": None
        }

    def _new_lr_scheduler(self):
        """Return a fresh ReduceLROnPlateau scheduler bound to ``self.optimizer``."""
        # The deprecated ``verbose`` argument is intentionally not passed.
        return torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='min', factor=0.5, patience=7, min_lr=1e-6
        )

    def _autocast(self):
        """Return a CUDA autocast context that is active only when AMP is enabled."""
        return torch.amp.autocast('cuda', enabled=self.use_amp)

    @staticmethod
    def verify_hardware():
        """Report whether a CUDA GPU is usable.

        Returns:
            True when CUDA is available and the device could be queried,
            False otherwise (no GPU, or querying it raised).
        """
        if torch.cuda.is_available():
            try:
                gpu_name = torch.cuda.get_device_name(0)
                vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
                print(f"[AI Check] {gpu_name} ({vram_gb:.1f}GB VRAM) | cuDNN={torch.backends.cudnn.is_available()}")
                return True
            except Exception as e:
                print(f"[AI Check] GPU Error: {e}")
                return False
        print("[AI Check] No GPU. CPU Mode.")
        return False

    def _get_checkpoint_path(self, ticker):
        """Return the checkpoint file path for ``ticker`` inside ``Config.MODEL_DIR``."""
        return os.path.join(Config.MODEL_DIR, f"{ticker}_lstm.pt")

    def _load_checkpoint(self, ticker):
        """Load model and optimizer state for ``ticker`` if a checkpoint exists.

        Returns:
            True when a checkpoint was found and loaded, False when there is
            none or loading failed (the failure is printed, not raised).
        """
        path = self._get_checkpoint_path(ticker)
        if os.path.exists(path):
            try:
                checkpoint = torch.load(path, map_location=self.device, weights_only=True)
                self.model.load_state_dict(checkpoint['model_state_dict'])
                self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                print(f"[AI] Checkpoint loaded: {ticker}")
                return True
            except Exception as e:
                print(f"[AI] Checkpoint load failed ({ticker}): {e}")
        return False

    def _save_checkpoint(self, ticker, epoch, loss):
        """Persist model and optimizer state for ``ticker``.

        Failures are printed and swallowed so a disk problem never discards a
        finished prediction.
        """
        path = self._get_checkpoint_path(ticker)
        try:
            # Create the checkpoint directory on first use; without it
            # torch.save fails and no checkpoint is ever written.
            os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
            torch.save({
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'epoch': epoch,
                'loss': loss
            }, path)
        except Exception as e:
            print(f"[AI] Checkpoint save failed ({ticker}): {e}")

    def train_and_predict(self, prices, forecast_days=1, ticker=None):
        """Train on ``prices`` and predict the next value.

        Args:
            prices: Sequence of numeric prices; at least ``seq_length + 10``
                entries are required.
            forecast_days: Accepted for interface compatibility; currently
                unused, a single next-step prediction is always produced.
            ticker: Optional identifier. When given, a per-ticker checkpoint
                is loaded before and saved after training.

        Returns:
            None when there is not enough data, otherwise a dict with the keys
            ``current``, ``predicted``, ``change_rate``, ``trend``, ``loss``,
            ``val_loss``, ``confidence``, ``epochs``, ``device`` and ``lr``.
        """
        if len(prices) < (self.seq_length + 10):
            return None

        is_gpu = self.device.type == 'cuda'

        # --- Unload the Ollama model (free GPU memory) ---
        if is_gpu:
            _unload_ollama()
            torch.cuda.empty_cache()
            _log_gpu_memory("pre-train")

        try:
            return self._run_training(prices, ticker, is_gpu)
        finally:
            # Runs on success and on failure: never leave the status flag
            # stuck at "training" and always hand the GPU back to Ollama.
            self.training_status["is_training"] = False
            if is_gpu:
                # The training tensors were locals of _run_training and are
                # released once it returns, so the cache can be emptied here.
                torch.cuda.empty_cache()
                _log_gpu_memory("post-train")
                _preload_ollama()

    def _run_training(self, prices, ticker, is_gpu):
        """Fit the model on ``prices`` and build the prediction result dict."""
        t_start = time.time()

        # 1. Pre-processing (numpy work on the CPU)
        data = np.array(prices).reshape(-1, 1)
        scaled_data = self.scaler.fit_transform(data)

        n_windows = len(scaled_data) - self.seq_length
        x_seqs = [scaled_data[i:i + self.seq_length] for i in range(n_windows)]
        y_seqs = [scaled_data[i + self.seq_length] for i in range(n_windows)]

        # 2. Build tensors and move them to the device once (no further
        #    host<->device transfers during training)
        x_all = torch.FloatTensor(np.array(x_seqs)).to(self.device)
        y_all = torch.FloatTensor(np.array(y_seqs)).to(self.device)

        # Chronological validation split (80/20)
        split_idx = int(len(x_all) * 0.8)
        x_train, y_train = x_all[:split_idx], y_all[:split_idx]
        x_val, y_val = x_all[split_idx:], y_all[split_idx:]

        dataset_size = len(x_train)

        # 3. Load checkpoint; fine-tuning an existing model gets fewer epochs
        has_checkpoint = False
        if ticker:
            has_checkpoint = self._load_checkpoint(ticker)
        max_epochs = 50 if has_checkpoint else self.max_epochs

        # 4. Training (whole dataset resident on the device, no DataLoader)
        # [v2.0] Reset the learning rate and its scheduler for this run
        self.optimizer.param_groups[0]['lr'] = 0.001 if not has_checkpoint else 0.0005
        self.lr_scheduler = self._new_lr_scheduler()

        self.model.train()
        self.training_status["is_training"] = True
        if ticker:
            self.training_status["current_ticker"] = ticker

        best_val_loss = float('inf')
        best_model_state = None  # [v2.0] snapshot of the best weights seen
        patience_counter = 0
        final_loss = 0.0
        actual_epochs = 0

        for epoch in range(max_epochs):
            # --- Training (shuffle + mini-batches on the device) ---
            perm = torch.randperm(dataset_size, device=self.device)
            x_shuffled = x_train[perm]
            y_shuffled = y_train[perm]

            epoch_loss = 0.0
            steps = 0

            for i in range(0, dataset_size, self.batch_size):
                end = min(i + self.batch_size, dataset_size)
                batch_x = x_shuffled[i:end]
                batch_y = y_shuffled[i:end]

                self.optimizer.zero_grad(set_to_none=True)

                with self._autocast():
                    outputs = self.model(batch_x)
                    loss = self.criterion(outputs, batch_y)

                if self.use_amp:
                    self.scaler_amp.scale(loss).backward()
                    # [v2.0] Gradient clipping (AMP-compatible): unscale
                    # first so the norm is measured on the true gradients.
                    self.scaler_amp.unscale_(self.optimizer)
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
                    self.scaler_amp.step(self.optimizer)
                    self.scaler_amp.update()
                else:
                    loss.backward()
                    # [v2.0] Gradient clipping
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
                    self.optimizer.step()

                epoch_loss += loss.item()
                steps += 1

            train_loss = epoch_loss / max(1, steps)

            # --- Validation (performed directly on the device) ---
            self.model.eval()
            with torch.no_grad():
                with self._autocast():
                    val_out = self.model(x_val)
                    val_loss = self.criterion(val_out, y_val).item()
            self.model.train()

            # [v2.0] LR scheduler step (driven by val_loss)
            self.lr_scheduler.step(val_loss)

            final_loss = train_loss
            actual_epochs = epoch + 1

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                # [v2.0] Keep a copy of the best weights (limits overfitting)
                best_model_state = {k: v.clone() for k, v in self.model.state_dict().items()}
            else:
                patience_counter += 1
                if patience_counter >= self.patience:
                    break

        # [v2.0] Restore the best weights seen during training
        if best_model_state is not None:
            self.model.load_state_dict(best_model_state)

        self.training_status["is_training"] = False
        self.training_status["loss"] = final_loss

        if is_gpu:
            torch.cuda.synchronize()

        elapsed = time.time() - t_start
        print(f"[AI] {ticker or '?'}: {actual_epochs} epochs in {elapsed:.1f}s"
              f" | loss={final_loss:.6f} val={best_val_loss:.6f}"
              f" | device={self.device}")

        # 5. Save checkpoint
        if ticker:
            self._save_checkpoint(ticker, actual_epochs, final_loss)

        # 6. Predict the step following the last ``seq_length`` observations
        self.model.eval()
        with torch.no_grad():
            last_seq = torch.FloatTensor(
                scaled_data[-self.seq_length:]
            ).unsqueeze(0).to(self.device)

            with self._autocast():
                predicted_scaled = self.model(last_seq)

            predicted_price = self.scaler.inverse_transform(
                predicted_scaled.cpu().float().numpy())[0][0]

        current_price = prices[-1]
        trend = "UP" if predicted_price > current_price else "DOWN"
        change_rate = ((predicted_price - current_price) / current_price) * 100

        confidence = self._estimate_confidence(
            final_loss, best_val_loss, actual_epochs, max_epochs)

        return {
            "current": current_price,
            "predicted": float(predicted_price),
            "change_rate": round(change_rate, 2),
            "trend": trend,
            "loss": final_loss,
            "val_loss": best_val_loss,
            "confidence": round(confidence, 2),
            "epochs": actual_epochs,
            "device": str(self.device),
            "lr": self.optimizer.param_groups[0]['lr']
        }

    @staticmethod
    def _estimate_confidence(train_loss, val_loss, epochs, max_epochs):
        """[v2.0] Heuristic confidence score, capped at 0.95.

        Multiplies three factors: a term that shrinks as the validation loss
        grows, a penalty when train and validation loss diverge, and a
        penalty when training ran very few epochs or hit the epoch limit.
        """
        # 1. Validation-loss based (lower loss -> higher confidence)
        loss_confidence = 1.0 / (1.0 + (val_loss * 50))

        # 2. Train/val divergence
        overfit_ratio = train_loss / (val_loss + 1e-9)
        if overfit_ratio < 0.5:
            # Train loss far below val loss = overfitting
            overfit_penalty = 0.7
        elif overfit_ratio > 2.0:
            # Train loss far above val loss = underfitting
            overfit_penalty = 0.8
        else:
            overfit_penalty = 1.0

        # 3. Epoch count (too few or hitting the cap reduces trust)
        epoch_factor = 1.0
        if epochs < 10:
            epoch_factor = 0.6  # under-trained
        elif epochs >= max_epochs:
            epoch_factor = 0.8  # did not converge before the limit

        return min(0.95, loss_confidence * overfit_penalty * epoch_factor)

    def batch_predict(self, prices_dict):
        """Predict the next value for several tickers in one forward pass.

        No training happens here and no checkpoint is loaded; the model is
        used with whatever weights it currently holds. Each ticker is scaled
        with its own MinMaxScaler.

        Args:
            prices_dict: Mapping of ticker -> sequence of numeric prices.

        Returns:
            Mapping of ticker -> None (fewer than ``seq_length + 10`` prices)
            or a dict with ``current``, ``predicted``, ``change_rate`` and
            ``trend``.
        """
        results = {}
        seqs = []
        metas = []

        for ticker, prices in prices_dict.items():
            if len(prices) < (self.seq_length + 10):
                results[ticker] = None
                continue

            data = np.array(prices).reshape(-1, 1)
            scaler = MinMaxScaler(feature_range=(0, 1))
            scaled_data = scaler.fit_transform(data)

            seq = torch.FloatTensor(scaled_data[-self.seq_length:]).unsqueeze(0)
            seqs.append(seq)
            metas.append((ticker, scaler, prices[-1]))

        if not seqs:
            return results

        # Stack everything into one batch for a single inference call
        batch = torch.cat(seqs, dim=0).to(self.device)

        self.model.eval()
        with torch.no_grad():
            with self._autocast():
                preds = self.model(batch)

        preds_cpu = preds.cpu().float().numpy()

        for i, (ticker, scaler, current_price) in enumerate(metas):
            predicted_price = scaler.inverse_transform(preds_cpu[i:i+1])[0][0]
            trend = "UP" if predicted_price > current_price else "DOWN"
            change_rate = ((predicted_price - current_price) / current_price) * 100

            results[ticker] = {
                "current": current_price,
                "predicted": float(predicted_price),
                "change_rate": round(change_rate, 2),
                "trend": trend
            }

        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        return results
|