"""LSTM-with-attention price predictor with optional CUDA / mixed-precision support."""

import os
import time

import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler

from modules.config import Config

# Enable the cuDNN autotuner so it can pick the fastest kernels for input
# shapes that repeat.
torch.backends.cudnn.benchmark = True


class Attention(nn.Module):
    """Learned softmax-weighted pooling over dimension 1 of an LSTM output."""

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size, 1)

    def forward(self, lstm_output):
        """Return ``(context, attn_weights)`` for ``lstm_output``.

        The weights are a softmax over dimension 1 of a per-step linear score;
        ``context`` is the weighted sum of ``lstm_output`` over dimension 1.
        """
        attn_weights = torch.softmax(self.attn(lstm_output), dim=1)
        context = torch.sum(attn_weights * lstm_output, dim=1)
        return context, attn_weights


class AdvancedLSTM(nn.Module):
    """Stacked batch-first LSTM, attention pooling, and a 3-layer MLP head."""

    def __init__(self, input_size=1, hidden_size=512, num_layers=4,
                 output_size=1, dropout=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is given for a single-layer network.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=lstm_dropout)
        self.attention = Attention(hidden_size)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, hidden_size // 4),
            nn.ReLU(),
            nn.Linear(hidden_size // 4, output_size),
        )

    def forward(self, x):
        """Map a batch-first sequence tensor to one output row per sample."""
        # When no initial state is passed, nn.LSTM starts from zeros created
        # on the input's device, so no explicit h0/c0 (and no CPU->GPU copy
        # per forward pass) is needed.
        lstm_out, _ = self.lstm(x)
        context, _ = self.attention(lstm_out)
        return self.fc(context)


def _unload_ollama():
    """Ask Ollama to unload its model before LSTM training to free GPU memory.

    Best effort: any failure is reported and otherwise ignored.
    """
    try:
        import requests
        url = f"{Config.OLLAMA_API_URL}/api/generate"
        response = requests.post(url, json={
            "model": Config.OLLAMA_MODEL,
            "keep_alive": 0
        }, timeout=5)
        # Only claim success when the server actually accepted the request.
        response.raise_for_status()
        print("[AI] Ollama model unloaded (GPU memory freed)")
        time.sleep(1)  # give the server a moment to release the memory
    except Exception as e:
        print(f"[AI] Ollama unload skipped: {e}")


def _preload_ollama():
    """Ask Ollama to load its model again after LSTM training.

    Best effort: any failure is reported and otherwise ignored.
    """
    try:
        import requests
        url = f"{Config.OLLAMA_API_URL}/api/generate"
        response = requests.post(url, json={
            "model": Config.OLLAMA_MODEL,
            "prompt": "",
            "keep_alive": "10m"
        }, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print(f"[AI] Ollama preload skipped: {e}")


def _log_gpu_memory(tag=""):
    """Print allocated / reserved memory of CUDA device 0 (no-op without CUDA)."""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated(0) / 1024**3
        reserved = torch.cuda.memory_reserved(0) / 1024**3
        print(f"[AI GPU {tag}] Allocated: {allocated:.2f}GB / Reserved: {reserved:.2f}GB")


class PricePredictor:
    """
    Deep-learning stock price predictor (GPU optimised).

    - Keeps the whole training set on the device (minimal CPU<->GPU transfer)
    - Unloads / reloads the Ollama model around training to free GPU memory
    - Early stopping + mixed precision (FP16)
    - Per-ticker model checkpoints
    """

    def __init__(self):
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        self.hidden_size = 512
        self.num_layers = 4
        self.batch_size = 64
        self.max_epochs = 200
        self.seq_length = 60
        self.patience = 15
        self.training_status = {
            "is_training": False,
            "loss": 0.0,
            "current_ticker": None
        }

        self.model = AdvancedLSTM(input_size=1, hidden_size=self.hidden_size,
                                  num_layers=self.num_layers, dropout=0.3)
        self.criterion = nn.MSELoss()

        # CUDA setup (falls back to CPU on any failure).
        self.device = torch.device('cpu')
        self.use_amp = False
        if torch.cuda.is_available():
            self._init_cuda()
        else:
            print("[AI] No CUDA GPU detected. Running on CPU.")

        self.optimizer = torch.optim.AdamW(self.model.parameters(),
                                           lr=0.0005, weight_decay=1e-4)
        self.scaler_amp = torch.amp.GradScaler('cuda') if self.use_amp else None

    def _init_cuda(self):
        """Move the model to CUDA device 0 and warm it up; revert to CPU on error."""
        try:
            gpu_name = torch.cuda.get_device_name(0)
            vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
            self.device = torch.device('cuda')
            self.model.to(self.device)

            # Mixed precision for compute capability >= 7.0 (Volta or newer).
            if torch.cuda.get_device_capability(0)[0] >= 7:
                self.use_amp = True

            # Warm-up forward pass so CUDA kernels are set up before first use.
            dummy = torch.zeros(1, self.seq_length, 1, device=self.device)
            with torch.no_grad():
                _ = self.model(dummy)
            torch.cuda.synchronize()

            print(f"[AI] GPU Mode: {gpu_name} ({vram_gb:.1f}GB)"
                  f" | FP16={'ON' if self.use_amp else 'OFF'}"
                  f" | cuDNN Benchmark=ON")
            _log_gpu_memory("init")
        except Exception as e:
            print(f"[AI] GPU Init Failed ({e}), falling back to CPU")
            self.device = torch.device('cpu')
            # AMP may already have been switched on before the failure; it
            # must not stay enabled for the CPU fallback.
            self.use_amp = False
            self.model.to(self.device)

    @staticmethod
    def verify_hardware():
        """Print GPU details and return True when a CUDA GPU is usable."""
        if torch.cuda.is_available():
            try:
                gpu_name = torch.cuda.get_device_name(0)
                vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
                print(f"[AI Check] {gpu_name} ({vram_gb:.1f}GB VRAM) | cuDNN={torch.backends.cudnn.is_available()}")
                return True
            except Exception as e:
                print(f"[AI Check] GPU Error: {e}")
                return False
        print("[AI Check] No GPU. CPU Mode.")
        return False

    def _get_checkpoint_path(self, ticker):
        """Return the checkpoint file path for ``ticker`` under ``Config.MODEL_DIR``."""
        return os.path.join(Config.MODEL_DIR, f"{ticker}_lstm.pt")

    def _load_checkpoint(self, ticker):
        """Load model and optimizer state for ``ticker``; return True on success."""
        path = self._get_checkpoint_path(ticker)
        if os.path.exists(path):
            try:
                checkpoint = torch.load(path, map_location=self.device,
                                        weights_only=True)
                self.model.load_state_dict(checkpoint['model_state_dict'])
                self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                print(f"[AI] Checkpoint loaded: {ticker}")
                return True
            except Exception as e:
                print(f"[AI] Checkpoint load failed ({ticker}): {e}")
        return False

    def _save_checkpoint(self, ticker, epoch, loss):
        """Write model/optimizer state for ``ticker``; failures are only printed."""
        path = self._get_checkpoint_path(ticker)
        try:
            # The model directory may not exist yet on a fresh install.
            os.makedirs(Config.MODEL_DIR, exist_ok=True)
            torch.save({
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'epoch': epoch,
                'loss': loss
            }, path)
        except Exception as e:
            print(f"[AI] Checkpoint save failed ({ticker}): {e}")

    def _autocast(self):
        """Return a CUDA autocast context that is active only when AMP is enabled."""
        return torch.amp.autocast('cuda', enabled=self.use_amp)

    @staticmethod
    def _summarize(current_price, predicted_price):
        """Build the current / predicted / change_rate / trend part of a result."""
        # Convert to a built-in float first so the comparison and the returned
        # numbers do not depend on NumPy float32 scalars.
        predicted = float(predicted_price)
        trend = "UP" if predicted > current_price else "DOWN"
        if current_price:
            change_rate = (predicted - current_price) / current_price * 100
        else:
            # A relative change is undefined for a zero base price.
            change_rate = 0.0
        return {
            "current": current_price,
            "predicted": predicted,
            "change_rate": round(float(change_rate), 2),
            "trend": trend
        }

    def _evaluate(self, x_val, y_val):
        """Return the loss on the validation tensors, leaving the model in train mode."""
        self.model.eval()
        with torch.no_grad(), self._autocast():
            val_loss = self.criterion(self.model(x_val), y_val).item()
        self.model.train()
        return val_loss

    def _fit(self, x_train, y_train, x_val, y_val, max_epochs):
        """Train with shuffled mini-batches and early stopping.

        Returns ``(final_train_loss, best_val_loss, epochs_run)``.
        """
        dataset_size = len(x_train)
        best_val_loss = float('inf')
        patience_counter = 0
        final_loss = 0.0
        actual_epochs = 0

        self.model.train()
        for epoch in range(max_epochs):
            # Shuffle on the device the data already lives on.
            perm = torch.randperm(dataset_size, device=self.device)
            x_shuffled = x_train[perm]
            y_shuffled = y_train[perm]

            epoch_loss = 0.0
            steps = 0
            for start in range(0, dataset_size, self.batch_size):
                batch_x = x_shuffled[start:start + self.batch_size]
                batch_y = y_shuffled[start:start + self.batch_size]

                self.optimizer.zero_grad(set_to_none=True)
                with self._autocast():
                    loss = self.criterion(self.model(batch_x), batch_y)
                if self.use_amp:
                    self.scaler_amp.scale(loss).backward()
                    self.scaler_amp.step(self.optimizer)
                    self.scaler_amp.update()
                else:
                    loss.backward()
                    self.optimizer.step()

                epoch_loss += loss.item()
                steps += 1

            final_loss = epoch_loss / max(1, steps)
            actual_epochs = epoch + 1

            val_loss = self._evaluate(x_val, y_val)
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= self.patience:
                    break
        # NOTE(review): the weights kept are those of the last epoch run, not
        # of the epoch with the best validation loss.
        return final_loss, best_val_loss, actual_epochs

    def _train_and_forecast(self, prices, ticker, is_gpu):
        """Preprocess ``prices``, train, checkpoint and predict the next value."""
        t_start = time.time()

        # 1. Preprocessing (NumPy on the CPU).
        data = np.array(prices).reshape(-1, 1)
        scaled_data = self.scaler.fit_transform(data)
        n_windows = len(scaled_data) - self.seq_length
        x_np = np.stack([scaled_data[i:i + self.seq_length]
                         for i in range(n_windows)])
        # The target of window i is the value right after it.
        y_np = scaled_data[self.seq_length:]

        # 2. Build tensors directly on the target device.
        x_all = torch.as_tensor(x_np, dtype=torch.float32, device=self.device)
        y_all = torch.as_tensor(y_np, dtype=torch.float32, device=self.device)

        # Chronological 80/20 train/validation split.
        split_idx = int(len(x_all) * 0.8)
        x_train, x_val = x_all[:split_idx], x_all[split_idx:]
        y_train, y_val = y_all[:split_idx], y_all[split_idx:]

        # 3. Resume from a checkpoint if there is one (shorter fine-tune run).
        has_checkpoint = self._load_checkpoint(ticker) if ticker else False
        max_epochs = 50 if has_checkpoint else self.max_epochs

        # 4. Training (data stays on the device, no DataLoader).
        self.training_status["is_training"] = True
        if ticker:
            self.training_status["current_ticker"] = ticker
        final_loss, best_val_loss, actual_epochs = self._fit(
            x_train, y_train, x_val, y_val, max_epochs)
        self.training_status["is_training"] = False
        self.training_status["loss"] = final_loss

        if is_gpu:
            torch.cuda.synchronize()
        elapsed = time.time() - t_start
        print(f"[AI] {ticker or '?'}: {actual_epochs} epochs in {elapsed:.1f}s"
              f" | loss={final_loss:.6f} val={best_val_loss:.6f}"
              f" | device={self.device}")

        # 5. Save checkpoint.
        if ticker:
            self._save_checkpoint(ticker, actual_epochs, final_loss)

        # 6. Predict the value following the most recent window.
        self.model.eval()
        last_seq = torch.as_tensor(scaled_data[-self.seq_length:],
                                   dtype=torch.float32,
                                   device=self.device).unsqueeze(0)
        with torch.no_grad(), self._autocast():
            predicted_scaled = self.model(last_seq)
        predicted_price = self.scaler.inverse_transform(
            predicted_scaled.cpu().float().numpy())[0][0]

        result = self._summarize(prices[-1], predicted_price)
        confidence = 1.0 / (1.0 + (final_loss * 100))
        result.update({
            "loss": final_loss,
            "confidence": round(confidence, 2),
            "epochs": actual_epochs,
            "device": str(self.device)
        })
        return result

    def train_and_predict(self, prices, forecast_days=1, ticker=None):
        """Train on ``prices`` and predict the next price.

        Args:
            prices: Sequence of historical prices, oldest first.
            forecast_days: Accepted for API compatibility; not used here.
            ticker: Optional symbol used for checkpoint load/save and logging.

        Returns:
            ``None`` when fewer than ``seq_length + 10`` prices are given,
            otherwise a dict with the keys ``current``, ``predicted``,
            ``change_rate``, ``trend``, ``loss``, ``confidence``, ``epochs``
            and ``device``.
        """
        if prices is None or len(prices) < (self.seq_length + 10):
            return None

        is_gpu = self.device.type == 'cuda'

        # Unload the Ollama model first to free GPU memory.
        if is_gpu:
            _unload_ollama()
            torch.cuda.empty_cache()
            _log_gpu_memory("pre-train")

        try:
            return self._train_and_forecast(prices, ticker, is_gpu)
        finally:
            # Runs on failure too, so the status flag never stays stuck on
            # "training" and Ollama is always asked to load again.
            self.training_status["is_training"] = False
            if is_gpu:
                # 7. Release cached GPU memory, then reload Ollama.
                torch.cuda.empty_cache()
                _log_gpu_memory("post-train")
                _preload_ollama()

    def batch_predict(self, prices_dict):
        """Predict the next price for several tickers in one forward pass.

        Each ticker's prices are scaled with their own ``MinMaxScaler``.
        Tickers with fewer than ``seq_length + 10`` prices map to ``None``.
        No training or checkpoint loading happens here: the model's current
        weights are used for every ticker.
        """
        results = {}
        seqs = []
        metas = []

        for ticker, prices in prices_dict.items():
            if prices is None or len(prices) < (self.seq_length + 10):
                results[ticker] = None
                continue
            data = np.array(prices).reshape(-1, 1)
            scaler = MinMaxScaler(feature_range=(0, 1))
            scaled_data = scaler.fit_transform(data)
            seq = torch.as_tensor(scaled_data[-self.seq_length:],
                                  dtype=torch.float32).unsqueeze(0)
            seqs.append(seq)
            metas.append((ticker, scaler, prices[-1]))

        if not seqs:
            return results

        # Concatenate into one batch and run a single inference pass.
        batch = torch.cat(seqs, dim=0).to(self.device)
        self.model.eval()
        with torch.no_grad(), self._autocast():
            preds = self.model(batch)
        preds_cpu = preds.cpu().float().numpy()

        for i, (ticker, scaler, current_price) in enumerate(metas):
            predicted_price = scaler.inverse_transform(preds_cpu[i:i + 1])[0][0]
            results[ticker] = self._summarize(current_price, predicted_price)

        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        return results