"""
통합 LLM 클라이언트 — Gemini 2.5 Flash (Primary) + Ollama (Fallback)

설계 원칙:
  - OllamaManager.request_inference(prompt) 와 동일한 인터페이스 유지
    → process.py, ai_council.py 코드 변경 최소화
  - Gemini 실패(네트워크, Rate Limit) 시 자동으로 로컬 Ollama 폴백
  - 15 RPM 제한 준수를 위한 자동 스로틀링
  - VRAM 충돌 없음 (외부 API 호출이므로 LSTM 학습과 간섭 없음)

Rate Limit (Gemini 2.5 Flash 무료 티어):
  - 15 RPM, 1,500 RPD (봇 필요량 ~240/일 → 여유 6배)

추가 패키지 불필요:
  - requests (이미 설치됨) 기반 REST API 직접 호출
"""

import time
import requests
import json

from modules.config import Config


class GeminiLLMClient:
    """
    Gemini API 클라이언트

    사용법:
        client = GeminiLLMClient()
        result = client.request_inference(prompt)  # str | None
    """

    _GENERATE_URL = (
        "https://generativelanguage.googleapis.com/v1beta/models"
        "/{model}:generateContent?key={key}"
    )
    # 15 RPM → 최소 4초 간격 (여유 0.1초 추가)
    _MIN_INTERVAL = 4.1
    # 클래스 변수: 같은 프로세스 내 재생성 시에도 마지막 호출 시각 유지
    # (워커 OOM 재시작 후 싱글톤 교체 시에도 스로틀 유효)
    _class_last_call_ts: float = 0.0

    def __init__(self):
        self.api_key = Config.GEMINI_API_KEY
        self.model   = Config.GEMINI_MODEL
        self._ollama = None          # Ollama 폴백 (lazy init)
        self._use_gemini = bool(self.api_key)

        if self._use_gemini:
            print(f"✅ [LLMClient] Primary: Gemini {self.model}")
        else:
            print("⚠️  [LLMClient] GEMINI_API_KEY 미설정 → Ollama 전용 모드")

    # ── 내부 헬퍼 ────────────────────────────────────────────────────────────

    def _throttle(self):
        """15 RPM 제한 준수 — 최소 호출 간격 강제 대기 (클래스 공유 타임스탬프)"""
        elapsed = time.time() - GeminiLLMClient._class_last_call_ts
        if elapsed < self._MIN_INTERVAL:
            time.sleep(self._MIN_INTERVAL - elapsed)

    def _call_gemini(self, prompt: str) -> str | None:
        """
        Gemini REST API 단일 호출

        설정:
          - systemInstruction: JSON 전용 응답 강제
          - thinkingBudget=0: 내부 추론 비활성 (속도 1.5초 / 토큰 절약)
          - maxOutputTokens=512: 200은 thinking 소모로 잘리므로 여유 확보
        """
        self._throttle()

        url = self._GENERATE_URL.format(model=self.model, key=self.api_key)
        payload = {
            "system_instruction": {
                "parts": [{"text": (
                    "You are a Korean stock market analyst. "
                    "Respond with valid JSON only. "
                    "No markdown, no code blocks, no explanations."
                )}]
            },
            "contents": [{"parts": [{"text": prompt}]}],
            "generationConfig": {
                "maxOutputTokens": 512,   # 200→512 (thinking 비활성 후 실제 응답 공간 확보)
                "temperature": 0.1,       # 결정론적 출력
                "thinkingConfig": {"thinkingBudget": 0},  # 내부 추론 끔 (속도↑, 토큰↓)
            },
        }

        try:
            resp = requests.post(url, json=payload, timeout=30)
            GeminiLLMClient._class_last_call_ts = time.time()

            # Rate Limit 초과
            if resp.status_code == 429:
                print("[LLMClient] Gemini Rate Limit (429) → Ollama 폴백")
                return None

            resp.raise_for_status()
            data = resp.json()

            # thinking 파트 제외, 실제 텍스트 파트만 결합
            candidate = data.get("candidates", [{}])[0]
            parts = candidate.get("content", {}).get("parts", [])
            text = "".join(
                p.get("text", "") for p in parts
                if "text" in p and not p.get("thought")
            ).strip()

            return text if text else None

        except requests.exceptions.Timeout:
            print("[LLMClient] Gemini Timeout (30s) → Ollama 폴백")
            return None
        except Exception as e:
            print(f"[LLMClient] Gemini Error: {e} → Ollama 폴백")
            return None

    def _get_ollama(self):
        """Ollama 폴백 인스턴스 (lazy init — 필요할 때만 로드)"""
        if self._ollama is None:
            from modules.services.ollama import OllamaManager
            self._ollama = OllamaManager()
            # Ollama 실행 여부 사전 확인 (WinError 10061 조기 감지)
            try:
                requests.get(
                    f"{Config.OLLAMA_API_URL}/api/tags",
                    timeout=3,
                )
            except Exception:
                print(
                    f"❌ [LLMClient] Ollama 미실행 (localhost:11434 연결 거부) — "
                    f"`ollama serve` 명령으로 Ollama를 시작하세요."
                )
        return self._ollama

    # ── 공개 인터페이스 ───────────────────────────────────────────────────────

    def request_inference(self, prompt: str, context_data=None) -> str | None:
        """
        LLM 추론 요청 — OllamaManager.request_inference()와 동일한 시그니처

        순서:
          1) GEMINI_API_KEY 있음 → Gemini API 호출
          2) Gemini 실패(에러/타임아웃/Rate Limit) → Ollama 로컬 폴백
          3) GEMINI_API_KEY 없음 → 바로 Ollama 사용
        """
        if self._use_gemini:
            result = self._call_gemini(prompt)
            if result is not None:
                return result
            # Gemini 실패 → Ollama 폴백
            print("[LLMClient] Ollama 폴백 시도 중...")

        return self._get_ollama().request_inference(prompt, context_data)

    # ── OllamaManager 호환 메서드 (ai_council, evaluator 등에서 사용) ─────────

    def check_vram(self) -> float:
        """VRAM 사용량 반환 (Ollama 측 정보, Gemini 호출 시엔 무관)"""
        if self._ollama:
            return self._ollama.check_vram()
        return 0.0

    def get_gpu_status(self) -> dict:
        """GPU 상태 반환 (OllamaManager 호환)"""
        return self._get_ollama().get_gpu_status()

    def unload_model(self):
        """Ollama 모델 언로드 (LSTM 학습 전 호출용, Gemini는 무작동)"""
        if self._ollama:
            try:
                requests.post(
                    f"{Config.OLLAMA_API_URL}/api/generate",
                    json={"model": Config.OLLAMA_MODEL, "keep_alive": 0},
                    timeout=5,
                )
            except Exception:
                pass


# ── 워커 프로세스 전역 싱글톤 ─────────────────────────────────────────────────

_llm_client: GeminiLLMClient | None = None  # created on first get_llm_client() call


def get_llm_client() -> GeminiLLMClient:
    """
    Return the module-wide GeminiLLMClient, creating it on first use.

    Intended as a drop-in for the former ``get_ollama()`` accessor:
        ollama = get_llm_client()
        result = ollama.request_inference(prompt)
    """
    global _llm_client
    client = _llm_client
    if client is None:
        # First call in this process: build the client and cache it.
        client = GeminiLLMClient()
        _llm_client = client
    return client