# web-page-backend/stock-lab/app/scraper.py

import logging
import time
from typing import List, Dict, Any
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger("stock-lab.scraper")

# Naver Finance "main news" listing page.
NAVER_FINANCE_NEWS_URL = "https://finance.naver.com/news/mainnews.naver"
# Overseas market news (uses the mobile API).
# NAVER_FINANCE_WORLD_NEWS_URL is not used.

# Naver Finance overseas markets main page (used for index quotes).
NAVER_FINANCE_WORLD_URL = "https://finance.naver.com/world/"

def fetch_market_news() -> List[Dict[str, str]]:
    """
    Scrape the Naver Finance "main news" page.

    Returns:
        A list of article dicts, one per news item, each with the keys
        ``title``, ``link``, ``summary``, ``press``, ``date``,
        ``crawled_at`` (local time, ``YYYY-MM-DD HH:MM:SS``) and
        ``category`` (always ``"domestic"``).

        Any failure (network error, non-2xx HTTP status, unexpected parsing
        error) is logged and an empty list is returned; this function does
        not raise.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
        }
        resp = requests.get(NAVER_FINANCE_NEWS_URL, headers=headers, timeout=10)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.content, "html.parser", from_encoding="cp949")

        # One timestamp per crawl, so every article of the same run agrees.
        crawled_at = time.strftime("%Y-%m-%d %H:%M:%S")
        articles = []

        # Expected structure: div.mainNewsList > ul > li
        for li in soup.select(".mainNewsList ul li"):
            # The <dl> holds the text part of the item (a thumbnail may or
            # may not be present next to it).
            dl = li.select_one("dl")
            if dl is None:
                continue

            # Headline: dd.articleSubject > a
            subject_tag = dl.select_one(".articleSubject a")
            if subject_tag is None:
                continue

            # Skip anchors without a target instead of raising KeyError,
            # which would discard every article parsed so far.
            href = subject_tag.get("href")
            if not href:
                continue

            title = subject_tag.get_text(strip=True)
            # urljoin handles site-relative paths like plain concatenation
            # did, and additionally copes with absolute or slash-less hrefs.
            link = urljoin("https://finance.naver.com", href)

            # Summary: dd.articleSummary
            summary = ""
            press = ""
            date = ""

            summary_tag = dl.select_one(".articleSummary")
            if summary_tag is not None:
                # Read the press / date children, then detach them so their
                # text does not end up inside the summary.
                for child in summary_tag.select(".press, .wdate"):
                    classes = child.get("class", [])
                    if "press" in classes:
                        press = child.get_text(strip=True)
                    if "wdate" in classes:
                        date = child.get_text(strip=True)
                    child.extract()
                summary = summary_tag.get_text(strip=True)

            articles.append({
                "title": title,
                "link": link,
                "summary": summary,
                "press": press,
                "date": date,
                "crawled_at": crawled_at,
                "category": "domestic",
            })

        return articles

    except Exception as e:
        # Top-level boundary: scraping is best effort, callers get [].
        logger.error("국내 뉴스 스크래핑 실패: %s", e)
        return []

def fetch_major_indices() -> Dict[str, Any]:
    """
    Scrape major market indicators from Naver Finance.

    Collected, in this order:
      * domestic indices (KOSPI, KOSDAQ, KOSPI200) from the Naver Finance
        home page,
      * overseas indices (Dow Jones, NASDAQ, S&P 500) from the overseas
        markets page,
      * the USD/KRW exchange rate from the market-index page.

    Returns:
        On success ``{"indices": [...], "crawled_at": "YYYY-MM-DD HH:MM:SS"}``.
        Every entry of ``indices`` has the keys ``name``, ``value``,
        ``change_value``, ``change_percent``, ``direction`` (``"red"`` for
        up, ``"blue"`` for down, ``""`` when unknown) and ``type``
        (``"domestic"``, ``"overseas"`` or ``"exchange"``). Values are the
        raw page strings; nothing is converted to numbers.

        The overseas and exchange-rate sections are best effort: a failure
        there is logged and that section is simply missing from the result.
        If the domestic page cannot be fetched (network error or non-2xx
        status), ``{"indices": [], "error": "<message>"}`` is returned.
        This function does not raise.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
    }
    try:
        # --- Domestic ---
        indices = _scrape_domestic_indices(headers)

        # --- Overseas (DJI, NAS, SPI) ---
        # The home page does not list overseas indices, so a second page is
        # fetched from within this function.
        try:
            indices.extend(_scrape_overseas_indices(headers))
        except Exception as e:
            logger.error("해외 지수 스크래핑 실패: %s", e)

        # --- Exchange rate (USD/KRW) ---
        try:
            usd_krw = _scrape_usd_krw(headers)
            if usd_krw:
                indices.append(usd_krw)
        except Exception as e:
            logger.error("환율 스크래핑 실패: %s", e)

        return {"indices": indices, "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S")}

    except Exception as e:
        logger.error("지수 스크래핑 전체 실패: %s", e)
        return {"indices": [], "error": str(e)}


def _fetch_cp949_soup(url: str, headers: Dict[str, str]) -> Any:
    """Download *url* and parse it as cp949 HTML; raises on network or HTTP errors."""
    resp = requests.get(url, headers=headers, timeout=5)
    # Without this an HTTP error page would be parsed and silently yield
    # no data instead of being reported as a failure.
    resp.raise_for_status()
    return BeautifulSoup(resp.content, "html.parser", from_encoding="cp949")


def _tag_text(tag: Any) -> str:
    """Return the stripped text of a parsed tag, or "" when the tag is missing."""
    return tag.get_text(strip=True) if tag is not None else ""


def _scrape_domestic_indices(headers: Dict[str, str]) -> List[Dict[str, Any]]:
    """Parse KOSPI / KOSDAQ / KOSPI200 quotes from the Naver Finance home page."""
    soup = _fetch_cp949_soup("https://finance.naver.com/", headers)

    targets = [
        ("KOSPI", ".kospi_area"),
        ("KOSDAQ", ".kosdaq_area"),
        ("KOSPI200", ".kospi200_area"),
    ]

    results = []
    for name, selector in targets:
        area = soup.select_one(selector)
        if area is None:
            continue

        # Direction is taken from the marker element present in the area.
        direction = ""
        if area.select_one(".bu_p"):
            direction = "red"
        elif area.select_one(".bu_m"):
            direction = "blue"

        results.append({
            "name": name,
            "value": _tag_text(area.select_one(".num")),
            "change_value": _tag_text(area.select_one(".num2")),
            "change_percent": _tag_text(area.select_one(".num3")),
            "direction": direction,
            "type": "domestic",
        })
    return results


def _scrape_overseas_indices(headers: Dict[str, str]) -> List[Dict[str, Any]]:
    """Parse Dow Jones / NASDAQ / S&P 500 quotes from the overseas markets page."""
    soup = _fetch_cp949_soup(NAVER_FINANCE_WORLD_URL, headers)

    # (display name, symbol used in the quote link's query string)
    targets = [
        ("다우산업", "DJI@DJI"),
        ("나스닥", "NAS@IXIC"),
        ("S&P500", "SPI@SPX"),
    ]

    results = []
    for name, symbol in targets:
        # Locate the quote through its symbol link, then walk up to the
        # enclosing <dl> that holds the figures.
        a_tag = soup.select_one(f"a[href*='symbol={symbol}']")
        if a_tag is None:
            continue

        dl = a_tag.find_parent("dl")
        if dl is None:
            continue

        status_dd = dl.select_one("dd.point_status")
        if status_dd is None:
            continue

        # Direction is encoded as a class on the <dl>.
        direction = ""
        dl_classes = dl.get("class", [])
        if "point_up" in dl_classes:
            direction = "red"
        elif "point_dn" in dl_classes:
            direction = "blue"

        results.append({
            "name": name,  # Korean display name
            "value": _tag_text(status_dd.select_one("strong")),       # current value
            "change_value": _tag_text(status_dd.select_one("em")),    # absolute change
            "change_percent": _tag_text(status_dd.select_one("span")),  # percent change
            "direction": direction,
            "type": "overseas",
        })
    return results


def _scrape_usd_krw(headers: Dict[str, str]) -> Dict[str, Any]:
    """Parse the USD/KRW quote from the market-index page; returns {} when it is not found."""
    soup = _fetch_cp949_soup("https://finance.naver.com/marketindex/", headers)

    usd_item = soup.select_one("#exchangeList li.on > a.head.usd")
    if usd_item is None:
        return {}

    value_tag = usd_item.select_one(".value")
    if value_tag is None:
        # No rate on the page: report nothing rather than an empty quote.
        return {}

    # Direction comes from hidden ("blind") text: "상승" = up, "하락" = down.
    # Every .blind element is checked because the first one inside the link
    # may be a label rather than the direction
    # (NOTE(review): confirm against the live markup).
    direction = ""
    for blind in usd_item.select(".blind"):
        blind_txt = blind.get_text(strip=True)
        if "상승" in blind_txt:
            direction = "red"
            break
        if "하락" in blind_txt:
            direction = "blue"
            break

    return {
        "name": "원달러 환율",
        "value": value_tag.get_text(strip=True),
        "change_value": _tag_text(usd_item.select_one(".change")),
        # The percent change is not read from this list view, so it is
        # left empty to keep the entry shape uniform.
        "change_percent": "",
        "direction": direction,
        "type": "exchange",
    }