# web-page-backend/stock-lab/app/scraper.py
import requests
from bs4 import BeautifulSoup
from typing import List, Dict, Any
import time

# Naver Finance main news
NAVER_FINANCE_NEWS_URL = "https://finance.naver.com/news/mainnews.naver"
# Overseas market news is fetched via the mobile API instead;
# NAVER_FINANCE_WORLD_NEWS_URL is no longer used.
# Overseas market main page (for indices)
NAVER_FINANCE_WORLD_URL = "https://finance.naver.com/world/"
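
# Note (assumption): the desktop pages under finance.naver.com are served as
# EUC-KR/CP949, which is why the HTML parsers below pass from_encoding="cp949"
# instead of relying on requests' charset detection; the mobile API
# (api.stock.naver.com) returns JSON and needs no such handling.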


def fetch_market_news() -> List[Dict[str, str]]:
    """
    Crawl Naver Finance "main news".
    Returns: [{"title": "...", "link": "...", "summary": "...", "date": "..."}, ...]
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
        }
        resp = requests.get(NAVER_FINANCE_NEWS_URL, headers=headers, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding="cp949")

        # Extract the main news list.
        # Structure: div.mainNewsList > ul > li
        articles = []
        news_list = soup.select(".mainNewsList ul li")
        for li in news_list:
            # A thumbnail may or may not be present.
            dl = li.select_one("dl")
            if not dl:
                continue

            # Title (dd.articleSubject > a)
            subject_tag = dl.select_one(".articleSubject a")
            if not subject_tag:
                continue
            title = subject_tag.get_text(strip=True)
            link = "https://finance.naver.com" + subject_tag["href"]

            # Summary (dd.articleSummary)
            summary_tag = dl.select_one(".articleSummary")
            summary = ""
            press = ""
            date = ""
            if summary_tag:
                # Pull out press / date, then drop those tags so only the summary text remains.
                for child in summary_tag.select(".press, .wdate"):
                    if "press" in child.get("class", []):
                        press = child.get_text(strip=True)
                    if "wdate" in child.get("class", []):
                        date = child.get_text(strip=True)
                    child.extract()
                summary = summary_tag.get_text(strip=True)

            articles.append({
                "title": title,
                "link": link,
                "summary": summary,
                "press": press,
                "date": date,
                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                "category": "domestic"
            })
        return articles
    except Exception as e:
        print(f"[StockLab] Scraping failed: {e}")
        return []
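
# Usage sketch (illustrative; not part of the app's call path):
#
#   articles = fetch_market_news()
#   for a in articles[:3]:
#       print(a["date"], a["press"], a["title"])
#       print("  ", a["link"])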


def fetch_overseas_news() -> List[Dict[str, str]]:
    """
    Crawl Naver Finance overseas market news (via the mobile API).
    """
    api_url = "https://api.stock.naver.com/news/overseas/mainnews"
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
        }
        resp = requests.get(api_url, headers=headers, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        if isinstance(data, list):
            items = data
        else:
            items = data.get("result", [])

        articles = []
        for item in items:
            # Map API keys (subject/title/tit, summary/subContent/sub_tit, ...).
            title = item.get("subject") or item.get("title") or item.get("tit") or ""
            summary = item.get("summary") or item.get("subContent") or item.get("sub_tit") or ""
            press = item.get("officeName") or item.get("office_name") or item.get("cp_name") or ""

            # Reformat the timestamp (20260126123000 -> 2026-01-26 12:30:00).
            raw_dt = str(item.get("dt", ""))
            if len(raw_dt) == 14:
                date = f"{raw_dt[:4]}-{raw_dt[4:6]}-{raw_dt[6:8]} {raw_dt[8:10]}:{raw_dt[10:12]}:{raw_dt[12:]}"
            else:
                date = raw_dt

            # Build the article link.
            aid = item.get("articleId")
            oid = item.get("officeId")
            link = f"https://m.stock.naver.com/worldstock/news/read/{oid}/{aid}"

            articles.append({
                "title": title,
                "link": link,
                "summary": summary,
                "press": press,
                "date": date,
                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                "category": "overseas"
            })
        return articles
    except Exception as e:
        print(f"[StockLab] Overseas news failed: {e}")
        return []
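
# Usage sketch (illustrative): both fetchers return dicts with the same keys, so the
# two lists can be merged into one feed. Sorting by "date" assumes both sides end up
# in the same "YYYY-MM-DD HH:MM:SS" string format.
#
#   feed = fetch_market_news() + fetch_overseas_news()
#   feed.sort(key=lambda a: a["date"], reverse=True)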


def fetch_major_indices() -> Dict[str, Any]:
    """
    Major indices such as KOSPI, KOSDAQ and KOSPI200 (Naver Finance home),
    plus a few overseas indices from the overseas main page.
    """
    url = "https://finance.naver.com/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
    }
    try:
        targets = [
            {"key": "KOSPI", "selector": ".kospi_area"},
            {"key": "KOSDAQ", "selector": ".kosdaq_area"},
            {"key": "KOSPI200", "selector": ".kospi200_area"},
        ]
        # Overseas indices are not shown on the domestic home page, so they are
        # scraped separately from https://finance.naver.com/world/ further below,
        # keeping everything inside this one function.
        indices = []

        # --- Domestic ---
        resp_kr = requests.get(url, headers=headers, timeout=5)
        resp_kr.raise_for_status()
        soup_kr = BeautifulSoup(resp_kr.content, "html.parser", from_encoding="cp949")
        for t in targets:
            area = soup_kr.select_one(t["selector"])
            if not area:
                continue
            # Current value, change amount and change percent.
            num_tag = area.select_one(".num")
            value = num_tag.get_text(strip=True) if num_tag else ""
            change_val_tag = area.select_one(".num2")
            change_pct_tag = area.select_one(".num3")
            change_val = change_val_tag.get_text(strip=True) if change_val_tag else ""
            change_pct = change_pct_tag.get_text(strip=True) if change_pct_tag else ""
            # Direction: bu_p = up (red), bu_m = down (blue).
            direction = ""
            if area.select_one(".bu_p"):
                direction = "red"
            elif area.select_one(".bu_m"):
                direction = "blue"
            indices.append({
                "name": t["key"],
                "value": value,
                "change_value": change_val,
                "change_percent": change_pct,
                "direction": direction,
                "type": "domestic"
            })

        # --- Overseas (DJI, NAS, SPI) ---
        try:
            resp_world = requests.get(NAVER_FINANCE_WORLD_URL, headers=headers, timeout=5)
            resp_world.raise_for_status()
            soup_world = BeautifulSoup(resp_world.content, "html.parser", from_encoding="cp949")
            world_targets = [
                {"key": "DJI", "selector": ".sise_major .data_list li:nth-child(1)"},
                {"key": "NAS", "selector": ".sise_major .data_list li:nth-child(2)"},
                {"key": "SPI", "selector": ".sise_major .data_list li:nth-child(3)"},
            ]
            for wt in world_targets:
                li = soup_world.select_one(wt["selector"])
                if not li:
                    continue
                # Value: dd.point_status strong
                val_tag = li.select_one("dd.point_status strong")
                value = val_tag.get_text(strip=True) if val_tag else ""
                # Change direction: dd.point_status em carries a red/blue class.
                direction = ""
                status_dd = li.select_one("dd.point_status")
                if status_dd:
                    em = status_dd.select_one("em")
                    if em:
                        if "red" in em.get("class", []):
                            direction = "red"
                        elif "blue" in em.get("class", []):
                            direction = "blue"
                indices.append({
                    "name": wt["key"],
                    "value": value,
                    "change_value": "",
                    "change_percent": "",
                    "direction": direction,
                    "type": "overseas"
                })
        except Exception as e:
            print(f"[StockLab] World indices failed: {e}")

        return {"indices": indices, "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S")}
    except Exception as e:
        print(f"[StockLab] Indices scraping failed: {e}")
        return {"indices": [], "error": str(e)}