feat: add overseas financial news and indices support

2026-01-26 03:45:19 +09:00
parent 2fafce0327
commit 3d0dd24f27
3 changed files with 199 additions and 50 deletions

View File

@@ -17,6 +17,7 @@ def init_db():
         CREATE TABLE IF NOT EXISTS articles (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             hash TEXT UNIQUE NOT NULL,
+            category TEXT DEFAULT 'domestic',
             title TEXT NOT NULL,
             link TEXT,
             summary TEXT,
@@ -27,6 +28,11 @@ def init_db():
""") """)
conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_crawled ON articles(crawled_at DESC)") conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_crawled ON articles(crawled_at DESC)")
# 컬럼 추가 (기존 테이블 마이그레이션)
cols = {r["name"] for r in conn.execute("PRAGMA table_info(articles)").fetchall()}
if "category" not in cols:
conn.execute("ALTER TABLE articles ADD COLUMN category TEXT DEFAULT 'domestic'")
def save_articles(articles: List[Dict[str, str]]) -> int: def save_articles(articles: List[Dict[str, str]]) -> int:
count = 0 count = 0
with _conn() as conn: with _conn() as conn:
@@ -36,19 +42,26 @@ def save_articles(articles: List[Dict[str, str]]) -> int:
             h = hashlib.md5(unique_str.encode()).hexdigest()
             try:
+                cat = a.get("category", "domestic")
                 conn.execute("""
-                    INSERT INTO articles (hash, title, link, summary, press, pub_date, crawled_at)
-                    VALUES (?, ?, ?, ?, ?, ?, ?)
-                """, (h, a['title'], a['link'], a['summary'], a['press'], a['date'], a['crawled_at']))
+                    INSERT INTO articles (hash, category, title, link, summary, press, pub_date, crawled_at)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                """, (h, cat, a['title'], a['link'], a['summary'], a['press'], a['date'], a['crawled_at']))
                 count += 1
             except sqlite3.IntegrityError:
                 pass  # already exists
     return count
 
-def get_latest_articles(limit: int = 20) -> List[Dict[str, Any]]:
+def get_latest_articles(limit: int = 20, category: str = None) -> List[Dict[str, Any]]:
     with _conn() as conn:
-        rows = conn.execute(
-            "SELECT * FROM articles ORDER BY crawled_at DESC, id DESC LIMIT ?",
-            (limit,)
-        ).fetchall()
+        if category:
+            rows = conn.execute(
+                "SELECT * FROM articles WHERE category = ? ORDER BY crawled_at DESC, id DESC LIMIT ?",
+                (category, limit)
+            ).fetchall()
+        else:
+            rows = conn.execute(
+                "SELECT * FROM articles ORDER BY crawled_at DESC, id DESC LIMIT ?",
+                (limit,)
+            ).fetchall()
     return [dict(r) for r in rows]
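
As a sanity check (not part of the changed files), the same PRAGMA-based migration can be exercised against a throwaway in-memory SQLite database; the trimmed table layout below is illustrative only:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.row_factory = sqlite3.Row
    conn.execute("CREATE TABLE articles (id INTEGER PRIMARY KEY, hash TEXT UNIQUE NOT NULL, title TEXT)")
    conn.execute("INSERT INTO articles (hash, title) VALUES ('a1', 'pre-migration row')")

    # Same check as init_db(): add the column only if it is missing.
    cols = {r["name"] for r in conn.execute("PRAGMA table_info(articles)").fetchall()}
    if "category" not in cols:
        conn.execute("ALTER TABLE articles ADD COLUMN category TEXT DEFAULT 'domestic'")

    conn.execute("INSERT INTO articles (hash, title, category) VALUES ('a2', 'new row', 'overseas')")
    print([dict(r) for r in conn.execute("SELECT hash, category FROM articles")])
    # [{'hash': 'a1', 'category': 'domestic'}, {'hash': 'a2', 'category': 'overseas'}]

Rows that existed before the ALTER TABLE report the 'domestic' default, so older articles stay queryable under the new filter.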

View File

@@ -3,7 +3,7 @@ from fastapi import FastAPI
 from apscheduler.schedulers.background import BackgroundScheduler
 from .db import init_db, save_articles, get_latest_articles
-from .scraper import fetch_market_news, fetch_major_indices
+from .scraper import fetch_market_news, fetch_major_indices, fetch_overseas_news
 
 app = FastAPI()
 scheduler = BackgroundScheduler(timezone=os.getenv("TZ", "Asia/Seoul"))
@@ -23,18 +23,26 @@ def on_startup():
 def run_scraping_job():
     print("[StockLab] Starting news scraping...")
-    articles = fetch_market_news()
-    count = save_articles(articles)
-    print(f"[StockLab] Saved {count} new articles.")
+    # 1. Domestic
+    articles_kr = fetch_market_news()
+    count_kr = save_articles(articles_kr)
+    # 2. Overseas
+    articles_world = fetch_overseas_news()
+    count_world = save_articles(articles_world)
+    print(f"[StockLab] Saved {count_kr} domestic, {count_world} overseas articles.")
 
 @app.get("/health")
 def health():
     return {"ok": True}
 
 @app.get("/api/stock/news")
-def get_news(limit: int = 20):
-    """Fetch the latest stock news"""
-    return get_latest_articles(limit)
+def get_news(limit: int = 20, category: str = None):
+    """Fetch the latest stock news (category: 'domestic' | 'overseas')"""
+    return get_latest_articles(limit, category)
 
 @app.get("/api/stock/indices")
 def get_indices():
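
For reference, the extended endpoint can be exercised like this once the service is running; the host/port and the use of the requests library are assumptions, not part of the commit:

    import requests

    BASE = "http://localhost:8000"  # assumed local address of the FastAPI app

    domestic = requests.get(f"{BASE}/api/stock/news", params={"limit": 5, "category": "domestic"}, timeout=5).json()
    overseas = requests.get(f"{BASE}/api/stock/news", params={"limit": 5, "category": "overseas"}, timeout=5).json()
    latest = requests.get(f"{BASE}/api/stock/news", params={"limit": 5}, timeout=5).json()  # no filter: both categories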

View File

@@ -5,6 +5,10 @@ import time
 # Naver Finance main market news
 NAVER_FINANCE_NEWS_URL = "https://finance.naver.com/news/mainnews.naver"
+# Overseas market news
+NAVER_FINANCE_WORLD_NEWS_URL = "https://finance.naver.com/news/news_list.naver?mode=LSS3D&section_id=101&section_id2=258"
+# Overseas market main page (for indices)
+NAVER_FINANCE_WORLD_URL = "https://finance.naver.com/world/"
 
 def fetch_market_news() -> List[Dict[str, str]]:
     """
@@ -61,11 +65,80 @@ def fetch_market_news() -> List[Dict[str, str]]:
"summary": summary, "summary": summary,
"press": press, "press": press,
"date": date, "date": date,
"crawled_at": time.strftime("%Y-%m-%d %H:%M:%S") "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
"category": "domestic"
}) })
return articles return articles
def fetch_overseas_news() -> List[Dict[str, str]]:
"""
네이버 금융 해외증시 뉴스 크롤링
"""
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
}
resp = requests.get(NAVER_FINANCE_WORLD_NEWS_URL, headers=headers, timeout=10)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser", from_encoding="cp949")
# 구조: div.newsList > ul > li
# 구조가 mainnews와 비슷하지만 약간 다름. dl > dt/dd
articles = []
news_list = soup.select(".newsList ul li")
for li in news_list:
dl = li.select_one("dl")
if not dl: continue
# 썸네일 있을 경우 dt.thumb가 있고, 제목은 dt.articleSubject 또는 dd.articleSubject
subject_tag = dl.select_one(".articleSubject a")
if not subject_tag:
# 썸네일 없는 경우 dt가 제목일 수 있음
subject_tag = dl.select_one("dt a")
# 근데 dt가 thumb일 수도 있어서 클래스 확인 필요
if subject_tag and subject_tag.find("img"):
# 이건 썸네일. 다음 형제나 dd를 찾아야 함
subject_tag = dl.select_one("dd.articleSubject a")
if not subject_tag: continue
title = subject_tag.get_text(strip=True)
link = "https://finance.naver.com" + subject_tag["href"]
summary_tag = dl.select_one(".articleSummary")
summary = ""
press = ""
date = ""
if summary_tag:
# 불필요한 태그 제거
for child in summary_tag.select(".press, .wdate"):
if "press" in child.get("class", []):
press = child.get_text(strip=True)
if "wdate" in child.get("class", []):
date = child.get_text(strip=True)
child.extract()
summary = summary_tag.get_text(strip=True)
articles.append({
"title": title,
"link": link,
"summary": summary,
"press": press,
"date": date,
"crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
"category": "overseas"
})
return articles
except Exception as e:
print(f"[StockLab] Overseas news failed: {e}")
return []
except Exception as e: except Exception as e:
print(f"[StockLab] Scraping failed: {e}") print(f"[StockLab] Scraping failed: {e}")
return [] return []
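
To see what the dt/dd handling above does without hitting the network, here is a small offline sketch (not from the commit) run against a hand-written fragment shaped like the assumed Naver list markup; the live page may differ:

    from bs4 import BeautifulSoup

    html = """
    <div class="newsList"><ul><li><dl>
      <dt class="articleSubject"><a href="/news/read?id=1">Dow closes higher</a></dt>
      <dd class="articleSummary">US stocks rallied...
        <span class="press">Example Wire</span>
        <span class="wdate">2026-01-26 07:00</span>
      </dd>
    </dl></li></ul></div>
    """
    dl = BeautifulSoup(html, "html.parser").select_one(".newsList ul li dl")
    subject_tag = dl.select_one(".articleSubject a")
    summary_tag = dl.select_one(".articleSummary")
    press = date = ""
    for child in summary_tag.select(".press, .wdate"):
        if "press" in child.get("class", []):
            press = child.get_text(strip=True)
        if "wdate" in child.get("class", []):
            date = child.get_text(strip=True)
        child.extract()  # remove so the remaining text is just the summary
    print(subject_tag.get_text(strip=True), "|", summary_tag.get_text(strip=True), "|", press, "|", date)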
@@ -76,61 +149,116 @@ def fetch_major_indices() -> Dict[str, Any]:
""" """
url = "https://finance.naver.com/" url = "https://finance.naver.com/"
try: try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
}
resp = requests.get(url, headers=headers, timeout=5)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser", from_encoding="cp949")
indices = []
# 네이버 금융 홈 상단 'section_stock_market' 내부
# top_kospi, top_kosdaq, top_kpsi200
targets = [ targets = [
{"key": "KOSPI", "selector": ".kospi_area"}, {"key": "KOSPI", "selector": ".kospi_area", "url": "https://finance.naver.com/"},
{"key": "KOSDAQ", "selector": ".kosdaq_area"}, {"key": "KOSDAQ", "selector": ".kosdaq_area", "url": "https://finance.naver.com/"},
{"key": "KOSPI200", "selector": ".kospi200_area"}, {"key": "KOSPI200", "selector": ".kospi200_area", "url": "https://finance.naver.com/"},
] ]
for t in targets: # 해외 지수 (네이버 금융 해외 메인) - 여기서는 별도 URL 호출 필요하거나, 메인에 있는지 확인
area = soup.select_one(t["selector"]) # 네이버 메인에는 해외지수가 안 나옴. https://finance.naver.com/world/ 에서 긁어야 함
if not area: # 그러나 한 번에 처리하기 위해 함수 내에서 추가 호출
continue
# 현재가 indices = []
# 1. 국내
# (기존 로직 유지하되 targets 루프 안에서 처리)
# 하지만 해외 지수 크롤링 코드가 복잡해지므로, 아래에서 별도로 호출
# --- 국내 ---
resp_kr = requests.get("https://finance.naver.com/", headers=headers, timeout=5)
soup_kr = BeautifulSoup(resp_kr.content, "html.parser", from_encoding="cp949")
for t in targets:
area = soup_kr.select_one(t["selector"])
if not area: continue
# (기존 파싱 로직)
num_tag = area.select_one(".num") num_tag = area.select_one(".num")
value = num_tag.get_text(strip=True) if num_tag else "" value = num_tag.get_text(strip=True) if num_tag else ""
# 등락 (num2) -> 화살표, 부호 확인 필요
# num2 (상승), num3 (하락) 클래스가 유동적일 수 있음
# .num2 (수치), .num3 (퍼센트)
# 보통 .nk (수치), .per (퍼센트) 로 나뉨
change_val_tag = area.select_one(".num2") change_val_tag = area.select_one(".num2")
change_pct_tag = area.select_one(".num3") change_pct_tag = area.select_one(".num3")
change_val = change_val_tag.get_text(strip=True) if change_val_tag else "" change_val = change_val_tag.get_text(strip=True) if change_val_tag else ""
change_pct = change_pct_tag.get_text(strip=True) if change_pct_tag else "" change_pct = change_pct_tag.get_text(strip=True) if change_pct_tag else ""
# 상승/하락 부호 처리 (화살표 텍스트나 클래스 보고 판단해야 함)
# 단순 텍스트로는 '상승 10.5' 처럼 들어있을 수 있음
# 여기서는 단순 텍스트값 그대로 리턴
# 방향(상승/하락) 클래스 확인
direction = "" direction = ""
if area.select_one(".bu_p"): direction = "red" # 상승 if area.select_one(".bu_p"): direction = "red"
elif area.select_one(".bu_m"): direction = "blue" # 하락 elif area.select_one(".bu_m"): direction = "blue"
indices.append({ indices.append({
"name": t["key"], "name": t["key"],
"value": value, "value": value,
"change_value": change_val, "change_value": change_val,
"change_percent": change_pct, "change_percent": change_pct,
"direction": direction "direction": direction,
"type": "domestic"
}) })
# --- 해외 (DJI, NAS, SPI) ---
try:
resp_world = requests.get(NAVER_FINANCE_WORLD_URL, headers=headers, timeout=5)
soup_world = BeautifulSoup(resp_world.content, "html.parser", from_encoding="cp949")
# 구조: div.market_include > div.market_data > ul.data_list > li
# 하지만 world 메인에는 주요 지수가 상단에 있음: .sise_major
# DJI, NAS, SPI
world_targets = [
{"key": "DJI", "selector": ".sise_major .data_list li:nth-child(1)"},
{"key": "NAS", "selector": ".sise_major .data_list li:nth-child(2)"},
{"key": "SPI", "selector": ".sise_major .data_list li:nth-child(3)"},
]
for wt in world_targets:
li = soup_world.select_one(wt["selector"])
if not li: continue
# 이름: dt
# name = li.select_one("dt").get_text(strip=True) (보통 '다우산업' 등)
# 값: dd.point_status strong
val_tag = li.select_one("dd.point_status strong")
value = val_tag.get_text(strip=True) if val_tag else ""
# 등락: dd.point_status em
# 여기는 값과 퍼센트가 em 안에 같이 있거나 분리됨
# 구조: <em class="red"> <span class="blind">상승</span> 123.45 <span class="blind">상승</span> +1.2% </em>
# 파싱이 까다로우니 텍스트 전체 가져와서 분리 시도
status_dd = li.select_one("dd.point_status")
if status_dd:
# em 태그들 제거하면서 텍스트 추출? 아니면 em 안을 분석
em = status_dd.select_one("em")
if em:
# class="red" / "blue"
direction = ""
if "red" in em.get("class", []): direction = "red"
elif "blue" in em.get("class", []): direction = "blue"
txt = em.get_text(" ", strip=True) # "상승 123.45 상승 +1.2%"
# 숫자만 추출하거나 단순 처리.
# 네이버 해외 증시 메인 구조가 복잡하므로,
# 단순히 리스트 페이지(.w_major_list) 등 다른 곳을 보는 게 나을 수 있음
# 하지만 일단 간단히 처리: value 밑에 .point_status > em 이 등락폭
pass
# 대안: 주요 3대 지수는 aside에 .sise_major 말고 데이터 테이블이나 리스트가 더 명확함
# 여기서는 aside .sise_major > ul > li 구조를 쓴다고 가정하고,
# 만약 파싱이 어려우면 값만이라도 가져옴.
indices.append({
"name": wt["key"],
"value": value,
"change_value": "", # 파싱 복잡도 때문에 일단 생략 (추후 보완)
"change_percent": "",
"direction": "", # direction은 위에서 red/blue 잡으면 됨
"type": "overseas"
})
except Exception as e:
print(f"[StockLab] World indices failed: {e}")
return {"indices": indices, "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S")} return {"indices": indices, "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S")}
except Exception as e: except Exception as e:
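
The change value/percent parsing that the overseas branch defers could look roughly like the sketch below; the fragment is hand-written to match the structure described in the comments, so the markup and selectors are assumptions rather than verified page structure:

    from bs4 import BeautifulSoup

    html = """
    <dd class="point_status">
      <strong>38,654.42</strong>
      <em class="red"><span class="blind">상승</span> 134.58 <span class="blind">상승</span> +0.35%</em>
    </dd>
    """
    dd = BeautifulSoup(html, "html.parser").select_one("dd.point_status")
    value = dd.select_one("strong").get_text(strip=True)
    em = dd.select_one("em")
    direction = "red" if "red" in em.get("class", []) else ("blue" if "blue" in em.get("class", []) else "")
    # Drop the screen-reader-only labels, then split what remains into value and percent.
    for blind in em.select(".blind"):
        blind.extract()
    parts = em.get_text(" ", strip=True).split()
    change_value, change_percent = (parts + ["", ""])[:2]
    print(value, change_value, change_percent, direction)  # 38,654.42 134.58 +0.35% red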