From 3d0dd24f2751e2c5db7c21d8e0a5f7092cb854a6 Mon Sep 17 00:00:00 2001
From: gahusb
Date: Mon, 26 Jan 2026 03:45:19 +0900
Subject: [PATCH] feat: add overseas financial news and indices support
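
Add a 'category' column ('domestic' / 'overseas') to the articles table,
including an ALTER TABLE migration for existing databases, a
fetch_overseas_news() scraper for the Naver Finance world-news list, an
optional ?category= filter on /api/stock/news, and DJI/NAS/SPI entries in
the /api/stock/indices payload.

Example query against the new filter (a sketch only; it assumes the app is
served locally on port 8000, which is not specified anywhere in this patch):

    import requests

    BASE = "http://localhost:8000"  # hypothetical local dev server

    # Latest overseas articles only
    news = requests.get(
        f"{BASE}/api/stock/news",
        params={"limit": 10, "category": "overseas"},
        timeout=5,
    ).json()

    # Domestic and overseas indices in one response
    indices = requests.get(f"{BASE}/api/stock/indices", timeout=5).json()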
---
 stock-lab/app/db.py      |  29 ++++--
 stock-lab/app/main.py    |  22 ++++--
 stock-lab/app/scraper.py | 198 ++++++++++++++++++++++++++++++++-------
 3 files changed, 199 insertions(+), 50 deletions(-)

diff --git a/stock-lab/app/db.py b/stock-lab/app/db.py
index b91aace..85c29c9 100644
--- a/stock-lab/app/db.py
+++ b/stock-lab/app/db.py
@@ -17,6 +17,7 @@ def init_db():
         CREATE TABLE IF NOT EXISTS articles (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             hash TEXT UNIQUE NOT NULL,
+            category TEXT DEFAULT 'domestic',
             title TEXT NOT NULL,
             link TEXT,
             summary TEXT,
@@ -26,6 +27,11 @@ def init_db():
         )
     """)
     conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_crawled ON articles(crawled_at DESC)")
+
+    # Add the category column when migrating an existing table
+    cols = {r["name"] for r in conn.execute("PRAGMA table_info(articles)").fetchall()}
+    if "category" not in cols:
+        conn.execute("ALTER TABLE articles ADD COLUMN category TEXT DEFAULT 'domestic'")
 
 def save_articles(articles: List[Dict[str, str]]) -> int:
     count = 0
@@ -36,19 +42,26 @@ def save_articles(articles: List[Dict[str, str]]) -> int:
             h = hashlib.md5(unique_str.encode()).hexdigest()
             try:
+                cat = a.get("category", "domestic")
                 conn.execute("""
-                    INSERT INTO articles (hash, title, link, summary, press, pub_date, crawled_at)
-                    VALUES (?, ?, ?, ?, ?, ?, ?)
-                """, (h, a['title'], a['link'], a['summary'], a['press'], a['date'], a['crawled_at']))
+                    INSERT INTO articles (hash, category, title, link, summary, press, pub_date, crawled_at)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                """, (h, cat, a['title'], a['link'], a['summary'], a['press'], a['date'], a['crawled_at']))
                 count += 1
            except sqlite3.IntegrityError:
                pass  # already stored
 
    return count
 
-def get_latest_articles(limit: int = 20) -> List[Dict[str, Any]]:
+def get_latest_articles(limit: int = 20, category: str = None) -> List[Dict[str, Any]]:
     with _conn() as conn:
-        rows = conn.execute(
-            "SELECT * FROM articles ORDER BY crawled_at DESC, id DESC LIMIT ?",
-            (limit,)
-        ).fetchall()
+        if category:
+            rows = conn.execute(
+                "SELECT * FROM articles WHERE category = ? ORDER BY crawled_at DESC, id DESC LIMIT ?",
+                (category, limit)
+            ).fetchall()
+        else:
+            rows = conn.execute(
+                "SELECT * FROM articles ORDER BY crawled_at DESC, id DESC LIMIT ?",
+                (limit,)
+            ).fetchall()
     return [dict(r) for r in rows]
diff --git a/stock-lab/app/main.py b/stock-lab/app/main.py
index 0d14882..dc27818 100644
--- a/stock-lab/app/main.py
+++ b/stock-lab/app/main.py
@@ -3,7 +3,7 @@
 from fastapi import FastAPI
 from apscheduler.schedulers.background import BackgroundScheduler
 from .db import init_db, save_articles, get_latest_articles
-from .scraper import fetch_market_news, fetch_major_indices
+from .scraper import fetch_market_news, fetch_major_indices, fetch_overseas_news
 
 app = FastAPI()
 scheduler = BackgroundScheduler(timezone=os.getenv("TZ", "Asia/Seoul"))
@@ -23,18 +23,25 @@ def on_startup():
 
 def run_scraping_job():
     print("[StockLab] Starting news scraping...")
-    articles = fetch_market_news()
-    count = save_articles(articles)
-    print(f"[StockLab] Saved {count} new articles.")
+
+    # 1. Domestic
+    articles_kr = fetch_market_news()
+    count_kr = save_articles(articles_kr)
+
+    # 2. Overseas
+    articles_world = fetch_overseas_news()
+    count_world = save_articles(articles_world)
+
+    print(f"[StockLab] Saved {count_kr} domestic, {count_world} overseas articles.")
 
 @app.get("/health")
 def health():
     return {"ok": True}
 
 @app.get("/api/stock/news")
-def get_news(limit: int = 20):
-    """Fetch the latest stock news"""
-    return get_latest_articles(limit)
+def get_news(limit: int = 20, category: str = None):
+    """Fetch the latest stock news (category: 'domestic' | 'overseas')"""
+    return get_latest_articles(limit, category)
 
 @app.get("/api/stock/indices")
 def get_indices():
diff --git a/stock-lab/app/scraper.py b/stock-lab/app/scraper.py
index a84b1df..8c45ecc 100644
--- a/stock-lab/app/scraper.py
+++ b/stock-lab/app/scraper.py
@@ -5,6 +5,10 @@
 import time
 
 # Naver Finance main market news
 NAVER_FINANCE_NEWS_URL = "https://finance.naver.com/news/mainnews.naver"
+# Overseas market news list
+NAVER_FINANCE_WORLD_NEWS_URL = "https://finance.naver.com/news/news_list.naver?mode=LSS3D&section_id=101&section_id2=258"
+# Overseas market main page (for indices)
+NAVER_FINANCE_WORLD_URL = "https://finance.naver.com/world/"
 
 def fetch_market_news() -> List[Dict[str, str]]:
     """
@@ -61,11 +65,80 @@ def fetch_market_news() -> List[Dict[str, str]]:
                 "summary": summary,
                 "press": press,
                 "date": date,
-                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S")
+                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
+                "category": "domestic"
             })
 
         return articles
 
     except Exception as e:
         print(f"[StockLab] Scraping failed: {e}")
         return []
+
+def fetch_overseas_news() -> List[Dict[str, str]]:
+    """
+    Crawl overseas market news from Naver Finance.
+    """
+    try:
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
+        }
+        resp = requests.get(NAVER_FINANCE_WORLD_NEWS_URL, headers=headers, timeout=10)
+        resp.raise_for_status()
+
+        soup = BeautifulSoup(resp.content, "html.parser", from_encoding="cp949")
+
+        # Structure: div.newsList > ul > li; similar to mainnews, but each item is a dl with dt/dd children
+        articles = []
+        news_list = soup.select(".newsList ul li")
+
+        for li in news_list:
+            dl = li.select_one("dl")
+            if not dl:
+                continue
+
+            # With a thumbnail the item has dt.thumb and the title sits in dt.articleSubject or dd.articleSubject
+            subject_tag = dl.select_one(".articleSubject a")
+            if not subject_tag:
+                # Without a thumbnail the first dt may hold the title
+                subject_tag = dl.select_one("dt a")
+                if subject_tag and subject_tag.find("img"):
+                    # That dt is the thumbnail; the title is in the following dd
+                    subject_tag = dl.select_one("dd.articleSubject a")
+
+            if not subject_tag:
+                continue
+
+            title = subject_tag.get_text(strip=True)
+            link = "https://finance.naver.com" + subject_tag["href"]
+
+            summary_tag = dl.select_one(".articleSummary")
+            summary = ""
+            press = ""
+            date = ""
+
+            if summary_tag:
+                # Record press/date, then drop those tags so only the summary text remains
+                for child in summary_tag.select(".press, .wdate"):
+                    if "press" in child.get("class", []):
+                        press = child.get_text(strip=True)
+                    if "wdate" in child.get("class", []):
+                        date = child.get_text(strip=True)
+                    child.extract()
+                summary = summary_tag.get_text(strip=True)
+
+            articles.append({
+                "title": title,
+                "link": link,
+                "summary": summary,
+                "press": press,
+                "date": date,
+                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
+                "category": "overseas"
+            })
+
+        return articles
+
+    except Exception as e:
+        print(f"[StockLab] Overseas news failed: {e}")
+        return []
@@ -76,61 +149,116 @@ def fetch_major_indices() -> Dict[str, Any]:
     """
     url = "https://finance.naver.com/"
     try:
         headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
         }
-        resp = requests.get(url, headers=headers, timeout=5)
-        resp.raise_for_status()
-
-        soup = BeautifulSoup(resp.content, "html.parser", from_encoding="cp949")
 
         indices = []
-        # Inside 'section_stock_market' at the top of the Naver Finance home page
-        # top_kospi, top_kosdaq, top_kpsi200
+
+        # --- Domestic (KOSPI, KOSDAQ, KOSPI200): Naver Finance home page ---
         targets = [
             {"key": "KOSPI", "selector": ".kospi_area"},
             {"key": "KOSDAQ", "selector": ".kosdaq_area"},
             {"key": "KOSPI200", "selector": ".kospi200_area"},
         ]
 
+        resp_kr = requests.get(url, headers=headers, timeout=5)
+        resp_kr.raise_for_status()
+        soup_kr = BeautifulSoup(resp_kr.content, "html.parser", from_encoding="cp949")
+
         for t in targets:
-            area = soup.select_one(t["selector"])
+            area = soup_kr.select_one(t["selector"])
             if not area:
                 continue
 
             # Current value
             num_tag = area.select_one(".num")
             value = num_tag.get_text(strip=True) if num_tag else ""
 
-            # Change (num2) -> the arrow/sign still needs checking
-            # the num2 (up) / num3 (down) classes can vary
-            # .num2 (value), .num3 (percent)
-            # usually split into .nk (value) and .per (percent)
+            # Change value (.num2) and change percent (.num3); class names can vary
             change_val_tag = area.select_one(".num2")
             change_pct_tag = area.select_one(".num3")
             change_val = change_val_tag.get_text(strip=True) if change_val_tag else ""
             change_pct = change_pct_tag.get_text(strip=True) if change_pct_tag else ""
 
-            # Up/down sign handling (must be judged from the arrow text or class);
-            # as plain text the value can read like '상승 10.5' ('up 10.5'),
-            # so the raw text is returned as-is here
-
-            # Direction (up/down) class check
+            # Direction class: .bu_p (up) -> "red", .bu_m (down) -> "blue"
             direction = ""
-            if area.select_one(".bu_p"): direction = "red"   # up
-            elif area.select_one(".bu_m"): direction = "blue" # down
+            if area.select_one(".bu_p"): direction = "red"
+            elif area.select_one(".bu_m"): direction = "blue"
 
             indices.append({
                 "name": t["key"],
                 "value": value,
                 "change_value": change_val,
                 "change_percent": change_pct,
-                "direction": direction
+                "direction": direction,
+                "type": "domestic"
             })
 
+        # --- Overseas (DJI, NAS, SPI): these are not on the domestic home page,
+        # so https://finance.naver.com/world/ is fetched separately ---
+        try:
+            resp_world = requests.get(NAVER_FINANCE_WORLD_URL, headers=headers, timeout=5)
+            soup_world = BeautifulSoup(resp_world.content, "html.parser", from_encoding="cp949")
+
+            # The world main page lists the major indices in .sise_major > .data_list
+            world_targets = [
+                {"key": "DJI", "selector": ".sise_major .data_list li:nth-child(1)"},
+                {"key": "NAS", "selector": ".sise_major .data_list li:nth-child(2)"},
+                {"key": "SPI", "selector": ".sise_major .data_list li:nth-child(3)"},
+            ]
+
+            for wt in world_targets:
+                li = soup_world.select_one(wt["selector"])
+                if not li:
+                    continue
+
+                # Current value: dd.point_status > strong
+                val_tag = li.select_one("dd.point_status strong")
+                value = val_tag.get_text(strip=True) if val_tag else ""
+
+                # Direction comes from the em class ("red" = up, "blue" = down).
+                # The change value/percent markup is irregular, so both are left
+                # blank for now and can be filled in later.
+                direction = ""
+                em = li.select_one("dd.point_status em")
+                if em:
+                    if "red" in em.get("class", []): direction = "red"
+                    elif "blue" in em.get("class", []): direction = "blue"
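+
+                # Possible follow-up for the deferred change parsing (an untested
+                # sketch): the em text mixes the change and the percentage,
+                # e.g. "상승 123.45 +1.2%", so inside the `if em:` branch above
+                # something like
+                #     txt = em.get_text(" ", strip=True)
+                #     val = re.search(r"[\d,.]+", txt)       # change value
+                #     pct = re.search(r"[+-]?[\d.]+%", txt)  # change percent
+                # (with the re module imported) could split them once the
+                # markup is confirmed.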
+
+                indices.append({
+                    "name": wt["key"],
+                    "value": value,
+                    "change_value": "",
+                    "change_percent": "",
+                    "direction": direction,
+                    "type": "overseas"
+                })
+
+        except Exception as e:
+            print(f"[StockLab] World indices failed: {e}")
+
         return {"indices": indices, "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S")}
 
     except Exception as e: