From 2603c7ce201587794a95fc0e8d6b6a069fc66cc2 Mon Sep 17 00:00:00 2001
From: gahusb
Date: Tue, 7 Apr 2026 00:42:55 +0900
Subject: [PATCH] =?UTF-8?q?feat(blog-lab):=20=EB=84=A4=EC=9D=B4=EB=B2=84?=
 =?UTF-8?q?=20=EB=B8=94=EB=A1=9C=EA=B7=B8=20=EB=B3=B8=EB=AC=B8=20=ED=81=AC?=
 =?UTF-8?q?=EB=A1=A4=EB=A7=81=20=EB=AA=A8=EB=93=88=20=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.6
---
 blog-lab/app/web_crawler.py        | 99 ++++++++++++++++++++++++++++++
 blog-lab/pytest.ini                |  3 +
 blog-lab/requirements.txt          |  2 +
 blog-lab/tests/__init__.py         |  0
 blog-lab/tests/conftest.py         |  9 +++
 blog-lab/tests/test_web_crawler.py | 68 ++++++++++++++++++++
 6 files changed, 181 insertions(+)
 create mode 100644 blog-lab/app/web_crawler.py
 create mode 100644 blog-lab/pytest.ini
 create mode 100644 blog-lab/tests/__init__.py
 create mode 100644 blog-lab/tests/conftest.py
 create mode 100644 blog-lab/tests/test_web_crawler.py

diff --git a/blog-lab/app/web_crawler.py b/blog-lab/app/web_crawler.py
new file mode 100644
index 0000000..0927a6a
--- /dev/null
+++ b/blog-lab/app/web_crawler.py
@@ -0,0 +1,99 @@
+"""네이버 블로그 본문 크롤링 모듈."""
+
+import asyncio
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple
+from urllib.parse import urlparse
+
+import httpx
+from bs4 import BeautifulSoup
+
+logger = logging.getLogger(__name__)
+
+_TIMEOUT = 10  # 글당 크롤링 타임아웃 (초)
+_MAX_CONTENT_LENGTH = 2000  # 본문 최대 길이
+
+# 네이버 블로그 URL 패턴: blog.naver.com/{blogId}/{logNo}
+_BLOG_URL_RE = re.compile(r"blog\.naver\.com/([^/]+)/(\d+)")
+
+
+def _parse_naver_blog_url(url: str) -> Optional[Tuple[str, str]]:
+    """네이버 블로그 URL에서 blogId, logNo 추출. 실패 시 None."""
+    match = _BLOG_URL_RE.search(url)
+    if not match:
+        return None
+    return match.group(1), match.group(2)
+
+
+async def _fetch_html(url: str) -> str:
+    """URL에서 HTML을 가져온다."""
+    async with httpx.AsyncClient(timeout=_TIMEOUT, follow_redirects=True) as client:
+        resp = await client.get(url, headers={
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+        })
+        resp.raise_for_status()
+        return resp.text
+
+
+def _extract_text(html: str) -> str:
+    """HTML에서 본문 텍스트를 추출한다."""
+    soup = BeautifulSoup(html, "html.parser")
+
+    # 스마트에디터 3 (SE3)
+    container = soup.select_one("div.se-main-container")
+    if not container:
+        # 구 에디터
+        container = soup.select_one("div#postViewArea")
+    if not container:
+        # 폴백: body 전체
+        container = soup.body
+
+    if not container:
+        return ""
+
+    # 스크립트/스타일 제거
+    for tag in container.find_all(["script", "style"]):
+        tag.decompose()
+
+    text = container.get_text(separator="\n", strip=True)
+    return text[:_MAX_CONTENT_LENGTH]
+
+
+async def crawl_blog_content(url: str) -> str:
+    """네이버 블로그 URL에서 본문 텍스트 추출.
+
+    - 네이버 블로그가 아니면 빈 문자열
+    - 크롤링 실패 시 빈 문자열 (에러 로그만)
+    - 본문 최대 2,000자
+    """
+    parsed = _parse_naver_blog_url(url)
+    if not parsed:
+        return ""
+
+    blog_id, log_no = parsed
+    # iframe 내부 실제 본문 URL
+    post_url = f"https://blog.naver.com/PostView.naver?blogId={blog_id}&logNo={log_no}"
+
+    try:
+        html = await _fetch_html(post_url)
+        return _extract_text(html)
+    except Exception as e:
+        logger.warning("블로그 크롤링 실패 (%s): %s", url, e)
+        return ""
+
+
+async def enrich_top_blogs(top_blogs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """top_blogs 리스트 각 항목에 content 필드를 추가.

+    개별 크롤링 실패 시 해당 항목의 content를 빈 문자열로 설정하고 나머지 계속 진행.
+    """
+    result = []
+    for blog in top_blogs:
+        enriched = dict(blog)
+        try:
+            enriched["content"] = await crawl_blog_content(blog.get("link", ""))
+        except Exception:
+            enriched["content"] = ""
+        result.append(enriched)
+    return result
diff --git a/blog-lab/pytest.ini b/blog-lab/pytest.ini
new file mode 100644
index 0000000..82bc8d1
--- /dev/null
+++ b/blog-lab/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+asyncio_mode = auto
+pythonpath = .
diff --git a/blog-lab/requirements.txt b/blog-lab/requirements.txt
index 6a719f5..7cc350e 100644
--- a/blog-lab/requirements.txt
+++ b/blog-lab/requirements.txt
@@ -2,3 +2,5 @@ fastapi==0.115.6
 uvicorn[standard]==0.34.0
 requests==2.32.3
 anthropic==0.52.0
+beautifulsoup4>=4.12
+httpx>=0.27
diff --git a/blog-lab/tests/__init__.py b/blog-lab/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/blog-lab/tests/conftest.py b/blog-lab/tests/conftest.py
new file mode 100644
index 0000000..4495650
--- /dev/null
+++ b/blog-lab/tests/conftest.py
@@ -0,0 +1,9 @@
+"""공통 테스트 픽스처."""
+import os
+import sys
+
+# app 패키지를 blog_lab_app으로도 import 가능하게
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+if "blog_lab_app" not in sys.modules:
+    import app as blog_lab_app
+    sys.modules["blog_lab_app"] = blog_lab_app
diff --git a/blog-lab/tests/test_web_crawler.py b/blog-lab/tests/test_web_crawler.py
new file mode 100644
index 0000000..f54d157
--- /dev/null
+++ b/blog-lab/tests/test_web_crawler.py
@@ -0,0 +1,68 @@
+"""web_crawler 모듈 테스트."""
+import pytest
+from unittest.mock import patch, AsyncMock
+from app.web_crawler import crawl_blog_content, enrich_top_blogs, _parse_naver_blog_url
+
+
+def test_parse_naver_blog_url_valid():
+    """blog.naver.com URL에서 blogId와 logNo를 올바르게 파싱."""
+    result = _parse_naver_blog_url("https://blog.naver.com/testuser/123456")
+    assert result == ("testuser", "123456")
+
+
+def test_parse_returns_none_for_invalid_url():
+    """잘못된 URL은 None 반환."""
+    result = _parse_naver_blog_url("https://example.com/post")
+    assert result is None
+
+
+@pytest.mark.asyncio
+async def test_crawl_returns_empty_on_non_naver_url():
+    """네이버 블로그가 아닌 URL은 빈 문자열 반환."""
+    result = await crawl_blog_content("https://example.com/post")
+    assert result == ""
+
+
+@pytest.mark.asyncio
+async def test_crawl_truncates_to_2000_chars():
+    """본문이 2000자를 초과하면 잘라낸다."""
+    long_html = f'<html><body><div class="se-main-container">{"가" * 3000}</div></body></html>'
+    with patch("app.web_crawler._fetch_html", new_callable=AsyncMock, return_value=long_html):
+        result = await crawl_blog_content("https://blog.naver.com/testuser/123")
+    assert len(result) <= 2000
+
+
+@pytest.mark.asyncio
+async def test_crawl_returns_empty_on_fetch_failure():
+    """HTTP 요청 실패 시 빈 문자열 반환."""
+    with patch("app.web_crawler._fetch_html", new_callable=AsyncMock, side_effect=Exception("timeout")):
+        result = await crawl_blog_content("https://blog.naver.com/testuser/123")
+    assert result == ""
+
+
+@pytest.mark.asyncio
+async def test_enrich_top_blogs_adds_content_field():
+    """enrich_top_blogs가 각 블로그에 content 필드를 추가."""
+    blogs = [
+        {"title": "테스트", "link": "https://blog.naver.com/user1/111", "bloggername": "유저1", "description": "설명"},
+        {"title": "테스트2", "link": "https://blog.naver.com/user2/222", "bloggername": "유저2", "description": "설명2"},
+    ]
+    with patch("app.web_crawler.crawl_blog_content", new_callable=AsyncMock, return_value="크롤링된 본문"):
+        result = await enrich_top_blogs(blogs)
+    assert len(result) == 2
+    assert result[0]["content"] == "크롤링된 본문"
+    assert result[1]["content"] == "크롤링된 본문"
+
+
+@pytest.mark.asyncio
+async def test_enrich_top_blogs_handles_partial_failure():
+    """일부 크롤링 실패 시에도 나머지는 정상 처리."""
+    blogs = [
+        {"title": "성공", "link": "https://blog.naver.com/user1/111"},
+        {"title": "실패", "link": "https://blog.naver.com/user2/222"},
+    ]
+    side_effects = ["성공 본문", Exception("fail")]
+    with patch("app.web_crawler.crawl_blog_content", new_callable=AsyncMock, side_effect=side_effects):
+        result = await enrich_top_blogs(blogs)
+    assert result[0]["content"] == "성공 본문"
+    assert result[1]["content"] == ""