From 2603c7ce201587794a95fc0e8d6b6a069fc66cc2 Mon Sep 17 00:00:00 2001
From: gahusb
Date: Tue, 7 Apr 2026 00:42:55 +0900
Subject: [PATCH] =?UTF-8?q?feat(blog-lab):=20=EB=84=A4=EC=9D=B4=EB=B2=84?=
 =?UTF-8?q?=20=EB=B8=94=EB=A1=9C=EA=B7=B8=20=EB=B3=B8=EB=AC=B8=20=ED=81=AC?=
 =?UTF-8?q?=EB=A1=A4=EB=A7=81=20=EB=AA=A8=EB=93=88=20=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.6
---
 blog-lab/app/web_crawler.py        | 99 ++++++++++++++++++++++++++++++
 blog-lab/pytest.ini                |  3 +
 blog-lab/requirements.txt          |  2 +
 blog-lab/tests/__init__.py         |  0
 blog-lab/tests/conftest.py         |  9 +++
 blog-lab/tests/test_web_crawler.py | 68 ++++++++++++++++++++
 6 files changed, 181 insertions(+)
 create mode 100644 blog-lab/app/web_crawler.py
 create mode 100644 blog-lab/pytest.ini
 create mode 100644 blog-lab/tests/__init__.py
 create mode 100644 blog-lab/tests/conftest.py
 create mode 100644 blog-lab/tests/test_web_crawler.py

diff --git a/blog-lab/app/web_crawler.py b/blog-lab/app/web_crawler.py
new file mode 100644
index 0000000..0927a6a
--- /dev/null
+++ b/blog-lab/app/web_crawler.py
@@ -0,0 +1,99 @@
+"""네이버 블로그 본문 크롤링 모듈."""
+
+import asyncio
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple
+from urllib.parse import urlparse
+
+import httpx
+from bs4 import BeautifulSoup
+
+logger = logging.getLogger(__name__)
+
+_TIMEOUT = 10  # 글당 크롤링 타임아웃 (초)
+_MAX_CONTENT_LENGTH = 2000  # 본문 최대 길이
+
+# 네이버 블로그 URL 패턴: blog.naver.com/{blogId}/{logNo}
+_BLOG_URL_RE = re.compile(r"blog\.naver\.com/([^/]+)/(\d+)")
+
+
+def _parse_naver_blog_url(url: str) -> Optional[Tuple[str, str]]:
+    """네이버 블로그 URL에서 blogId, logNo 추출. 실패 시 None."""
+    match = _BLOG_URL_RE.search(url)
+    if not match:
+        return None
+    return match.group(1), match.group(2)
+
+
+async def _fetch_html(url: str) -> str:
+    """URL에서 HTML을 가져온다."""
+    async with httpx.AsyncClient(timeout=_TIMEOUT, follow_redirects=True) as client:
+        resp = await client.get(url, headers={
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+        })
+        resp.raise_for_status()
+        return resp.text
+
+
+def _extract_text(html: str) -> str:
+    """HTML에서 본문 텍스트를 추출한다."""
+    soup = BeautifulSoup(html, "html.parser")
+
+    # 스마트에디터 3 (SE3)
+    container = soup.select_one("div.se-main-container")
+    if not container:
+        # 구 에디터
+        container = soup.select_one("div#postViewArea")
+    if not container:
+        # 폴백: body 전체
+        container = soup.body
+
+    if not container:
+        return ""
+
+    # 스크립트/스타일 제거
+    for tag in container.find_all(["script", "style"]):
+        tag.decompose()
+
+    text = container.get_text(separator="\n", strip=True)
+    return text[:_MAX_CONTENT_LENGTH]
+
+
+async def crawl_blog_content(url: str) -> str:
+    """네이버 블로그 URL에서 본문 텍스트 추출.
+
+    - 네이버 블로그가 아니면 빈 문자열
+    - 크롤링 실패 시 빈 문자열 (에러 로그만)
+    - 본문 최대 2,000자
+    """
+    parsed = _parse_naver_blog_url(url)
+    if not parsed:
+        return ""
+
+    blog_id, log_no = parsed
+    # iframe 내부 실제 본문 URL
+    post_url = f"https://blog.naver.com/PostView.naver?blogId={blog_id}&logNo={log_no}"
+
+    try:
+        html = await _fetch_html(post_url)
+        return _extract_text(html)
+    except Exception as e:
+        logger.warning("블로그 크롤링 실패 (%s): %s", url, e)
+        return ""
+
+
+async def enrich_top_blogs(top_blogs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """top_blogs 리스트 각 항목에 content 필드를 추가.

+    개별 크롤링 실패 시 해당 항목의 content를 빈 문자열로 설정하고 나머지 계속 진행.
+    """
+    result = []
+    for blog in top_blogs:
+        enriched = dict(blog)
+        try:
+            enriched["content"] = await crawl_blog_content(blog.get("link", ""))
+        except Exception:
+            enriched["content"] = ""
+        result.append(enriched)
+    return result
diff --git a/blog-lab/pytest.ini b/blog-lab/pytest.ini
new file mode 100644
index 0000000..82bc8d1
--- /dev/null
+++ b/blog-lab/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+asyncio_mode = auto
+pythonpath = .
diff --git a/blog-lab/requirements.txt b/blog-lab/requirements.txt
index 6a719f5..7cc350e 100644
--- a/blog-lab/requirements.txt
+++ b/blog-lab/requirements.txt
@@ -2,3 +2,5 @@ fastapi==0.115.6
 uvicorn[standard]==0.34.0
 requests==2.32.3
 anthropic==0.52.0
+beautifulsoup4>=4.12
+httpx>=0.27
diff --git a/blog-lab/tests/__init__.py b/blog-lab/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/blog-lab/tests/conftest.py b/blog-lab/tests/conftest.py
new file mode 100644
index 0000000..4495650
--- /dev/null
+++ b/blog-lab/tests/conftest.py
@@ -0,0 +1,9 @@
+"""공통 테스트 픽스처."""
+import os
+import sys
+
+# app 패키지를 blog_lab_app으로도 import 가능하게
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+if "blog_lab_app" not in sys.modules:
+    import app as blog_lab_app
+    sys.modules["blog_lab_app"] = blog_lab_app
diff --git a/blog-lab/tests/test_web_crawler.py b/blog-lab/tests/test_web_crawler.py
new file mode 100644
index 0000000..f54d157
--- /dev/null
+++ b/blog-lab/tests/test_web_crawler.py
@@ -0,0 +1,68 @@
+"""web_crawler 모듈 테스트."""
+import pytest
+from unittest.mock import patch, AsyncMock
+from app.web_crawler import crawl_blog_content, enrich_top_blogs, _parse_naver_blog_url
+
+
+def test_parse_naver_blog_url_valid():
+    """blog.naver.com URL에서 blogId와 logNo를 올바르게 파싱."""
+    result = _parse_naver_blog_url("https://blog.naver.com/testuser/123456")
+    assert result == ("testuser", "123456")
+
+
+def test_parse_returns_none_for_invalid_url():
+    """잘못된 URL은 None 반환."""
+    result = _parse_naver_blog_url("https://example.com/post")
+    assert result is None
+
+
+@pytest.mark.asyncio
+async def test_crawl_returns_empty_on_non_naver_url():
+    """네이버 블로그가 아닌 URL은 빈 문자열 반환."""
+    result = await crawl_blog_content("https://example.com/post")
+    assert result == ""
+
+
+@pytest.mark.asyncio
+async def test_crawl_truncates_to_2000_chars():
+    """본문이 2000자를 초과하면 잘라낸다."""
+    long_html = f'<html><body><div class="se-main-container">{"가" * 3000}</div></body></html>'
+    with patch("app.web_crawler._fetch_html", new_callable=AsyncMock, return_value=long_html):
+        result = await crawl_blog_content("https://blog.naver.com/testuser/123")
+    assert len(result) <= 2000
+
+
+@pytest.mark.asyncio
+async def test_crawl_returns_empty_on_fetch_failure():
+    """HTTP 요청 실패 시 빈 문자열 반환."""
+    with patch("app.web_crawler._fetch_html", new_callable=AsyncMock, side_effect=Exception("timeout")):
+        result = await crawl_blog_content("https://blog.naver.com/testuser/123")
+    assert result == ""
+
+
+@pytest.mark.asyncio
+async def test_enrich_top_blogs_adds_content_field():
+    """enrich_top_blogs가 각 블로그에 content 필드를 추가."""
+    blogs = [
+        {"title": "테스트", "link": "https://blog.naver.com/user1/111", "bloggername": "유저1", "description": "설명"},
+        {"title": "테스트2", "link": "https://blog.naver.com/user2/222", "bloggername": "유저2", "description": "설명2"},
+    ]
+    with patch("app.web_crawler.crawl_blog_content", new_callable=AsyncMock, return_value="크롤링된 본문"):
+        result = await enrich_top_blogs(blogs)
+    assert len(result) == 2
+    assert result[0]["content"] == "크롤링된 본문"
+    assert result[1]["content"] == "크롤링된 본문"
+
+
+@pytest.mark.asyncio
+async def test_enrich_top_blogs_handles_partial_failure():
+    """일부 크롤링 실패 시에도 나머지는 정상 처리."""
+    blogs = [
+        {"title": "성공", "link": "https://blog.naver.com/user1/111"},
+        {"title": "실패", "link": "https://blog.naver.com/user2/222"},
+    ]
+    side_effects = ["성공 본문", Exception("fail")]
+    with patch("app.web_crawler.crawl_blog_content", new_callable=AsyncMock, side_effect=side_effects):
+        result = await enrich_top_blogs(blogs)
+    assert result[0]["content"] == "성공 본문"
+    assert result[1]["content"] == ""