# File: web-page-backend/blog-lab/tests/test_web_crawler.py
# 95 lines, 3.9 KiB, Python
"""web_crawler 모듈 테스트."""
import pytest
from unittest.mock import patch, AsyncMock
from app.web_crawler import crawl_blog_content, enrich_top_blogs, _parse_naver_blog_url, _extract_text
def test_parse_naver_blog_url_valid():
    """A blog.naver.com URL is parsed into its (blogId, logNo) pair."""
    url = "https://blog.naver.com/testuser/123456"
    expected = ("testuser", "123456")
    assert _parse_naver_blog_url(url) == expected
def test_parse_returns_none_for_invalid_url():
    """An invalid (non-Naver-blog) URL makes the parser return None."""
    invalid_url = "https://example.com/post"
    assert _parse_naver_blog_url(invalid_url) is None
def test_extract_text_prefers_se_main_container():
    """The SE3 editor container wins when both containers are present."""
    # Markup holds both the SE3 container and the legacy postViewArea one.
    markup = '<div class="se-main-container"><p>SE3 본문</p></div><div id="postViewArea"><p>구 에디터</p></div>'
    extracted = _extract_text(markup)
    assert extracted == "SE3 본문"
def test_extract_text_falls_back_to_post_view_area():
    """Without an SE3 container, the legacy editor container is used."""
    markup = '<div id="postViewArea"><p>구 에디터 본문</p></div>'
    extracted = _extract_text(markup)
    assert extracted == "구 에디터 본문"
def test_extract_text_removes_script_and_style():
    """Script and style tag contents do not appear in the extracted text."""
    markup = '<div class="se-main-container"><p>본문</p><script>alert(1)</script><style>.x{}</style></div>'
    extracted = _extract_text(markup)
    # Fragments that only occur inside the <script> / <style> elements.
    for leaked_fragment in ("alert", ".x"):
        assert leaked_fragment not in extracted
    assert "본문" in extracted
def test_extract_text_returns_empty_on_no_container():
    """With no container and no body, the result is an empty string."""
    extracted = _extract_text("")
    assert extracted == ""
@pytest.mark.asyncio
async def test_crawl_returns_empty_on_non_naver_url():
    """A URL that is not a Naver blog yields an empty string."""
    non_naver_url = "https://example.com/post"
    assert await crawl_blog_content(non_naver_url) == ""
@pytest.mark.asyncio
async def test_crawl_truncates_to_2000_chars():
    """Body text longer than 2000 characters is cut down to at most 2000."""
    # The filler must be a non-empty character: `"" * 3000` is still "", which
    # would let this test pass without truncation ever being exercised.
    oversized_body = "가" * 3000
    long_html = f'<div class="se-main-container"><p>{oversized_body}</p></div>'
    # Replace the HTML fetch with an AsyncMock returning the oversized page.
    with patch("app.web_crawler._fetch_html", new_callable=AsyncMock, return_value=long_html):
        result = await crawl_blog_content("https://blog.naver.com/testuser/123")
    assert len(result) <= 2000
@pytest.mark.asyncio
async def test_crawl_returns_empty_on_fetch_failure():
    """A failing HTTP fetch results in an empty string."""
    fetch_error = Exception("timeout")
    # The patched fetch raises fetch_error when awaited.
    failing_fetch = patch(
        "app.web_crawler._fetch_html",
        new_callable=AsyncMock,
        side_effect=fetch_error,
    )
    with failing_fetch:
        content = await crawl_blog_content("https://blog.naver.com/testuser/123")
    assert content == ""
@pytest.mark.asyncio
async def test_enrich_top_blogs_adds_content_field():
    """enrich_top_blogs attaches a content field to every blog entry."""
    crawled_text = "크롤링된 본문"
    blog_entries = [
        {
            "title": "테스트",
            "link": "https://blog.naver.com/user1/111",
            "bloggername": "유저1",
            "description": "설명",
        },
        {
            "title": "테스트2",
            "link": "https://blog.naver.com/user2/222",
            "bloggername": "유저2",
            "description": "설명2",
        },
    ]
    # Every crawl call is replaced by an AsyncMock returning the same text.
    stubbed_crawl = patch(
        "app.web_crawler.crawl_blog_content",
        new_callable=AsyncMock,
        return_value=crawled_text,
    )
    with stubbed_crawl:
        enriched = await enrich_top_blogs(blog_entries)
    assert len(enriched) == 2
    for position in range(2):
        assert enriched[position]["content"] == crawled_text
@pytest.mark.asyncio
async def test_enrich_top_blogs_handles_partial_failure():
    """When some crawls fail, the remaining entries are still processed."""
    blog_entries = [
        {"title": "성공", "link": "https://blog.naver.com/user1/111"},
        {"title": "실패", "link": "https://blog.naver.com/user2/222"},
    ]
    # First crawl call returns text; the second raises.
    crawl_outcomes = ["성공 본문", Exception("fail")]
    flaky_crawl = patch(
        "app.web_crawler.crawl_blog_content",
        new_callable=AsyncMock,
        side_effect=crawl_outcomes,
    )
    with flaky_crawl:
        enriched = await enrich_top_blogs(blog_entries)
    succeeded = enriched[0]
    assert succeeded["content"] == "성공 본문"
    failed = enriched[1]
    assert failed["content"] == ""