test(blog-lab): _extract_text 직접 테스트 추가

This commit is contained in:
2026-04-07 00:44:47 +09:00
parent 2603c7ce20
commit 7c7093d67c

View File

@@ -1,7 +1,7 @@
"""web_crawler 모듈 테스트.""" """web_crawler 모듈 테스트."""
import pytest import pytest
from unittest.mock import patch, AsyncMock from unittest.mock import patch, AsyncMock
from app.web_crawler import crawl_blog_content, enrich_top_blogs, _parse_naver_blog_url from app.web_crawler import crawl_blog_content, enrich_top_blogs, _parse_naver_blog_url, _extract_text
def test_parse_naver_blog_url_valid(): def test_parse_naver_blog_url_valid():
@@ -16,6 +16,32 @@ def test_parse_returns_none_for_invalid_url():
assert result is None assert result is None
def test_extract_text_prefers_se_main_container():
    """Prefer the SE3 editor container over the legacy editor container."""
    # Both containers are present; only the SE3 one should be extracted.
    markup = '<div class="se-main-container"><p>SE3 본문</p></div><div id="postViewArea"><p>구 에디터</p></div>'
    extracted = _extract_text(markup)
    assert extracted == "SE3 본문"
def test_extract_text_falls_back_to_post_view_area():
    """Use the legacy editor container when no SE3 container is present."""
    markup = '<div id="postViewArea"><p>구 에디터 본문</p></div>'
    extracted = _extract_text(markup)
    assert extracted == "구 에디터 본문"
def test_extract_text_removes_script_and_style():
    """Strip script/style tag content while keeping the body text."""
    markup = '<div class="se-main-container"><p>본문</p><script>alert(1)</script><style>.x{}</style></div>'
    extracted = _extract_text(markup)
    # Neither the script payload nor the CSS selector may leak into the text.
    for leaked_fragment in ("alert", ".x"):
        assert leaked_fragment not in extracted
    assert "본문" in extracted
def test_extract_text_returns_empty_on_no_container():
    """Return an empty string when there is no container and no body."""
    extracted = _extract_text("")
    assert extracted == ""
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_crawl_returns_empty_on_non_naver_url(): async def test_crawl_returns_empty_on_non_naver_url():
"""네이버 블로그가 아닌 URL은 빈 문자열 반환.""" """네이버 블로그가 아닌 URL은 빈 문자열 반환."""