From 7c7093d67c56ab4527b1ae671458239cf8084392 Mon Sep 17 00:00:00 2001 From: gahusb Date: Tue, 7 Apr 2026 00:44:47 +0900 Subject: [PATCH] =?UTF-8?q?test(blog-lab):=20=5Fextract=5Ftext=20=EC=A7=81?= =?UTF-8?q?=EC=A0=91=20=ED=85=8C=EC=8A=A4=ED=8A=B8=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- blog-lab/tests/test_web_crawler.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/blog-lab/tests/test_web_crawler.py b/blog-lab/tests/test_web_crawler.py index f54d157..617c2d6 100644 --- a/blog-lab/tests/test_web_crawler.py +++ b/blog-lab/tests/test_web_crawler.py @@ -1,7 +1,7 @@ """web_crawler 모듈 테스트.""" import pytest from unittest.mock import patch, AsyncMock -from app.web_crawler import crawl_blog_content, enrich_top_blogs, _parse_naver_blog_url +from app.web_crawler import crawl_blog_content, enrich_top_blogs, _parse_naver_blog_url, _extract_text def test_parse_naver_blog_url_valid(): @@ -16,6 +16,32 @@ def test_parse_returns_none_for_invalid_url(): assert result is None +def test_extract_text_prefers_se_main_container(): + """SE3 에디터 컨테이너를 우선 선택.""" + html = '

SE3 본문

구 에디터

' + assert _extract_text(html) == "SE3 본문" + + +def test_extract_text_falls_back_to_post_view_area(): + """SE3 없으면 구 에디터 컨테이너 사용.""" + html = '

구 에디터 본문

' + assert _extract_text(html) == "구 에디터 본문" + + +def test_extract_text_removes_script_and_style(): + """스크립트/스타일 태그 제거.""" + html = '

본문

' + result = _extract_text(html) + assert "alert" not in result + assert ".x" not in result + assert "본문" in result + + +def test_extract_text_returns_empty_on_no_container(): + """컨테이너가 없고 body도 없으면 빈 문자열.""" + assert _extract_text("") == "" + + @pytest.mark.asyncio async def test_crawl_returns_empty_on_non_naver_url(): """네이버 블로그가 아닌 URL은 빈 문자열 반환."""