"""web_crawler 모듈 테스트.""" import pytest from unittest.mock import patch, AsyncMock from app.web_crawler import crawl_blog_content, enrich_top_blogs, _parse_naver_blog_url, _extract_text def test_parse_naver_blog_url_valid(): """blog.naver.com URL에서 blogId와 logNo를 올바르게 파싱.""" result = _parse_naver_blog_url("https://blog.naver.com/testuser/123456") assert result == ("testuser", "123456") def test_parse_returns_none_for_invalid_url(): """잘못된 URL은 None 반환.""" result = _parse_naver_blog_url("https://example.com/post") assert result is None def test_extract_text_prefers_se_main_container(): """SE3 에디터 컨테이너를 우선 선택.""" html = '

SE3 본문

구 에디터

' assert _extract_text(html) == "SE3 본문" def test_extract_text_falls_back_to_post_view_area(): """SE3 없으면 구 에디터 컨테이너 사용.""" html = '

구 에디터 본문

' assert _extract_text(html) == "구 에디터 본문" def test_extract_text_removes_script_and_style(): """스크립트/스타일 태그 제거.""" html = '

본문

' result = _extract_text(html) assert "alert" not in result assert ".x" not in result assert "본문" in result def test_extract_text_returns_empty_on_no_container(): """컨테이너가 없고 body도 없으면 빈 문자열.""" assert _extract_text("") == "" @pytest.mark.asyncio async def test_crawl_returns_empty_on_non_naver_url(): """네이버 블로그가 아닌 URL은 빈 문자열 반환.""" result = await crawl_blog_content("https://example.com/post") assert result == "" @pytest.mark.asyncio async def test_crawl_truncates_to_2000_chars(): """본문이 2000자를 초과하면 잘라낸다.""" long_html = f'

{"가" * 3000}

' with patch("app.web_crawler._fetch_html", new_callable=AsyncMock, return_value=long_html): result = await crawl_blog_content("https://blog.naver.com/testuser/123") assert len(result) <= 2000 @pytest.mark.asyncio async def test_crawl_returns_empty_on_fetch_failure(): """HTTP 요청 실패 시 빈 문자열 반환.""" with patch("app.web_crawler._fetch_html", new_callable=AsyncMock, side_effect=Exception("timeout")): result = await crawl_blog_content("https://blog.naver.com/testuser/123") assert result == "" @pytest.mark.asyncio async def test_enrich_top_blogs_adds_content_field(): """enrich_top_blogs가 각 블로그에 content 필드를 추가.""" blogs = [ {"title": "테스트", "link": "https://blog.naver.com/user1/111", "bloggername": "유저1", "description": "설명"}, {"title": "테스트2", "link": "https://blog.naver.com/user2/222", "bloggername": "유저2", "description": "설명2"}, ] with patch("app.web_crawler.crawl_blog_content", new_callable=AsyncMock, return_value="크롤링된 본문"): result = await enrich_top_blogs(blogs) assert len(result) == 2 assert result[0]["content"] == "크롤링된 본문" assert result[1]["content"] == "크롤링된 본문" @pytest.mark.asyncio async def test_enrich_top_blogs_handles_partial_failure(): """일부 크롤링 실패 시에도 나머지는 정상 처리.""" blogs = [ {"title": "성공", "link": "https://blog.naver.com/user1/111"}, {"title": "실패", "link": "https://blog.naver.com/user2/222"}, ] side_effects = ["성공 본문", Exception("fail")] with patch("app.web_crawler.crawl_blog_content", new_callable=AsyncMock, side_effect=side_effects): result = await enrich_top_blogs(blogs) assert result[0]["content"] == "성공 본문" assert result[1]["content"] == ""