From 9dd517e82ac3f81c147605476e406e0782dfeeea Mon Sep 17 00:00:00 2001 From: gahusb Date: Tue, 28 Apr 2026 08:28:10 +0900 Subject: [PATCH] feat(realestate-collector): 30-day window + district extraction + completed skip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add _extract_district() helper with DISTRICT_PATTERN regex (서울 only) - collect_all() now passes RCRIT_PBLANC_DE_FROM param (30-day window) to all detail endpoints - collect_all() skips announcements where compute_status() returns '완료' - collect_all() stamps district on each parsed announcement before upsert - upsert_announcement(): add district to INSERT/VALUES/ON CONFLICT UPDATE; data.setdefault('district', None) - ANNOUNCEMENT_COLUMNS: add 'district' (closes deferred gap from Task 2 review) - 9 new tests in realestate-lab/tests/test_collector.py (6 unit + 3 integration) - Full suite: 22 passed Co-Authored-By: Claude Sonnet 4.6 --- realestate-lab/app/collector.py | 48 +++++++-- realestate-lab/app/db.py | 8 +- realestate-lab/tests/test_collector.py | 131 +++++++++++++++++++++++++ 3 files changed, 178 insertions(+), 9 deletions(-) create mode 100644 realestate-lab/tests/test_collector.py diff --git a/realestate-lab/app/collector.py b/realestate-lab/app/collector.py index 7c583e2..a71d502 100644 --- a/realestate-lab/app/collector.py +++ b/realestate-lab/app/collector.py @@ -1,9 +1,11 @@ import os +import re import logging +from datetime import date, timedelta import requests from typing import List, Dict, Any -from .db import upsert_announcement, upsert_model, save_collect_log +from .db import upsert_announcement, upsert_model, save_collect_log, compute_status logger = logging.getLogger("realestate-lab") @@ -19,6 +21,19 @@ DETAIL_ENDPOINTS = [ ("getOPTLttotPblancDetail", "getOPTLttotPblancMdl"), ] +DISTRICT_PATTERN = re.compile(r"(?:서울특별시|서울시|서울)\s+(\S+?(?:구|군))") + + +def _extract_district(parsed: Dict[str, Any]) -> str | None: + """파싱된 공고에서 자치구를 추출. 서울 외 지역·실패 시 None.""" + for src in (parsed.get("address"), parsed.get("region_name")): + if not src: + continue + m = DISTRICT_PATTERN.search(src) + if m: + return m.group(1) + return None + def _api_call(endpoint: str, params: Dict[str, Any] = None) -> List[Dict]: """페이지네이션 처리하여 API 전체 데이터를 반환한다.""" @@ -130,28 +145,49 @@ def _parse_model(raw: Dict[str, Any]) -> Dict[str, Any]: def collect_all() -> Dict[str, Any]: - """모든 엔드포인트를 순회하며 공고 + 모델 데이터를 수집·저장한다.""" + """모든 엔드포인트를 순회하며 공고 + 모델 데이터를 수집·저장한다. + 모집공고일 30일 이전 데이터는 API 파라미터로 사전 좁힘. + status='완료'로 판정되는 응답은 저장하지 않음. + """ if not API_KEY: logger.warning("API 키 미설정 — 수집 중단") save_collect_log(0, 0, "API 키 미설정") return {"new_count": 0, "total_count": 0} + today = date.today() + date_from = (today - timedelta(days=30)).strftime("%Y%m%d") + total_count = 0 new_count = 0 + skipped_completed = 0 for detail_ep, model_ep in DETAIL_ENDPOINTS: - # 공고 상세 수집 - detail_rows = _api_call(detail_ep) + # 공고 상세 수집 — API에 모집공고일 윈도우 파라미터 전달 + # 일부 엔드포인트는 파라미터 미지원일 수 있어 무시되지만 응답에 영향 없음 + detail_rows = _api_call(detail_ep, params={"RCRIT_PBLANC_DE_FROM": date_from}) for raw in detail_rows: try: parsed = _parse_apt_detail(raw) - # 일정 정보가 하나도 없는 공고는 건너뜀 + parsed["district"] = _extract_district(parsed) + + # 일정 정보가 하나도 없는 공고는 건너뜀 (기존) has_dates = any(parsed.get(f) for f in ( "receipt_start", "receipt_end", "spsply_start", "gnrl_rank1_start", "winner_date", "contract_start", )) if not has_dates: continue + + # status='완료'면 저장하지 않음 (자원 절감) + status = compute_status( + parsed.get("receipt_start", "") or "", + parsed.get("receipt_end", "") or "", + parsed.get("winner_date", "") or "", + ) + if status == "완료": + skipped_completed += 1 + continue + _, is_new = upsert_announcement(parsed) total_count += 1 if is_new: @@ -168,5 +204,5 @@ def collect_all() -> Dict[str, Any]: except Exception as e: logger.error("모델 upsert 실패 [%s]: %s", model_ep, e) save_collect_log(new_count, total_count) - logger.info("수집 완료: new=%d, total=%d", new_count, total_count) + logger.info("수집 완료: new=%d, total=%d, skipped_completed=%d", new_count, total_count, skipped_completed) return {"new_count": new_count, "total_count": total_count} diff --git a/realestate-lab/app/db.py b/realestate-lab/app/db.py index dd69bd9..e949065 100644 --- a/realestate-lab/app/db.py +++ b/realestate-lab/app/db.py @@ -194,6 +194,7 @@ def _ann_row_to_dict(r) -> Dict[str, Any]: def upsert_announcement(data: Dict[str, Any]) -> tuple: """공고 upsert — house_manage_no + pblanc_no 기준. Returns (dict, is_new: bool).""" + data.setdefault("district", None) # 수동 등록 등에서 누락 시 안전 처리 status = compute_status( data.get("receipt_start", ""), data.get("receipt_end", ""), @@ -208,7 +209,7 @@ def upsert_announcement(data: Dict[str, Any]) -> tuple: conn.execute(""" INSERT INTO announcements ( house_manage_no, pblanc_no, house_nm, house_secd, house_dtl_secd, - rent_secd, region_code, region_name, address, total_units, + rent_secd, region_code, region_name, district, address, total_units, rcrit_date, receipt_start, receipt_end, spsply_start, spsply_end, gnrl_rank1_start, gnrl_rank1_end, winner_date, contract_start, contract_end, homepage_url, pblanc_url, constructor, developer, @@ -216,7 +217,7 @@ def upsert_announcement(data: Dict[str, Any]) -> tuple: status, source ) VALUES ( :house_manage_no, :pblanc_no, :house_nm, :house_secd, :house_dtl_secd, - :rent_secd, :region_code, :region_name, :address, :total_units, + :rent_secd, :region_code, :region_name, :district, :address, :total_units, :rcrit_date, :receipt_start, :receipt_end, :spsply_start, :spsply_end, :gnrl_rank1_start, :gnrl_rank1_end, :winner_date, :contract_start, :contract_end, :homepage_url, :pblanc_url, :constructor, :developer, @@ -230,6 +231,7 @@ def upsert_announcement(data: Dict[str, Any]) -> tuple: rent_secd=excluded.rent_secd, region_code=excluded.region_code, region_name=excluded.region_name, + district=excluded.district, address=excluded.address, total_units=excluded.total_units, rcrit_date=excluded.rcrit_date, @@ -368,7 +370,7 @@ def create_announcement(data: Dict[str, Any]) -> Dict[str, Any]: ANNOUNCEMENT_COLUMNS = { "house_nm", "house_secd", "house_dtl_secd", "rent_secd", - "region_code", "region_name", "address", "total_units", + "region_code", "region_name", "district", "address", "total_units", "rcrit_date", "receipt_start", "receipt_end", "spsply_start", "spsply_end", "gnrl_rank1_start", "gnrl_rank1_end", "winner_date", "contract_start", "contract_end", "homepage_url", "pblanc_url", diff --git a/realestate-lab/tests/test_collector.py b/realestate-lab/tests/test_collector.py new file mode 100644 index 0000000..fa024ab --- /dev/null +++ b/realestate-lab/tests/test_collector.py @@ -0,0 +1,131 @@ +"""Tests for collector.py — _extract_district unit tests + collect_all integration tests.""" +from datetime import date, timedelta + + +# ── _extract_district unit tests ───────────────────────────────────────────── + +def test_extract_district_seoul_full_address(): + from app.collector import _extract_district + parsed = {"address": "서울특별시 강남구 도곡동 123-45", "region_name": None} + assert _extract_district(parsed) == "강남구" + + +def test_extract_district_seoul_short(): + from app.collector import _extract_district + parsed = {"address": None, "region_name": "서울 송파구"} + assert _extract_district(parsed) == "송파구" + + +def test_extract_district_busan_returns_none(): + from app.collector import _extract_district + parsed = {"address": "부산광역시 해운대구 우동", "region_name": None} + assert _extract_district(parsed) is None + + +def test_extract_district_empty_returns_none(): + from app.collector import _extract_district + parsed = {"address": "", "region_name": ""} + assert _extract_district(parsed) is None + + +def test_extract_district_seoul_county(): + from app.collector import _extract_district + parsed = {"address": "서울 강서구", "region_name": None} + assert _extract_district(parsed) == "강서구" + + +def test_extract_district_prefers_address_over_region(): + from app.collector import _extract_district + parsed = {"address": "서울특별시 마포구 합정동", "region_name": "서울 강남구"} + assert _extract_district(parsed) == "마포구" + + +# ── collect_all integration tests ──────────────────────────────────────────── + +def test_collect_skips_completed_status(monkeypatch): + """winner_date가 과거인 응답은 status='완료'로 판정되어 upsert되지 않는다.""" + from app import collector + from app.db import _conn + + monkeypatch.setenv("DATA_GO_KR_API_KEY", "TEST") + monkeypatch.setattr(collector, "API_KEY", "TEST") + + past_winner = (date.today() - timedelta(days=10)).strftime("%Y-%m-%d") + + fake_detail_rows = [{ + "HOUSE_MANAGE_NO": "DONE-1", + "PBLANC_NO": "01", + "HOUSE_NM": "완료된단지", + "HSSPLY_ADRES": "서울특별시 강남구", + "RCEPT_BGNDE": "2026-01-01", + "RCEPT_ENDDE": "2026-01-05", + "PRZWNER_PRESNATN_DE": past_winner, + }] + + def fake_call(endpoint, params=None): + if "Detail" in endpoint: + return fake_detail_rows + return [] + + monkeypatch.setattr(collector, "_api_call", fake_call) + collector.collect_all() + + with _conn() as conn: + rows = conn.execute("SELECT * FROM announcements WHERE house_manage_no='DONE-1'").fetchall() + assert len(rows) == 0 + + +def test_collect_stores_district_for_seoul_announcement(monkeypatch): + from app import collector + from app.db import _conn + + monkeypatch.setenv("DATA_GO_KR_API_KEY", "TEST") + monkeypatch.setattr(collector, "API_KEY", "TEST") + + future_start = (date.today() + timedelta(days=10)).strftime("%Y-%m-%d") + future_end = (date.today() + timedelta(days=15)).strftime("%Y-%m-%d") + future_winner = (date.today() + timedelta(days=30)).strftime("%Y-%m-%d") + + fake_detail = [{ + "HOUSE_MANAGE_NO": "SEOUL-1", + "PBLANC_NO": "01", + "HOUSE_NM": "강남단지", + "HSSPLY_ADRES": "서울특별시 강남구 도곡동 1", + "RCEPT_BGNDE": future_start, + "RCEPT_ENDDE": future_end, + "PRZWNER_PRESNATN_DE": future_winner, + }] + + def fake_call(endpoint, params=None): + if "Detail" in endpoint: + return fake_detail + return [] + + monkeypatch.setattr(collector, "_api_call", fake_call) + collector.collect_all() + + with _conn() as conn: + row = conn.execute("SELECT district, status FROM announcements WHERE house_manage_no='SEOUL-1'").fetchone() + assert row["district"] == "강남구" + assert row["status"] in ("청약예정", "청약중") + + +def test_collect_passes_date_window_param(monkeypatch): + from app import collector + + monkeypatch.setenv("DATA_GO_KR_API_KEY", "TEST") + monkeypatch.setattr(collector, "API_KEY", "TEST") + + captured_params = [] + + def fake_call(endpoint, params=None): + captured_params.append(params or {}) + return [] + + monkeypatch.setattr(collector, "_api_call", fake_call) + collector.collect_all() + + expected_from = (date.today() - timedelta(days=30)).strftime("%Y%m%d") + detail_calls = [p for p in captured_params if "RCRIT_PBLANC_DE_FROM" in p] + assert detail_calls, "detail 엔드포인트 호출에 윈도우 파라미터가 없음" + assert detail_calls[0]["RCRIT_PBLANC_DE_FROM"] == expected_from