feat(realestate-collector): 30-day window + district extraction + completed skip

- Add _extract_district() helper with DISTRICT_PATTERN regex (서울 only)
- collect_all() now passes RCRIT_PBLANC_DE_FROM param (30-day window) to all detail endpoints
- collect_all() skips announcements where compute_status() returns '완료'
- collect_all() stamps district on each parsed announcement before upsert
- upsert_announcement(): add district to INSERT/VALUES/ON CONFLICT UPDATE; data.setdefault('district', None)
- ANNOUNCEMENT_COLUMNS: add 'district' (closes deferred gap from Task 2 review)
- 9 new tests in realestate-lab/tests/test_collector.py (6 unit + 3 integration)
- Full suite: 22 passed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-28 08:28:10 +09:00
parent 496e3a6a73
commit 9dd517e82a
3 changed files with 178 additions and 9 deletions

View File

@@ -1,9 +1,11 @@
import os import os
import re
import logging import logging
from datetime import date, timedelta
import requests import requests
from typing import List, Dict, Any from typing import List, Dict, Any
from .db import upsert_announcement, upsert_model, save_collect_log from .db import upsert_announcement, upsert_model, save_collect_log, compute_status
logger = logging.getLogger("realestate-lab") logger = logging.getLogger("realestate-lab")
@@ -19,6 +21,19 @@ DETAIL_ENDPOINTS = [
("getOPTLttotPblancDetail", "getOPTLttotPblancMdl"), ("getOPTLttotPblancDetail", "getOPTLttotPblancMdl"),
] ]
DISTRICT_PATTERN = re.compile(r"(?:서울특별시|서울시|서울)\s+(\S+?(?:구|군))")
def _extract_district(parsed: Dict[str, Any]) -> str | None:
"""파싱된 공고에서 자치구를 추출. 서울 외 지역·실패 시 None."""
for src in (parsed.get("address"), parsed.get("region_name")):
if not src:
continue
m = DISTRICT_PATTERN.search(src)
if m:
return m.group(1)
return None
def _api_call(endpoint: str, params: Dict[str, Any] = None) -> List[Dict]: def _api_call(endpoint: str, params: Dict[str, Any] = None) -> List[Dict]:
"""페이지네이션 처리하여 API 전체 데이터를 반환한다.""" """페이지네이션 처리하여 API 전체 데이터를 반환한다."""
@@ -130,28 +145,49 @@ def _parse_model(raw: Dict[str, Any]) -> Dict[str, Any]:
def collect_all() -> Dict[str, Any]: def collect_all() -> Dict[str, Any]:
"""모든 엔드포인트를 순회하며 공고 + 모델 데이터를 수집·저장한다.""" """모든 엔드포인트를 순회하며 공고 + 모델 데이터를 수집·저장한다.
모집공고일 30일 이전 데이터는 API 파라미터로 사전 좁힘.
status='완료'로 판정되는 응답은 저장하지 않음.
"""
if not API_KEY: if not API_KEY:
logger.warning("API 키 미설정 — 수집 중단") logger.warning("API 키 미설정 — 수집 중단")
save_collect_log(0, 0, "API 키 미설정") save_collect_log(0, 0, "API 키 미설정")
return {"new_count": 0, "total_count": 0} return {"new_count": 0, "total_count": 0}
today = date.today()
date_from = (today - timedelta(days=30)).strftime("%Y%m%d")
total_count = 0 total_count = 0
new_count = 0 new_count = 0
skipped_completed = 0
for detail_ep, model_ep in DETAIL_ENDPOINTS: for detail_ep, model_ep in DETAIL_ENDPOINTS:
# 공고 상세 수집 # 공고 상세 수집 — API에 모집공고일 윈도우 파라미터 전달
detail_rows = _api_call(detail_ep) # 일부 엔드포인트는 파라미터 미지원일 수 있어 무시되지만 응답에 영향 없음
detail_rows = _api_call(detail_ep, params={"RCRIT_PBLANC_DE_FROM": date_from})
for raw in detail_rows: for raw in detail_rows:
try: try:
parsed = _parse_apt_detail(raw) parsed = _parse_apt_detail(raw)
# 일정 정보가 하나도 없는 공고는 건너뜀 parsed["district"] = _extract_district(parsed)
# 일정 정보가 하나도 없는 공고는 건너뜀 (기존)
has_dates = any(parsed.get(f) for f in ( has_dates = any(parsed.get(f) for f in (
"receipt_start", "receipt_end", "spsply_start", "receipt_start", "receipt_end", "spsply_start",
"gnrl_rank1_start", "winner_date", "contract_start", "gnrl_rank1_start", "winner_date", "contract_start",
)) ))
if not has_dates: if not has_dates:
continue continue
# status='완료'면 저장하지 않음 (자원 절감)
status = compute_status(
parsed.get("receipt_start", "") or "",
parsed.get("receipt_end", "") or "",
parsed.get("winner_date", "") or "",
)
if status == "완료":
skipped_completed += 1
continue
_, is_new = upsert_announcement(parsed) _, is_new = upsert_announcement(parsed)
total_count += 1 total_count += 1
if is_new: if is_new:
@@ -168,5 +204,5 @@ def collect_all() -> Dict[str, Any]:
except Exception as e: except Exception as e:
logger.error("모델 upsert 실패 [%s]: %s", model_ep, e) logger.error("모델 upsert 실패 [%s]: %s", model_ep, e)
save_collect_log(new_count, total_count) save_collect_log(new_count, total_count)
logger.info("수집 완료: new=%d, total=%d", new_count, total_count) logger.info("수집 완료: new=%d, total=%d, skipped_completed=%d", new_count, total_count, skipped_completed)
return {"new_count": new_count, "total_count": total_count} return {"new_count": new_count, "total_count": total_count}

View File

@@ -194,6 +194,7 @@ def _ann_row_to_dict(r) -> Dict[str, Any]:
def upsert_announcement(data: Dict[str, Any]) -> tuple: def upsert_announcement(data: Dict[str, Any]) -> tuple:
"""공고 upsert — house_manage_no + pblanc_no 기준. Returns (dict, is_new: bool).""" """공고 upsert — house_manage_no + pblanc_no 기준. Returns (dict, is_new: bool)."""
data.setdefault("district", None) # 수동 등록 등에서 누락 시 안전 처리
status = compute_status( status = compute_status(
data.get("receipt_start", ""), data.get("receipt_start", ""),
data.get("receipt_end", ""), data.get("receipt_end", ""),
@@ -208,7 +209,7 @@ def upsert_announcement(data: Dict[str, Any]) -> tuple:
conn.execute(""" conn.execute("""
INSERT INTO announcements ( INSERT INTO announcements (
house_manage_no, pblanc_no, house_nm, house_secd, house_dtl_secd, house_manage_no, pblanc_no, house_nm, house_secd, house_dtl_secd,
rent_secd, region_code, region_name, address, total_units, rent_secd, region_code, region_name, district, address, total_units,
rcrit_date, receipt_start, receipt_end, spsply_start, spsply_end, rcrit_date, receipt_start, receipt_end, spsply_start, spsply_end,
gnrl_rank1_start, gnrl_rank1_end, winner_date, contract_start, gnrl_rank1_start, gnrl_rank1_end, winner_date, contract_start,
contract_end, homepage_url, pblanc_url, constructor, developer, contract_end, homepage_url, pblanc_url, constructor, developer,
@@ -216,7 +217,7 @@ def upsert_announcement(data: Dict[str, Any]) -> tuple:
status, source status, source
) VALUES ( ) VALUES (
:house_manage_no, :pblanc_no, :house_nm, :house_secd, :house_dtl_secd, :house_manage_no, :pblanc_no, :house_nm, :house_secd, :house_dtl_secd,
:rent_secd, :region_code, :region_name, :address, :total_units, :rent_secd, :region_code, :region_name, :district, :address, :total_units,
:rcrit_date, :receipt_start, :receipt_end, :spsply_start, :spsply_end, :rcrit_date, :receipt_start, :receipt_end, :spsply_start, :spsply_end,
:gnrl_rank1_start, :gnrl_rank1_end, :winner_date, :contract_start, :gnrl_rank1_start, :gnrl_rank1_end, :winner_date, :contract_start,
:contract_end, :homepage_url, :pblanc_url, :constructor, :developer, :contract_end, :homepage_url, :pblanc_url, :constructor, :developer,
@@ -230,6 +231,7 @@ def upsert_announcement(data: Dict[str, Any]) -> tuple:
rent_secd=excluded.rent_secd, rent_secd=excluded.rent_secd,
region_code=excluded.region_code, region_code=excluded.region_code,
region_name=excluded.region_name, region_name=excluded.region_name,
district=excluded.district,
address=excluded.address, address=excluded.address,
total_units=excluded.total_units, total_units=excluded.total_units,
rcrit_date=excluded.rcrit_date, rcrit_date=excluded.rcrit_date,
@@ -368,7 +370,7 @@ def create_announcement(data: Dict[str, Any]) -> Dict[str, Any]:
ANNOUNCEMENT_COLUMNS = { ANNOUNCEMENT_COLUMNS = {
"house_nm", "house_secd", "house_dtl_secd", "rent_secd", "house_nm", "house_secd", "house_dtl_secd", "rent_secd",
"region_code", "region_name", "address", "total_units", "region_code", "region_name", "district", "address", "total_units",
"rcrit_date", "receipt_start", "receipt_end", "spsply_start", "spsply_end", "rcrit_date", "receipt_start", "receipt_end", "spsply_start", "spsply_end",
"gnrl_rank1_start", "gnrl_rank1_end", "winner_date", "gnrl_rank1_start", "gnrl_rank1_end", "winner_date",
"contract_start", "contract_end", "homepage_url", "pblanc_url", "contract_start", "contract_end", "homepage_url", "pblanc_url",

View File

@@ -0,0 +1,131 @@
"""Tests for collector.py — _extract_district unit tests + collect_all integration tests."""
from datetime import date, timedelta
# ── _extract_district unit tests ─────────────────────────────────────────────
def test_extract_district_seoul_full_address():
    """A full '서울특별시 ...' street address yields its district."""
    from app.collector import _extract_district

    announcement = {
        "address": "서울특별시 강남구 도곡동 123-45",
        "region_name": None,
    }
    district = _extract_district(announcement)
    assert district == "강남구"
def test_extract_district_seoul_short():
    """With no address, the short '서울 ...' region_name is used instead."""
    from app.collector import _extract_district

    announcement = {
        "address": None,
        "region_name": "서울 송파구",
    }
    district = _extract_district(announcement)
    assert district == "송파구"
def test_extract_district_busan_returns_none():
    """An address outside Seoul produces no district."""
    from app.collector import _extract_district

    announcement = {
        "address": "부산광역시 해운대구 우동",
        "region_name": None,
    }
    district = _extract_district(announcement)
    assert district is None
def test_extract_district_empty_returns_none():
    """Empty strings in both source fields produce no district."""
    from app.collector import _extract_district

    announcement = {
        "address": "",
        "region_name": "",
    }
    district = _extract_district(announcement)
    assert district is None
def test_extract_district_seoul_county():
    """Both suffixes accepted by the pattern, 구 and 군, are extracted."""
    from app.collector import _extract_district

    parsed = {"address": "서울 강서구", "region_name": None}
    assert _extract_district(parsed) == "강서구"

    # Synthetic 군 name: the test name promises county coverage, and without
    # this case no test in this file exercises the pattern's 군 alternative.
    county = {"address": "서울 가상군", "region_name": None}
    assert _extract_district(county) == "가상군"
def test_extract_district_prefers_address_over_region():
    """When both fields match, the district from ``address`` wins."""
    from app.collector import _extract_district

    announcement = {
        "address": "서울특별시 마포구 합정동",
        "region_name": "서울 강남구",
    }
    district = _extract_district(announcement)
    assert district == "마포구"
# ── collect_all integration tests ────────────────────────────────────────────
def test_collect_skips_completed_status(monkeypatch):
    """A response whose winner_date is in the past is judged '완료' and is not upserted."""
    from app import collector
    from app.db import _conn

    monkeypatch.setenv("DATA_GO_KR_API_KEY", "TEST")
    monkeypatch.setattr(collector, "API_KEY", "TEST")

    ten_days_ago = date.today() - timedelta(days=10)
    finished_rows = [
        {
            "HOUSE_MANAGE_NO": "DONE-1",
            "PBLANC_NO": "01",
            "HOUSE_NM": "완료된단지",
            "HSSPLY_ADRES": "서울특별시 강남구",
            "RCEPT_BGNDE": "2026-01-01",
            "RCEPT_ENDDE": "2026-01-05",
            "PRZWNER_PRESNATN_DE": ten_days_ago.strftime("%Y-%m-%d"),
        },
    ]

    def stub_api_call(endpoint, params=None):
        # Only detail endpoints return data; every other endpoint is empty.
        return finished_rows if "Detail" in endpoint else []

    monkeypatch.setattr(collector, "_api_call", stub_api_call)

    collector.collect_all()

    query = "SELECT * FROM announcements WHERE house_manage_no='DONE-1'"
    with _conn() as conn:
        stored = conn.execute(query).fetchall()
    assert len(stored) == 0
def test_collect_stores_district_for_seoul_announcement(monkeypatch):
    """An upcoming Seoul announcement is stored together with its extracted district."""
    from app import collector
    from app.db import _conn

    monkeypatch.setenv("DATA_GO_KR_API_KEY", "TEST")
    monkeypatch.setattr(collector, "API_KEY", "TEST")

    # All schedule dates lie in the future so the row is not skipped as completed.
    future_start = (date.today() + timedelta(days=10)).strftime("%Y-%m-%d")
    future_end = (date.today() + timedelta(days=15)).strftime("%Y-%m-%d")
    future_winner = (date.today() + timedelta(days=30)).strftime("%Y-%m-%d")

    fake_detail = [{
        "HOUSE_MANAGE_NO": "SEOUL-1",
        "PBLANC_NO": "01",
        "HOUSE_NM": "강남단지",
        "HSSPLY_ADRES": "서울특별시 강남구 도곡동 1",
        "RCEPT_BGNDE": future_start,
        "RCEPT_ENDDE": future_end,
        "PRZWNER_PRESNATN_DE": future_winner,
    }]

    def fake_call(endpoint, params=None):
        # Only detail endpoints return data; every other endpoint is empty.
        if "Detail" in endpoint:
            return fake_detail
        return []

    monkeypatch.setattr(collector, "_api_call", fake_call)

    collector.collect_all()

    with _conn() as conn:
        row = conn.execute(
            "SELECT district, status FROM announcements WHERE house_manage_no='SEOUL-1'"
        ).fetchone()

    # fetchone() returns None when nothing was stored; fail with a clear
    # message instead of a TypeError on the subscript below.
    assert row is not None, "SEOUL-1 announcement was not stored"
    assert row["district"] == "강남구"
    assert row["status"] in ("청약예정", "청약중")
def test_collect_passes_date_window_param(monkeypatch):
    """Every detail endpoint call carries the 30-day RCRIT_PBLANC_DE_FROM window."""
    from app import collector

    monkeypatch.setenv("DATA_GO_KR_API_KEY", "TEST")
    monkeypatch.setattr(collector, "API_KEY", "TEST")

    def window_start():
        return (date.today() - timedelta(days=30)).strftime("%Y%m%d")

    captured_calls = []

    def fake_call(endpoint, params=None):
        captured_calls.append((endpoint, params or {}))
        return []

    monkeypatch.setattr(collector, "_api_call", fake_call)

    # Sample the expected value on both sides of the call so that a run
    # straddling midnight cannot make the test flaky.
    acceptable = {window_start()}
    collector.collect_all()
    acceptable.add(window_start())

    detail_endpoints = [detail_ep for detail_ep, _ in collector.DETAIL_ENDPOINTS]
    assert detail_endpoints, "DETAIL_ENDPOINTS is empty"

    # Check each detail endpoint, not just the first call that happens to
    # carry the parameter.
    for detail_ep in detail_endpoints:
        calls = [params for endpoint, params in captured_calls if endpoint == detail_ep]
        assert calls, f"detail 엔드포인트가 호출되지 않음: {detail_ep}"
        for params in calls:
            assert "RCRIT_PBLANC_DE_FROM" in params, "detail 엔드포인트 호출에 윈도우 파라미터가 없음"
            assert params["RCRIT_PBLANC_DE_FROM"] in acceptable