feat(realestate-collector): 30-day window + district extraction + completed skip
- Add _extract_district() helper with DISTRICT_PATTERN regex (서울 only)
- collect_all() now passes RCRIT_PBLANC_DE_FROM param (30-day window) to all detail endpoints
- collect_all() skips announcements where compute_status() returns '완료'
- collect_all() stamps district on each parsed announcement before upsert
- upsert_announcement(): add district to INSERT/VALUES/ON CONFLICT UPDATE; data.setdefault('district', None)
- ANNOUNCEMENT_COLUMNS: add 'district' (closes deferred gap from Task 2 review)
- 9 new tests in realestate-lab/tests/test_collector.py (6 unit + 3 integration)
- Full suite: 22 passed
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,9 +1,11 @@
|
|||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import logging
|
import logging
|
||||||
|
from datetime import date, timedelta
|
||||||
import requests
|
import requests
|
||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
from .db import upsert_announcement, upsert_model, save_collect_log
|
from .db import upsert_announcement, upsert_model, save_collect_log, compute_status
|
||||||
|
|
||||||
logger = logging.getLogger("realestate-lab")
|
logger = logging.getLogger("realestate-lab")
|
||||||
|
|
||||||
@@ -19,6 +21,19 @@ DETAIL_ENDPOINTS = [
|
|||||||
("getOPTLttotPblancDetail", "getOPTLttotPblancMdl"),
|
("getOPTLttotPblancDetail", "getOPTLttotPblancMdl"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
DISTRICT_PATTERN = re.compile(r"(?:서울특별시|서울시|서울)\s+(\S+?(?:구|군))")
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_district(parsed: Dict[str, Any]) -> str | None:
|
||||||
|
"""파싱된 공고에서 자치구를 추출. 서울 외 지역·실패 시 None."""
|
||||||
|
for src in (parsed.get("address"), parsed.get("region_name")):
|
||||||
|
if not src:
|
||||||
|
continue
|
||||||
|
m = DISTRICT_PATTERN.search(src)
|
||||||
|
if m:
|
||||||
|
return m.group(1)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _api_call(endpoint: str, params: Dict[str, Any] = None) -> List[Dict]:
|
def _api_call(endpoint: str, params: Dict[str, Any] = None) -> List[Dict]:
|
||||||
"""페이지네이션 처리하여 API 전체 데이터를 반환한다."""
|
"""페이지네이션 처리하여 API 전체 데이터를 반환한다."""
|
||||||
@@ -130,28 +145,49 @@ def _parse_model(raw: Dict[str, Any]) -> Dict[str, Any]:
|
|||||||
|
|
||||||
|
|
||||||
def collect_all() -> Dict[str, Any]:
|
def collect_all() -> Dict[str, Any]:
|
||||||
"""모든 엔드포인트를 순회하며 공고 + 모델 데이터를 수집·저장한다."""
|
"""모든 엔드포인트를 순회하며 공고 + 모델 데이터를 수집·저장한다.
|
||||||
|
모집공고일 30일 이전 데이터는 API 파라미터로 사전 좁힘.
|
||||||
|
status='완료'로 판정되는 응답은 저장하지 않음.
|
||||||
|
"""
|
||||||
if not API_KEY:
|
if not API_KEY:
|
||||||
logger.warning("API 키 미설정 — 수집 중단")
|
logger.warning("API 키 미설정 — 수집 중단")
|
||||||
save_collect_log(0, 0, "API 키 미설정")
|
save_collect_log(0, 0, "API 키 미설정")
|
||||||
return {"new_count": 0, "total_count": 0}
|
return {"new_count": 0, "total_count": 0}
|
||||||
|
|
||||||
|
today = date.today()
|
||||||
|
date_from = (today - timedelta(days=30)).strftime("%Y%m%d")
|
||||||
|
|
||||||
total_count = 0
|
total_count = 0
|
||||||
new_count = 0
|
new_count = 0
|
||||||
|
skipped_completed = 0
|
||||||
|
|
||||||
for detail_ep, model_ep in DETAIL_ENDPOINTS:
|
for detail_ep, model_ep in DETAIL_ENDPOINTS:
|
||||||
# 공고 상세 수집
|
# 공고 상세 수집 — API에 모집공고일 윈도우 파라미터 전달
|
||||||
detail_rows = _api_call(detail_ep)
|
# 일부 엔드포인트는 파라미터 미지원일 수 있어 무시되지만 응답에 영향 없음
|
||||||
|
detail_rows = _api_call(detail_ep, params={"RCRIT_PBLANC_DE_FROM": date_from})
|
||||||
for raw in detail_rows:
|
for raw in detail_rows:
|
||||||
try:
|
try:
|
||||||
parsed = _parse_apt_detail(raw)
|
parsed = _parse_apt_detail(raw)
|
||||||
# 일정 정보가 하나도 없는 공고는 건너뜀
|
parsed["district"] = _extract_district(parsed)
|
||||||
|
|
||||||
|
# 일정 정보가 하나도 없는 공고는 건너뜀 (기존)
|
||||||
has_dates = any(parsed.get(f) for f in (
|
has_dates = any(parsed.get(f) for f in (
|
||||||
"receipt_start", "receipt_end", "spsply_start",
|
"receipt_start", "receipt_end", "spsply_start",
|
||||||
"gnrl_rank1_start", "winner_date", "contract_start",
|
"gnrl_rank1_start", "winner_date", "contract_start",
|
||||||
))
|
))
|
||||||
if not has_dates:
|
if not has_dates:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# status='완료'면 저장하지 않음 (자원 절감)
|
||||||
|
status = compute_status(
|
||||||
|
parsed.get("receipt_start", "") or "",
|
||||||
|
parsed.get("receipt_end", "") or "",
|
||||||
|
parsed.get("winner_date", "") or "",
|
||||||
|
)
|
||||||
|
if status == "완료":
|
||||||
|
skipped_completed += 1
|
||||||
|
continue
|
||||||
|
|
||||||
_, is_new = upsert_announcement(parsed)
|
_, is_new = upsert_announcement(parsed)
|
||||||
total_count += 1
|
total_count += 1
|
||||||
if is_new:
|
if is_new:
|
||||||
@@ -168,5 +204,5 @@ def collect_all() -> Dict[str, Any]:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("모델 upsert 실패 [%s]: %s", model_ep, e)
|
logger.error("모델 upsert 실패 [%s]: %s", model_ep, e)
|
||||||
save_collect_log(new_count, total_count)
|
save_collect_log(new_count, total_count)
|
||||||
logger.info("수집 완료: new=%d, total=%d", new_count, total_count)
|
logger.info("수집 완료: new=%d, total=%d, skipped_completed=%d", new_count, total_count, skipped_completed)
|
||||||
return {"new_count": new_count, "total_count": total_count}
|
return {"new_count": new_count, "total_count": total_count}
|
||||||
|
|||||||
@@ -194,6 +194,7 @@ def _ann_row_to_dict(r) -> Dict[str, Any]:
|
|||||||
|
|
||||||
def upsert_announcement(data: Dict[str, Any]) -> tuple:
|
def upsert_announcement(data: Dict[str, Any]) -> tuple:
|
||||||
"""공고 upsert — house_manage_no + pblanc_no 기준. Returns (dict, is_new: bool)."""
|
"""공고 upsert — house_manage_no + pblanc_no 기준. Returns (dict, is_new: bool)."""
|
||||||
|
data.setdefault("district", None) # 수동 등록 등에서 누락 시 안전 처리
|
||||||
status = compute_status(
|
status = compute_status(
|
||||||
data.get("receipt_start", ""),
|
data.get("receipt_start", ""),
|
||||||
data.get("receipt_end", ""),
|
data.get("receipt_end", ""),
|
||||||
@@ -208,7 +209,7 @@ def upsert_announcement(data: Dict[str, Any]) -> tuple:
|
|||||||
conn.execute("""
|
conn.execute("""
|
||||||
INSERT INTO announcements (
|
INSERT INTO announcements (
|
||||||
house_manage_no, pblanc_no, house_nm, house_secd, house_dtl_secd,
|
house_manage_no, pblanc_no, house_nm, house_secd, house_dtl_secd,
|
||||||
rent_secd, region_code, region_name, address, total_units,
|
rent_secd, region_code, region_name, district, address, total_units,
|
||||||
rcrit_date, receipt_start, receipt_end, spsply_start, spsply_end,
|
rcrit_date, receipt_start, receipt_end, spsply_start, spsply_end,
|
||||||
gnrl_rank1_start, gnrl_rank1_end, winner_date, contract_start,
|
gnrl_rank1_start, gnrl_rank1_end, winner_date, contract_start,
|
||||||
contract_end, homepage_url, pblanc_url, constructor, developer,
|
contract_end, homepage_url, pblanc_url, constructor, developer,
|
||||||
@@ -216,7 +217,7 @@ def upsert_announcement(data: Dict[str, Any]) -> tuple:
|
|||||||
status, source
|
status, source
|
||||||
) VALUES (
|
) VALUES (
|
||||||
:house_manage_no, :pblanc_no, :house_nm, :house_secd, :house_dtl_secd,
|
:house_manage_no, :pblanc_no, :house_nm, :house_secd, :house_dtl_secd,
|
||||||
:rent_secd, :region_code, :region_name, :address, :total_units,
|
:rent_secd, :region_code, :region_name, :district, :address, :total_units,
|
||||||
:rcrit_date, :receipt_start, :receipt_end, :spsply_start, :spsply_end,
|
:rcrit_date, :receipt_start, :receipt_end, :spsply_start, :spsply_end,
|
||||||
:gnrl_rank1_start, :gnrl_rank1_end, :winner_date, :contract_start,
|
:gnrl_rank1_start, :gnrl_rank1_end, :winner_date, :contract_start,
|
||||||
:contract_end, :homepage_url, :pblanc_url, :constructor, :developer,
|
:contract_end, :homepage_url, :pblanc_url, :constructor, :developer,
|
||||||
@@ -230,6 +231,7 @@ def upsert_announcement(data: Dict[str, Any]) -> tuple:
|
|||||||
rent_secd=excluded.rent_secd,
|
rent_secd=excluded.rent_secd,
|
||||||
region_code=excluded.region_code,
|
region_code=excluded.region_code,
|
||||||
region_name=excluded.region_name,
|
region_name=excluded.region_name,
|
||||||
|
district=excluded.district,
|
||||||
address=excluded.address,
|
address=excluded.address,
|
||||||
total_units=excluded.total_units,
|
total_units=excluded.total_units,
|
||||||
rcrit_date=excluded.rcrit_date,
|
rcrit_date=excluded.rcrit_date,
|
||||||
@@ -368,7 +370,7 @@ def create_announcement(data: Dict[str, Any]) -> Dict[str, Any]:
|
|||||||
|
|
||||||
ANNOUNCEMENT_COLUMNS = {
|
ANNOUNCEMENT_COLUMNS = {
|
||||||
"house_nm", "house_secd", "house_dtl_secd", "rent_secd",
|
"house_nm", "house_secd", "house_dtl_secd", "rent_secd",
|
||||||
"region_code", "region_name", "address", "total_units",
|
"region_code", "region_name", "district", "address", "total_units",
|
||||||
"rcrit_date", "receipt_start", "receipt_end", "spsply_start", "spsply_end",
|
"rcrit_date", "receipt_start", "receipt_end", "spsply_start", "spsply_end",
|
||||||
"gnrl_rank1_start", "gnrl_rank1_end", "winner_date",
|
"gnrl_rank1_start", "gnrl_rank1_end", "winner_date",
|
||||||
"contract_start", "contract_end", "homepage_url", "pblanc_url",
|
"contract_start", "contract_end", "homepage_url", "pblanc_url",
|
||||||
|
|||||||
131
realestate-lab/tests/test_collector.py
Normal file
131
realestate-lab/tests/test_collector.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
"""Tests for collector.py — _extract_district unit tests + collect_all integration tests."""
|
||||||
|
from datetime import date, timedelta
|
||||||
|
|
||||||
|
|
||||||
|
# ── _extract_district unit tests ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_extract_district_seoul_full_address():
    """A full '서울특별시 …' street address yields its district."""
    from app.collector import _extract_district

    announcement = {
        "address": "서울특별시 강남구 도곡동 123-45",
        "region_name": None,
    }
    district = _extract_district(announcement)
    assert district == "강남구"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_district_seoul_short():
    """With no address, the short '서울 <district>' region name is used."""
    from app.collector import _extract_district

    announcement = {"address": None, "region_name": "서울 송파구"}
    district = _extract_district(announcement)
    assert district == "송파구"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_district_busan_returns_none():
    """An address outside Seoul produces no district."""
    from app.collector import _extract_district

    announcement = {"address": "부산광역시 해운대구 우동", "region_name": None}
    district = _extract_district(announcement)
    assert district is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_district_empty_returns_none():
    """Empty strings in both source fields produce no district."""
    from app.collector import _extract_district

    announcement = {"address": "", "region_name": ""}
    district = _extract_district(announcement)
    assert district is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_district_seoul_county():
    """The short '서울 <district>' form is recognised in the address field.

    Despite "county" in the test name, the fixture is a 구 address; the 군
    suffix is not exercised here.
    """
    from app.collector import _extract_district

    announcement = {"address": "서울 강서구", "region_name": None}
    district = _extract_district(announcement)
    assert district == "강서구"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_district_prefers_address_over_region():
    """When both fields name a district, the one from the address wins."""
    from app.collector import _extract_district

    announcement = {
        "address": "서울특별시 마포구 합정동",
        "region_name": "서울 강남구",
    }
    district = _extract_district(announcement)
    assert district == "마포구"
|
||||||
|
|
||||||
|
|
||||||
|
# ── collect_all integration tests ────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_collect_skips_completed_status(monkeypatch):
    """A response whose winner date is in the past is judged '완료' and is not upserted."""
    from app import collector
    from app.db import _conn

    monkeypatch.setenv("DATA_GO_KR_API_KEY", "TEST")
    monkeypatch.setattr(collector, "API_KEY", "TEST")

    # All dates are derived from today so the fixture always describes a
    # finished announcement (receipt window closed, then winners announced)
    # regardless of when the suite runs. Hard-coded calendar dates would make
    # the receipt window drift relative to the winner date over time.
    today = date.today()
    past_start = (today - timedelta(days=30)).strftime("%Y-%m-%d")
    past_end = (today - timedelta(days=25)).strftime("%Y-%m-%d")
    past_winner = (today - timedelta(days=10)).strftime("%Y-%m-%d")

    fake_detail_rows = [{
        "HOUSE_MANAGE_NO": "DONE-1",
        "PBLANC_NO": "01",
        "HOUSE_NM": "완료된단지",
        "HSSPLY_ADRES": "서울특별시 강남구",
        "RCEPT_BGNDE": past_start,
        "RCEPT_ENDDE": past_end,
        "PRZWNER_PRESNATN_DE": past_winner,
    }]

    def fake_call(endpoint, params=None):
        # Only detail endpoints return data; every other endpoint is empty.
        if "Detail" in endpoint:
            return fake_detail_rows
        return []

    monkeypatch.setattr(collector, "_api_call", fake_call)
    collector.collect_all()

    with _conn() as conn:
        rows = conn.execute("SELECT * FROM announcements WHERE house_manage_no='DONE-1'").fetchall()
    assert len(rows) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_collect_stores_district_for_seoul_announcement(monkeypatch):
    """A still-open Seoul announcement is stored with its district filled in."""
    from app import collector
    from app.db import _conn

    monkeypatch.setenv("DATA_GO_KR_API_KEY", "TEST")
    monkeypatch.setattr(collector, "API_KEY", "TEST")

    # Every date lies in the future so the announcement is not skipped as '완료'.
    today = date.today()
    future_start = (today + timedelta(days=10)).strftime("%Y-%m-%d")
    future_end = (today + timedelta(days=15)).strftime("%Y-%m-%d")
    future_winner = (today + timedelta(days=30)).strftime("%Y-%m-%d")

    fake_detail = [{
        "HOUSE_MANAGE_NO": "SEOUL-1",
        "PBLANC_NO": "01",
        "HOUSE_NM": "강남단지",
        "HSSPLY_ADRES": "서울특별시 강남구 도곡동 1",
        "RCEPT_BGNDE": future_start,
        "RCEPT_ENDDE": future_end,
        "PRZWNER_PRESNATN_DE": future_winner,
    }]

    def fake_call(endpoint, params=None):
        # Only detail endpoints return data; every other endpoint is empty.
        if "Detail" in endpoint:
            return fake_detail
        return []

    monkeypatch.setattr(collector, "_api_call", fake_call)
    collector.collect_all()

    with _conn() as conn:
        row = conn.execute("SELECT district, status FROM announcements WHERE house_manage_no='SEOUL-1'").fetchone()
    # Fail with a readable message instead of a TypeError from subscripting
    # None when the announcement was never stored.
    assert row is not None, "SEOUL-1 announcement was not stored"
    assert row["district"] == "강남구"
    assert row["status"] in ("청약예정", "청약중")
|
||||||
|
|
||||||
|
|
||||||
|
def test_collect_passes_date_window_param(monkeypatch):
    """Every detail endpoint is called with a RCRIT_PBLANC_DE_FROM 30 days before today."""
    from app import collector

    monkeypatch.setenv("DATA_GO_KR_API_KEY", "TEST")
    monkeypatch.setattr(collector, "API_KEY", "TEST")

    # Pin the collector's notion of "today" so the expected value cannot drift
    # if the calendar day rolls over between collect_all() and the assertion.
    frozen_today = date(2026, 3, 15)

    class _FrozenDate(date):
        @classmethod
        def today(cls):
            return cls(frozen_today.year, frozen_today.month, frozen_today.day)

    monkeypatch.setattr(collector, "date", _FrozenDate)

    captured_calls = []

    def fake_call(endpoint, params=None):
        # Record the endpoint along with its params so each detail endpoint
        # can be checked individually below.
        captured_calls.append((endpoint, params or {}))
        return []

    monkeypatch.setattr(collector, "_api_call", fake_call)
    collector.collect_all()

    expected_from = (frozen_today - timedelta(days=30)).strftime("%Y%m%d")
    detail_calls = [p for _, p in captured_calls if "RCRIT_PBLANC_DE_FROM" in p]
    assert detail_calls, "detail 엔드포인트 호출에 윈도우 파라미터가 없음"
    assert detail_calls[0]["RCRIT_PBLANC_DE_FROM"] == expected_from

    # The window must reach every detail endpoint, not just the first call.
    for detail_ep, _model_ep in collector.DETAIL_ENDPOINTS:
        windows = [
            p.get("RCRIT_PBLANC_DE_FROM")
            for ep, p in captured_calls
            if ep == detail_ep
        ]
        assert expected_from in windows, detail_ep
|
||||||
Reference in New Issue
Block a user