646 lines
20 KiB
Python
646 lines
20 KiB
Python
import logging
|
|
import os
|
|
import re
|
|
import signal
|
|
import time
|
|
from contextlib import contextmanager
|
|
from pathlib import Path
|
|
from typing import Iterable
|
|
|
|
import polars as pl
|
|
|
|
from constants import (
|
|
ARCGIS_PATH,
|
|
CHANNELS,
|
|
DATA_DIR,
|
|
DELAY_BETWEEN_OUTCODES,
|
|
LONDON_OUTCODE_PREFIXES,
|
|
)
|
|
|
|
from http_client import make_client
|
|
from onthemarket import search_outcode as onthemarket_search_outcode
|
|
from rightmove import resolve_outcode_id
|
|
from rightmove import search_outcode as rightmove_search_outcode
|
|
from spatial import PostcodeSpatialIndex
|
|
from storage import write_parquet
|
|
from zoopla import TurnstileError
|
|
from zoopla import launch_browser as launch_zoopla_browser
|
|
from zoopla import search_outcode as zoopla_search_outcode
|
|
|
|
log = logging.getLogger("rightmove")
|
|
|
|
SOURCE_ORDER = ("rightmove", "onthemarket", "zoopla")
|
|
SALE_CHANNEL = CHANNELS[0]
|
|
LONDON_AREAS = sorted({prefix.upper() for prefix in LONDON_OUTCODE_PREFIXES})
|
|
OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z0-9]?)")
|
|
|
|
|
|
def _arcgis_columns() -> tuple[str, str]:
|
|
"""Return postcode and country column names for supported ARCGIS schemas."""
|
|
columns = set(pl.scan_parquet(ARCGIS_PATH).collect_schema().names())
|
|
|
|
if "pcd" in columns:
|
|
postcode_col = "pcd"
|
|
elif "pcds" in columns:
|
|
postcode_col = "pcds"
|
|
else:
|
|
raise ValueError(f"{ARCGIS_PATH} has no supported postcode column")
|
|
|
|
if "ctry" in columns:
|
|
country_col = "ctry"
|
|
elif "ctry25cd" in columns:
|
|
country_col = "ctry25cd"
|
|
else:
|
|
raise ValueError(f"{ARCGIS_PATH} has no supported country column")
|
|
|
|
return postcode_col, country_col
|
|
|
|
|
|
def _normalize_postcode(postcode: str) -> str:
|
|
compact = "".join(str(postcode).upper().split())
|
|
if len(compact) < 5:
|
|
return compact
|
|
return compact[:-3] + " " + compact[-3:]
|
|
|
|
|
|
def _londonish_postcode_expr(postcode_col: str) -> pl.Expr:
|
|
return (
|
|
pl.col(postcode_col)
|
|
.str.to_uppercase()
|
|
.str.extract(r"^([A-Z]{1,2})", 1)
|
|
.is_in(LONDON_AREAS)
|
|
)
|
|
|
|
|
|
def _outcode_area(outcode: str) -> str:
|
|
chars = []
|
|
for ch in outcode.upper():
|
|
if not ch.isalpha():
|
|
break
|
|
chars.append(ch)
|
|
return "".join(chars)
|
|
|
|
|
|
def is_londonish_outcode(outcode: str) -> bool:
|
|
normalized = outcode.upper()
|
|
return normalized in LONDON_AREAS or _outcode_area(normalized) in LONDON_AREAS
|
|
|
|
|
|
def _property_is_londonish(prop: dict) -> bool:
|
|
postcode = str(prop.get("Postcode") or "").upper().strip()
|
|
match = OUTCODE_RE.match(postcode)
|
|
return bool(match and is_londonish_outcode(match.group(1)))
|
|
|
|
|
|
def filter_londonish_outcodes(outcodes: Iterable[str]) -> list[str]:
|
|
return sorted(
|
|
{outcode.upper() for outcode in outcodes if is_londonish_outcode(outcode)}
|
|
)
|
|
|
|
|
|
def load_outcodes() -> list[str]:
|
|
"""Load England outcodes from ARCGIS and keep only Greater London-ish areas."""
|
|
log.info("Loading outcodes from %s", ARCGIS_PATH)
|
|
postcode_col, country_col = _arcgis_columns()
|
|
df = pl.read_parquet(ARCGIS_PATH, columns=[postcode_col, country_col])
|
|
england = df.filter(
|
|
(pl.col(country_col) == "E92000001")
|
|
& _londonish_postcode_expr(postcode_col)
|
|
)
|
|
|
|
outcodes = (
|
|
england.select(
|
|
pl.col(postcode_col)
|
|
.str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1)
|
|
.alias("outcode")
|
|
)
|
|
.drop_nulls()
|
|
.get_column("outcode")
|
|
.unique()
|
|
.sort()
|
|
.to_list()
|
|
)
|
|
londonish = filter_londonish_outcodes(outcodes)
|
|
log.info("Greater London-ish outcodes: %d", len(londonish))
|
|
return londonish
|
|
|
|
|
|
def build_postcode_coords() -> dict[str, tuple[float, float]]:
|
|
"""Build postcode -> (lat, lng) lookup from ARCGIS England postcodes."""
|
|
log.info("Building postcode coords lookup from %s", ARCGIS_PATH)
|
|
postcode_col, country_col = _arcgis_columns()
|
|
df = pl.read_parquet(
|
|
ARCGIS_PATH, columns=[postcode_col, country_col, "lat", "long"]
|
|
)
|
|
england = df.filter(
|
|
(pl.col(country_col) == "E92000001")
|
|
& _londonish_postcode_expr(postcode_col)
|
|
).drop_nulls(
|
|
subset=["lat", "long"]
|
|
)
|
|
coords: dict[str, tuple[float, float]] = {}
|
|
for pcd, lat, lng in zip(
|
|
england.get_column(postcode_col).to_list(),
|
|
england.get_column("lat").to_list(),
|
|
england.get_column("long").to_list(),
|
|
):
|
|
coords[_normalize_postcode(pcd)] = (lat, lng)
|
|
log.info("Postcode coords lookup: %d postcodes", len(coords))
|
|
return coords
|
|
|
|
|
|
def build_postcode_index() -> PostcodeSpatialIndex:
|
|
"""Build spatial index from ARCGIS England postcodes."""
|
|
log.info("Building postcode spatial index from %s", ARCGIS_PATH)
|
|
postcode_col, country_col = _arcgis_columns()
|
|
df = pl.read_parquet(
|
|
ARCGIS_PATH, columns=[postcode_col, country_col, "lat", "long"]
|
|
)
|
|
england = df.filter(
|
|
(pl.col(country_col) == "E92000001")
|
|
& _londonish_postcode_expr(postcode_col)
|
|
).drop_nulls(
|
|
subset=["lat", "long"]
|
|
)
|
|
return PostcodeSpatialIndex(
|
|
england.get_column("lat").to_list(),
|
|
england.get_column("long").to_list(),
|
|
[
|
|
_normalize_postcode(pcd)
|
|
for pcd in england.get_column(postcode_col).to_list()
|
|
],
|
|
)
|
|
|
|
|
|
def _source_names(sources: str | Iterable[str] | None) -> list[str]:
|
|
if sources is None:
|
|
return list(SOURCE_ORDER)
|
|
if isinstance(sources, str):
|
|
requested = [part.strip().lower() for part in sources.split(",")]
|
|
else:
|
|
requested = [str(source).strip().lower() for source in sources]
|
|
|
|
requested = [source for source in requested if source]
|
|
unknown = sorted(set(requested) - set(SOURCE_ORDER) - {"all"})
|
|
if unknown:
|
|
raise ValueError(f"Unknown source(s): {', '.join(unknown)}")
|
|
if "all" in requested:
|
|
return list(SOURCE_ORDER)
|
|
return [source for source in SOURCE_ORDER if source in requested]
|
|
|
|
|
|
def _dedup_key(prop: dict) -> tuple | None:
|
|
postcode = str(prop.get("Postcode") or "").strip().upper()
|
|
price = int(prop.get("price") or 0)
|
|
if not postcode or price <= 0:
|
|
return None
|
|
return (postcode, int(prop.get("Bedrooms") or 0), price)
|
|
|
|
|
|
def _merge_properties(source_results: dict[str, list[dict]]) -> tuple[list[dict], dict, int]:
|
|
merged: dict[str, dict] = {}
|
|
seen_keys: dict[tuple, str] = {}
|
|
seen_ids: set[str] = set()
|
|
counts = {source: 0 for source in SOURCE_ORDER}
|
|
deduped = 0
|
|
|
|
for source in SOURCE_ORDER:
|
|
for prop in source_results.get(source, []):
|
|
key = _dedup_key(prop)
|
|
prop_id_raw = prop.get("id")
|
|
prop_id = str(prop_id_raw).strip() if prop_id_raw is not None else None
|
|
if prop_id:
|
|
if prop_id in seen_ids:
|
|
deduped += 1
|
|
continue
|
|
if key is not None:
|
|
previous_source = seen_keys.get(key)
|
|
if previous_source is not None and previous_source != source:
|
|
deduped += 1
|
|
continue
|
|
seen_ids.add(prop_id)
|
|
storage_key = prop_id
|
|
else:
|
|
if key is not None and key in seen_keys:
|
|
deduped += 1
|
|
continue
|
|
storage_key = f"{source}:{len(merged)}"
|
|
if key is not None:
|
|
seen_keys.setdefault(key, source)
|
|
merged[storage_key] = prop
|
|
counts[source] += 1
|
|
|
|
return list(merged.values()), counts, deduped
|
|
|
|
|
|
def _source_total(
|
|
results: dict[str, list[dict]],
|
|
source: str,
|
|
) -> int:
|
|
return len(results[source])
|
|
|
|
|
|
def _source_remaining(
|
|
results: dict[str, list[dict]],
|
|
source: str,
|
|
max_properties_per_source: int | None,
|
|
) -> int | None:
|
|
if max_properties_per_source is None:
|
|
return None
|
|
return max(max_properties_per_source - _source_total(results, source), 0)
|
|
|
|
|
|
def _store_properties(
|
|
results: dict[str, list[dict]],
|
|
source: str,
|
|
props: list[dict],
|
|
max_properties_per_source: int | None,
|
|
) -> int:
|
|
remaining = _source_remaining(results, source, max_properties_per_source)
|
|
if remaining == 0:
|
|
return 0
|
|
|
|
londonish = [prop for prop in props if _property_is_londonish(prop)]
|
|
dropped_outside_area = len(props) - len(londonish)
|
|
if dropped_outside_area:
|
|
log.debug(
|
|
"%s dropped %d properties outside the Greater London-ish postcode filter",
|
|
source,
|
|
dropped_outside_area,
|
|
)
|
|
|
|
selected = londonish if remaining is None else londonish[:remaining]
|
|
results[source].extend(selected)
|
|
return len(selected)
|
|
|
|
|
|
def _exception_detail(exc: BaseException) -> str:
|
|
detail = " ".join(str(exc).split())
|
|
if not detail:
|
|
detail = repr(exc)
|
|
if len(detail) > 300:
|
|
detail = f"{detail[:300]}..."
|
|
return f"{type(exc).__name__}: {detail}"
|
|
|
|
|
|
def _record_error(
|
|
errors: list[str], source: str, outcode: str, exc: BaseException
|
|
) -> None:
|
|
detail = _exception_detail(exc)
|
|
message = f"{source} {outcode}: {detail}"
|
|
errors.append(message)
|
|
log.warning(message)
|
|
|
|
|
|
def _scrape_rightmove(
|
|
outcodes: list[str],
|
|
pc_index: PostcodeSpatialIndex,
|
|
results: dict[str, list[dict]],
|
|
errors: list[str],
|
|
max_properties_per_source: int | None,
|
|
) -> None:
|
|
client = make_client()
|
|
try:
|
|
for outcode in outcodes:
|
|
if _source_remaining(results, "rightmove", max_properties_per_source) == 0:
|
|
log.info("Rightmove cap reached")
|
|
return
|
|
|
|
try:
|
|
outcode_id = resolve_outcode_id(client, outcode)
|
|
except Exception as exc:
|
|
_record_error(errors, "rightmove", outcode, exc)
|
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
|
continue
|
|
|
|
if not outcode_id:
|
|
log.debug("No Rightmove outcode ID for %s", outcode)
|
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
|
continue
|
|
|
|
remaining = _source_remaining(
|
|
results, "rightmove", max_properties_per_source
|
|
)
|
|
if remaining == 0:
|
|
log.info("Rightmove cap reached")
|
|
return
|
|
|
|
try:
|
|
props = rightmove_search_outcode(
|
|
client,
|
|
outcode_id,
|
|
outcode,
|
|
SALE_CHANNEL,
|
|
pc_index,
|
|
max_properties=remaining,
|
|
)
|
|
added = _store_properties(
|
|
results,
|
|
"rightmove",
|
|
props,
|
|
max_properties_per_source,
|
|
)
|
|
log.info("Rightmove %s: +%d", outcode, added)
|
|
except Exception as exc:
|
|
_record_error(errors, "rightmove", outcode, exc)
|
|
|
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
|
finally:
|
|
client.close()
|
|
|
|
|
|
class OutcodeTimeout(BaseException):
|
|
"""Raised when a single outcode exceeds the wall-clock budget.
|
|
|
|
Inherits BaseException (not Exception) so the SIGALRM-triggered raise can't
|
|
be silently swallowed by any of the broad `except Exception:` handlers
|
|
inside zoopla.py — the signal may fire at any bytecode boundary, including
|
|
inside those handlers."""
|
|
|
|
|
|
def _zoopla_outcode_timeout_seconds() -> int:
|
|
raw = os.environ.get("ZOOPLA_OUTCODE_TIMEOUT_SECONDS")
|
|
if raw is None:
|
|
return 300
|
|
try:
|
|
timeout = int(raw)
|
|
except ValueError as exc:
|
|
raise ValueError("ZOOPLA_OUTCODE_TIMEOUT_SECONDS must be an integer") from exc
|
|
if timeout < 1:
|
|
raise ValueError("ZOOPLA_OUTCODE_TIMEOUT_SECONDS must be greater than zero")
|
|
return timeout
|
|
|
|
|
|
@contextmanager
|
|
def _wall_clock_timeout(seconds: int, label: str):
|
|
"""SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.
|
|
|
|
Interrupts a hung Playwright IPC by delivering SIGALRM to the main thread;
|
|
socket waits return EINTR and the handler raises into the caller. The
|
|
browser is presumed unhealthy afterwards — caller must relaunch it."""
|
|
if seconds <= 0:
|
|
yield
|
|
return
|
|
|
|
def _handler(signum, frame):
|
|
raise OutcodeTimeout(f"{label} exceeded {seconds}s budget")
|
|
|
|
old_handler = signal.signal(signal.SIGALRM, _handler)
|
|
signal.alarm(seconds)
|
|
try:
|
|
yield
|
|
finally:
|
|
signal.alarm(0)
|
|
signal.signal(signal.SIGALRM, old_handler)
|
|
|
|
|
|
def _launch_zoopla_with_retries(attempts: int = 3):
|
|
last_error: Exception | None = None
|
|
for attempt in range(1, attempts + 1):
|
|
try:
|
|
return launch_zoopla_browser()
|
|
except TurnstileError:
|
|
raise
|
|
except Exception as exc:
|
|
last_error = exc
|
|
log.warning(
|
|
"Zoopla browser launch failed (%d/%d): %s",
|
|
attempt,
|
|
attempts,
|
|
exc,
|
|
)
|
|
time.sleep(5)
|
|
|
|
assert last_error is not None
|
|
raise last_error
|
|
|
|
|
|
def _close_zoopla_browser(browser, label: str) -> None:
|
|
try:
|
|
with _wall_clock_timeout(15, f"{label} browser close"):
|
|
browser.close()
|
|
return
|
|
except (OutcodeTimeout, Exception) as exc:
|
|
log.warning(
|
|
"%s browser close failed: %s; force-closing",
|
|
label,
|
|
_exception_detail(exc),
|
|
)
|
|
|
|
force_close = getattr(browser, "force_close", None)
|
|
if not callable(force_close):
|
|
log.warning("%s browser has no force-close hook", label)
|
|
return
|
|
|
|
try:
|
|
force_close()
|
|
except Exception as exc:
|
|
log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))
|
|
|
|
|
|
def _scrape_zoopla(
|
|
outcodes: list[str],
|
|
pc_index: PostcodeSpatialIndex,
|
|
pc_coords: dict[str, tuple[float, float]],
|
|
results: dict[str, list[dict]],
|
|
errors: list[str],
|
|
max_properties_per_source: int | None,
|
|
) -> None:
|
|
try:
|
|
browser, page = _launch_zoopla_with_retries()
|
|
except Exception as exc:
|
|
errors.append(f"zoopla: browser launch failed: {exc}")
|
|
log.warning("Zoopla skipped: browser launch failed: %s", exc)
|
|
return
|
|
|
|
outcode_timeout = _zoopla_outcode_timeout_seconds()
|
|
|
|
try:
|
|
for outcode in outcodes:
|
|
if _source_remaining(results, "zoopla", max_properties_per_source) == 0:
|
|
log.info("Zoopla cap reached")
|
|
return
|
|
|
|
for attempt in range(2):
|
|
try:
|
|
with _wall_clock_timeout(outcode_timeout, f"zoopla {outcode}"):
|
|
props, _ = zoopla_search_outcode(
|
|
page,
|
|
outcode,
|
|
pc_index,
|
|
pc_coords,
|
|
max_properties=None,
|
|
)
|
|
added = _store_properties(
|
|
results,
|
|
"zoopla",
|
|
props,
|
|
max_properties_per_source,
|
|
)
|
|
log.info("Zoopla %s: +%d", outcode, added)
|
|
break
|
|
except (OutcodeTimeout, Exception) as exc:
|
|
if attempt == 1:
|
|
_record_error(errors, "zoopla", outcode, exc)
|
|
if isinstance(exc, TurnstileError):
|
|
return
|
|
break
|
|
|
|
log.warning(
|
|
"Zoopla %s attempt %d/2 failed: %s; relaunching browser "
|
|
"and retrying",
|
|
outcode,
|
|
attempt + 1,
|
|
_exception_detail(exc),
|
|
)
|
|
_close_zoopla_browser(browser, f"zoopla {outcode}")
|
|
try:
|
|
browser, page = _launch_zoopla_with_retries()
|
|
log.info("Zoopla %s retrying with fresh browser", outcode)
|
|
except Exception as relaunch_exc:
|
|
_record_error(errors, "zoopla", outcode, relaunch_exc)
|
|
return
|
|
|
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
|
finally:
|
|
_close_zoopla_browser(browser, "zoopla final")
|
|
|
|
|
|
def _scrape_onthemarket(
|
|
outcodes: list[str],
|
|
pc_index: PostcodeSpatialIndex,
|
|
results: dict[str, list[dict]],
|
|
errors: list[str],
|
|
max_properties_per_source: int | None,
|
|
) -> None:
|
|
client = make_client()
|
|
try:
|
|
for outcode in outcodes:
|
|
if (
|
|
_source_remaining(results, "onthemarket", max_properties_per_source)
|
|
== 0
|
|
):
|
|
log.info("OnTheMarket cap reached")
|
|
return
|
|
|
|
remaining = _source_remaining(
|
|
results, "onthemarket", max_properties_per_source
|
|
)
|
|
|
|
try:
|
|
props = onthemarket_search_outcode(
|
|
client,
|
|
outcode,
|
|
pc_index,
|
|
max_properties=remaining,
|
|
)
|
|
added = _store_properties(
|
|
results,
|
|
"onthemarket",
|
|
props,
|
|
max_properties_per_source,
|
|
)
|
|
log.info("OnTheMarket %s: +%d", outcode, added)
|
|
except Exception as exc:
|
|
_record_error(errors, "onthemarket", outcode, exc)
|
|
|
|
time.sleep(DELAY_BETWEEN_OUTCODES)
|
|
finally:
|
|
client.close()
|
|
|
|
|
|
def run_scrape(
|
|
outcodes: list[str],
|
|
pc_index: PostcodeSpatialIndex,
|
|
pc_coords: dict[str, tuple[float, float]] | None = None,
|
|
sources: str | Iterable[str] | None = None,
|
|
output_dir: str | Path | None = None,
|
|
max_properties_per_source: int | None = None,
|
|
) -> dict:
|
|
"""Run one manual sale-listings scrape and write a parquet output."""
|
|
selected_sources = _source_names(sources)
|
|
selected_outcodes = filter_londonish_outcodes(outcodes)
|
|
if not selected_sources:
|
|
raise ValueError("No sources selected")
|
|
if not selected_outcodes:
|
|
raise ValueError("No Greater London-ish outcodes selected")
|
|
|
|
output_base = Path(output_dir) if output_dir is not None else DATA_DIR
|
|
output_base.mkdir(parents=True, exist_ok=True)
|
|
|
|
errors: list[str] = []
|
|
results = {source: [] for source in SOURCE_ORDER}
|
|
started_at = time.time()
|
|
|
|
log.info(
|
|
"Starting manual sale scrape: %d outcodes, sources=%s, source_cap=%s",
|
|
len(selected_outcodes),
|
|
",".join(selected_sources),
|
|
max_properties_per_source,
|
|
)
|
|
|
|
if "rightmove" in selected_sources:
|
|
_scrape_rightmove(
|
|
selected_outcodes,
|
|
pc_index,
|
|
results,
|
|
errors,
|
|
max_properties_per_source,
|
|
)
|
|
|
|
if "onthemarket" in selected_sources:
|
|
_scrape_onthemarket(
|
|
selected_outcodes,
|
|
pc_index,
|
|
results,
|
|
errors,
|
|
max_properties_per_source,
|
|
)
|
|
|
|
if "zoopla" in selected_sources:
|
|
if pc_coords is None:
|
|
pc_coords = build_postcode_coords()
|
|
_scrape_zoopla(
|
|
selected_outcodes,
|
|
pc_index,
|
|
pc_coords,
|
|
results,
|
|
errors,
|
|
max_properties_per_source,
|
|
)
|
|
|
|
merged, source_counts, deduped = _merge_properties(results)
|
|
output_path = output_base / "online_listings_buy.parquet"
|
|
if merged:
|
|
write_parquet(merged, output_path)
|
|
else:
|
|
if output_path.exists():
|
|
output_path.unlink()
|
|
log.warning("No London-ish properties to write to %s", output_path)
|
|
|
|
counts = {
|
|
"total": len(merged),
|
|
"deduped": deduped,
|
|
"sources": source_counts,
|
|
}
|
|
source_summary = " ".join(
|
|
f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
|
|
)
|
|
log.info(
|
|
"Sale scrape complete: %d unique (%s deduped:%d)",
|
|
len(merged),
|
|
source_summary,
|
|
deduped,
|
|
)
|
|
|
|
return {
|
|
"outcodes": len(selected_outcodes),
|
|
"sources": selected_sources,
|
|
"source_totals": {
|
|
source: _source_total(results, source) for source in selected_sources
|
|
},
|
|
"counts": counts,
|
|
"path": str(output_path),
|
|
"errors": errors,
|
|
"elapsed_seconds": round(time.time() - started_at, 3),
|
|
}
|