has issues

This commit is contained in:
Andras Schmelczer 2026-05-25 13:20:17 +01:00
parent 2e112d7398
commit c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions

View file

@ -1,6 +1,9 @@
import logging
import os
import re
import signal
import time
from contextlib import contextmanager
from pathlib import Path
from typing import Iterable
@ -15,6 +18,7 @@ from constants import (
)
from http_client import make_client
from onthemarket import search_outcode as onthemarket_search_outcode
from rightmove import resolve_outcode_id
from rightmove import search_outcode as rightmove_search_outcode
from spatial import PostcodeSpatialIndex
@ -25,7 +29,7 @@ from zoopla import search_outcode as zoopla_search_outcode
# Module-level logger shared by every scraper in this file.
# NOTE(review): the logger is named "rightmove" although the module also drives
# other sources -- confirm this is intentional before renaming.
log = logging.getLogger("rightmove")

# Canonical source order. _source_names() returns requested sources in this
# order and _merge_properties() walks sources in this order, so earlier
# entries are kept when duplicates are dropped.
SOURCE_ORDER = ("rightmove", "onthemarket", "zoopla")

# First configured channel; presumably the "for sale" channel -- verify
# against constants.CHANNELS.
SALE_CHANNEL = CHANNELS[0]

# Upper-cased, de-duplicated and sorted outcode prefixes.
LONDON_AREAS = sorted({prefix.upper() for prefix in LONDON_OUTCODE_PREFIXES})

# Captures a leading outcode-shaped token: 1-2 letters, a digit, then an
# optional letter or digit (upper-case input expected).
OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z0-9]?)")
@ -121,29 +125,6 @@ def load_outcodes() -> list[str]:
return londonish
def build_postcode_index() -> PostcodeSpatialIndex:
    """Build the postcode spatial index from the ARCGIS postcode parquet file.

    Reads only the postcode, country, "lat" and "long" columns, keeps England
    rows (country code "E92000001") that match the London-ish postcode
    expression, and drops rows without coordinates.

    Returns:
        A PostcodeSpatialIndex built from parallel lists of latitudes,
        longitudes and normalised postcodes.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    postcode_col, country_col = _arcgis_columns()

    wanted_columns = [postcode_col, country_col, "lat", "long"]
    frame = pl.read_parquet(ARCGIS_PATH, columns=wanted_columns)

    is_england = pl.col(country_col) == "E92000001"
    is_londonish = _londonish_postcode_expr(postcode_col)
    located = frame.filter(is_england & is_londonish).drop_nulls(
        subset=["lat", "long"]
    )

    latitudes = located.get_column("lat").to_list()
    longitudes = located.get_column("long").to_list()
    postcodes = [
        _normalize_postcode(raw)
        for raw in located.get_column(postcode_col).to_list()
    ]
    return PostcodeSpatialIndex(latitudes, longitudes, postcodes)
def build_postcode_coords() -> dict[str, tuple[float, float]]:
"""Build postcode -> (lat, lng) lookup from ARCGIS England postcodes."""
log.info("Building postcode coords lookup from %s", ARCGIS_PATH)
@ -168,6 +149,29 @@ def build_postcode_coords() -> dict[str, tuple[float, float]]:
return coords
def build_postcode_index() -> PostcodeSpatialIndex:
    """Build the postcode spatial index from the ARCGIS postcode parquet file.

    Reads only the postcode, country, "lat" and "long" columns, keeps England
    rows (country code "E92000001") that match the London-ish postcode
    expression, and drops rows without coordinates.

    Returns:
        A PostcodeSpatialIndex built from parallel lists of latitudes,
        longitudes and normalised postcodes.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    postcode_col, country_col = _arcgis_columns()

    wanted_columns = [postcode_col, country_col, "lat", "long"]
    frame = pl.read_parquet(ARCGIS_PATH, columns=wanted_columns)

    is_england = pl.col(country_col) == "E92000001"
    is_londonish = _londonish_postcode_expr(postcode_col)
    located = frame.filter(is_england & is_londonish).drop_nulls(
        subset=["lat", "long"]
    )

    latitudes = located.get_column("lat").to_list()
    longitudes = located.get_column("long").to_list()
    postcodes = [
        _normalize_postcode(raw)
        for raw in located.get_column(postcode_col).to_list()
    ]
    return PostcodeSpatialIndex(latitudes, longitudes, postcodes)
def _source_names(sources: str | Iterable[str] | None) -> list[str]:
if sources is None:
return list(SOURCE_ORDER)
@ -185,34 +189,44 @@ def _source_names(sources: str | Iterable[str] | None) -> list[str]:
return [source for source in SOURCE_ORDER if source in requested]
def _dedup_key(prop: dict) -> tuple:
return (prop.get("Postcode", ""), prop.get("Bedrooms", 0), prop.get("price", 0))
def _dedup_key(prop: dict) -> tuple | None:
postcode = str(prop.get("Postcode") or "").strip().upper()
price = int(prop.get("price") or 0)
if not postcode or price <= 0:
return None
return (postcode, int(prop.get("Bedrooms") or 0), price)
def _merge_properties(source_results: dict[str, list[dict]]) -> tuple[list[dict], dict, int]:
merged: dict[str, dict] = {}
seen_keys: set[tuple] = set()
seen_keys: dict[tuple, str] = {}
seen_ids: set[str] = set()
counts = {source: 0 for source in SOURCE_ORDER}
deduped = 0
for source in SOURCE_ORDER:
for prop in source_results.get(source, []):
prop_id = prop.get("id")
if prop_id is not None:
prop_id = str(prop_id)
key = _dedup_key(prop)
prop_id_raw = prop.get("id")
prop_id = str(prop_id_raw).strip() if prop_id_raw is not None else None
if prop_id:
if prop_id in seen_ids:
deduped += 1
continue
if key is not None:
previous_source = seen_keys.get(key)
if previous_source is not None and previous_source != source:
deduped += 1
continue
seen_ids.add(prop_id)
storage_key = prop_id
else:
key = _dedup_key(prop)
if key in seen_keys:
if key is not None and key in seen_keys:
deduped += 1
continue
seen_keys.add(key)
storage_key = f"{source}:{len(merged)}"
if key is not None:
seen_keys.setdefault(key, source)
merged[storage_key] = prop
counts[source] += 1
@ -260,38 +274,24 @@ def _store_properties(
return len(selected)
def _record_error(
errors: list[str], source: str, outcode: str, exc: Exception
) -> None:
def _exception_detail(exc: BaseException) -> str:
detail = " ".join(str(exc).split())
if not detail:
detail = repr(exc)
if len(detail) > 300:
detail = f"{detail[:300]}..."
return f"{type(exc).__name__}: {detail}"
def _record_error(
    errors: list[str], source: str, outcode: str, exc: BaseException
) -> None:
    """Append a ``"<source> <outcode>: <detail>"`` entry to *errors* and log it.

    Args:
        errors: Accumulator list, mutated in place.
        source: Name of the source being scraped.
        outcode: Outcode that was being processed when the failure occurred.
        exc: The failure; formatted via _exception_detail().
    """
    message = f"{source} {outcode}: {_exception_detail(exc)}"
    errors.append(message)
    log.warning(message)
def _launch_zoopla_with_retries(attempts: int = 3):
    """Launch the Zoopla browser, retrying transient launch failures.

    Args:
        attempts: Maximum number of launch attempts; must be at least 1.

    Returns:
        Whatever launch_zoopla_browser() returns (callers in this module
        unpack it as ``browser, page``).

    Raises:
        ValueError: If *attempts* is less than 1.
        TurnstileError: Re-raised immediately, without retrying.
        Exception: The error from the final attempt once all attempts fail.
    """
    # Validate explicitly: an `assert` would be stripped under `python -O`.
    if attempts < 1:
        raise ValueError("attempts must be at least 1")

    for attempt in range(1, attempts + 1):
        try:
            return launch_zoopla_browser()
        except TurnstileError:
            raise
        except Exception as exc:
            log.warning(
                "Zoopla browser launch failed (%d/%d): %s",
                attempt,
                attempts,
                exc,
            )
            if attempt == attempts:
                # Out of attempts: surface the last failure right away rather
                # than sleeping first.
                raise
            time.sleep(5)
def _scrape_rightmove(
outcodes: list[str],
pc_index: PostcodeSpatialIndex,
@ -349,6 +349,95 @@ def _scrape_rightmove(
client.close()
class OutcodeTimeout(BaseException):
    """Signal that a single outcode ran past its wall-clock budget.

    Deliberately derived from BaseException rather than Exception: the
    SIGALRM-triggered raise can happen at any bytecode boundary, including
    inside one of the broad `except Exception:` handlers in zoopla.py, and
    those handlers must not be able to swallow the timeout silently.
    """
def _zoopla_outcode_timeout_seconds() -> int:
raw = os.environ.get("ZOOPLA_OUTCODE_TIMEOUT_SECONDS")
if raw is None:
return 300
try:
timeout = int(raw)
except ValueError as exc:
raise ValueError("ZOOPLA_OUTCODE_TIMEOUT_SECONDS must be an integer") from exc
if timeout < 1:
raise ValueError("ZOOPLA_OUTCODE_TIMEOUT_SECONDS must be greater than zero")
return timeout
@contextmanager
def _wall_clock_timeout(seconds: int, label: str):
"""SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.
Interrupts a hung Playwright IPC by delivering SIGALRM to the main thread;
socket waits return EINTR and the handler raises into the caller. The
browser is presumed unhealthy afterwards caller must relaunch it."""
if seconds <= 0:
yield
return
def _handler(signum, frame):
raise OutcodeTimeout(f"{label} exceeded {seconds}s budget")
old_handler = signal.signal(signal.SIGALRM, _handler)
signal.alarm(seconds)
try:
yield
finally:
signal.alarm(0)
signal.signal(signal.SIGALRM, old_handler)
def _launch_zoopla_with_retries(attempts: int = 3):
    """Launch the Zoopla browser, retrying transient launch failures.

    Args:
        attempts: Maximum number of launch attempts; must be at least 1.

    Returns:
        Whatever launch_zoopla_browser() returns (callers in this module
        unpack it as ``browser, page``).

    Raises:
        ValueError: If *attempts* is less than 1.
        TurnstileError: Re-raised immediately, without retrying.
        Exception: The error from the final attempt once all attempts fail.
    """
    # Validate explicitly: an `assert` would be stripped under `python -O`.
    if attempts < 1:
        raise ValueError("attempts must be at least 1")

    for attempt in range(1, attempts + 1):
        try:
            return launch_zoopla_browser()
        except TurnstileError:
            raise
        except Exception as exc:
            log.warning(
                "Zoopla browser launch failed (%d/%d): %s",
                attempt,
                attempts,
                exc,
            )
            if attempt == attempts:
                # Out of attempts: surface the last failure right away rather
                # than sleeping first.
                raise
            time.sleep(5)
def _close_zoopla_browser(browser, label: str) -> None:
    """Close *browser* without raising, falling back to a forced close.

    A normal ``browser.close()`` is attempted under a 15-second wall-clock
    guard. If that fails or times out, the browser's optional ``force_close``
    hook is called instead. Every failure is logged as a warning.

    Args:
        browser: Object exposing ``close()`` and optionally ``force_close()``.
        label: Prefix used in log messages and in the timeout label.
    """
    try:
        with _wall_clock_timeout(15, f"{label} browser close"):
            browser.close()
    except (OutcodeTimeout, Exception) as exc:
        log.warning(
            "%s browser close failed: %s; force-closing",
            label,
            _exception_detail(exc),
        )
    else:
        # Graceful close succeeded; nothing more to do.
        return

    hook = getattr(browser, "force_close", None)
    if callable(hook):
        try:
            hook()
        except Exception as exc:
            log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))
    else:
        log.warning("%s browser has no force-close hook", label)
def _scrape_zoopla(
outcodes: list[str],
pc_index: PostcodeSpatialIndex,
@ -364,6 +453,8 @@ def _scrape_zoopla(
log.warning("Zoopla skipped: browser launch failed: %s", exc)
return
outcode_timeout = _zoopla_outcode_timeout_seconds()
try:
for outcode in outcodes:
if _source_remaining(results, "zoopla", max_properties_per_source) == 0:
@ -372,15 +463,14 @@ def _scrape_zoopla(
for attempt in range(2):
try:
# Fetch the outcode page set first; _store_properties applies
# the London-ish postcode filter and source cap after transformation.
props, _ = zoopla_search_outcode(
page,
outcode,
pc_index,
pc_coords,
max_properties=None,
)
with _wall_clock_timeout(outcode_timeout, f"zoopla {outcode}"):
props, _ = zoopla_search_outcode(
page,
outcode,
pc_index,
pc_coords,
max_properties=None,
)
added = _store_properties(
results,
"zoopla",
@ -389,27 +479,74 @@ def _scrape_zoopla(
)
log.info("Zoopla %s: +%d", outcode, added)
break
except Exception as exc:
except (OutcodeTimeout, Exception) as exc:
if attempt == 1:
_record_error(errors, "zoopla", outcode, exc)
if isinstance(exc, TurnstileError):
return
break
log.warning("Zoopla %s failed; relaunching browser and retrying", outcode)
try:
browser.close()
except Exception:
pass
log.warning(
"Zoopla %s attempt %d/2 failed: %s; relaunching browser "
"and retrying",
outcode,
attempt + 1,
_exception_detail(exc),
)
_close_zoopla_browser(browser, f"zoopla {outcode}")
try:
browser, page = _launch_zoopla_with_retries()
log.info("Zoopla %s retrying with fresh browser", outcode)
except Exception as relaunch_exc:
_record_error(errors, "zoopla", outcode, relaunch_exc)
return
time.sleep(DELAY_BETWEEN_OUTCODES)
finally:
browser.close()
_close_zoopla_browser(browser, "zoopla final")
def _scrape_onthemarket(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape OnTheMarket outcode by outcode, storing listings into *results*.

    Stops as soon as the remaining budget for the source reaches zero. A
    failure on one outcode is recorded in *errors* and the loop moves on to
    the next outcode. The HTTP client is always closed on exit.

    Args:
        outcodes: Outcodes to search, in order.
        pc_index: Spatial index passed through to the OnTheMarket search.
        results: Per-source listing store, updated via _store_properties().
        errors: Accumulator for error messages, mutated in place.
        max_properties_per_source: Per-source cap handed to
            _source_remaining() and _store_properties().
    """
    source = "onthemarket"

    def budget_left():
        return _source_remaining(results, source, max_properties_per_source)

    client = make_client()
    try:
        for outcode in outcodes:
            if budget_left() == 0:
                log.info("OnTheMarket cap reached")
                return

            budget = budget_left()
            try:
                listings = onthemarket_search_outcode(
                    client,
                    outcode,
                    pc_index,
                    max_properties=budget,
                )
                stored = _store_properties(
                    results,
                    source,
                    listings,
                    max_properties_per_source,
                )
                log.info("OnTheMarket %s: +%d", outcode, stored)
            except Exception as exc:
                _record_error(errors, source, outcode, exc)

            # Pause between outcodes, whether or not this one succeeded.
            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        client.close()
def run_scrape(
@ -451,6 +588,15 @@ def run_scrape(
max_properties_per_source,
)
if "onthemarket" in selected_sources:
_scrape_onthemarket(
selected_outcodes,
pc_index,
results,
errors,
max_properties_per_source,
)
if "zoopla" in selected_sources:
if pc_coords is None:
pc_coords = build_postcode_coords()