has issues
This commit is contained in:
parent
2e112d7398
commit
c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions
|
|
@ -1,6 +1,9 @@
|
|||
import logging
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import time
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
|
|
@ -15,6 +18,7 @@ from constants import (
|
|||
)
|
||||
|
||||
from http_client import make_client
|
||||
from onthemarket import search_outcode as onthemarket_search_outcode
|
||||
from rightmove import resolve_outcode_id
|
||||
from rightmove import search_outcode as rightmove_search_outcode
|
||||
from spatial import PostcodeSpatialIndex
|
||||
|
|
@ -25,7 +29,7 @@ from zoopla import search_outcode as zoopla_search_outcode
|
|||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
SOURCE_ORDER = ("rightmove", "zoopla")
|
||||
SOURCE_ORDER = ("rightmove", "onthemarket", "zoopla")
|
||||
SALE_CHANNEL = CHANNELS[0]
|
||||
LONDON_AREAS = sorted({prefix.upper() for prefix in LONDON_OUTCODE_PREFIXES})
|
||||
OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z0-9]?)")
|
||||
|
|
@ -121,29 +125,6 @@ def load_outcodes() -> list[str]:
|
|||
return londonish
|
||||
|
||||
|
||||
def build_postcode_index() -> PostcodeSpatialIndex:
|
||||
"""Build spatial index from ARCGIS England postcodes."""
|
||||
log.info("Building postcode spatial index from %s", ARCGIS_PATH)
|
||||
postcode_col, country_col = _arcgis_columns()
|
||||
df = pl.read_parquet(
|
||||
ARCGIS_PATH, columns=[postcode_col, country_col, "lat", "long"]
|
||||
)
|
||||
england = df.filter(
|
||||
(pl.col(country_col) == "E92000001")
|
||||
& _londonish_postcode_expr(postcode_col)
|
||||
).drop_nulls(
|
||||
subset=["lat", "long"]
|
||||
)
|
||||
return PostcodeSpatialIndex(
|
||||
england.get_column("lat").to_list(),
|
||||
england.get_column("long").to_list(),
|
||||
[
|
||||
_normalize_postcode(pcd)
|
||||
for pcd in england.get_column(postcode_col).to_list()
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def build_postcode_coords() -> dict[str, tuple[float, float]]:
|
||||
"""Build postcode -> (lat, lng) lookup from ARCGIS England postcodes."""
|
||||
log.info("Building postcode coords lookup from %s", ARCGIS_PATH)
|
||||
|
|
@ -168,6 +149,29 @@ def build_postcode_coords() -> dict[str, tuple[float, float]]:
|
|||
return coords
|
||||
|
||||
|
||||
def build_postcode_index() -> PostcodeSpatialIndex:
    """Build spatial index from ARCGIS England postcodes.

    Reads the postcode, country, ``lat`` and ``long`` columns of the parquet
    file at ``ARCGIS_PATH``, keeps the rows whose country column equals
    ``"E92000001"`` and whose postcode satisfies
    ``_londonish_postcode_expr``, and drops rows with a null coordinate.

    Returns:
        A ``PostcodeSpatialIndex`` built from the surviving latitudes,
        longitudes and postcodes (each postcode passed through
        ``_normalize_postcode``), in matching order.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    postcode_col, country_col = _arcgis_columns()
    df = pl.read_parquet(
        ARCGIS_PATH, columns=[postcode_col, country_col, "lat", "long"]
    )
    england = df.filter(
        (pl.col(country_col) == "E92000001")
        & _londonish_postcode_expr(postcode_col)
    ).drop_nulls(subset=["lat", "long"])
    return PostcodeSpatialIndex(
        england.get_column("lat").to_list(),
        england.get_column("long").to_list(),
        [
            _normalize_postcode(pcd)
            for pcd in england.get_column(postcode_col).to_list()
        ],
    )
|
||||
|
||||
|
||||
def _source_names(sources: str | Iterable[str] | None) -> list[str]:
|
||||
if sources is None:
|
||||
return list(SOURCE_ORDER)
|
||||
|
|
@ -185,34 +189,44 @@ def _source_names(sources: str | Iterable[str] | None) -> list[str]:
|
|||
return [source for source in SOURCE_ORDER if source in requested]
|
||||
|
||||
|
||||
def _dedup_key(prop: dict) -> tuple:
|
||||
return (prop.get("Postcode", ""), prop.get("Bedrooms", 0), prop.get("price", 0))
|
||||
def _dedup_key(prop: dict) -> tuple | None:
|
||||
postcode = str(prop.get("Postcode") or "").strip().upper()
|
||||
price = int(prop.get("price") or 0)
|
||||
if not postcode or price <= 0:
|
||||
return None
|
||||
return (postcode, int(prop.get("Bedrooms") or 0), price)
|
||||
|
||||
|
||||
def _merge_properties(source_results: dict[str, list[dict]]) -> tuple[list[dict], dict, int]:
|
||||
merged: dict[str, dict] = {}
|
||||
seen_keys: set[tuple] = set()
|
||||
seen_keys: dict[tuple, str] = {}
|
||||
seen_ids: set[str] = set()
|
||||
counts = {source: 0 for source in SOURCE_ORDER}
|
||||
deduped = 0
|
||||
|
||||
for source in SOURCE_ORDER:
|
||||
for prop in source_results.get(source, []):
|
||||
prop_id = prop.get("id")
|
||||
if prop_id is not None:
|
||||
prop_id = str(prop_id)
|
||||
key = _dedup_key(prop)
|
||||
prop_id_raw = prop.get("id")
|
||||
prop_id = str(prop_id_raw).strip() if prop_id_raw is not None else None
|
||||
if prop_id:
|
||||
if prop_id in seen_ids:
|
||||
deduped += 1
|
||||
continue
|
||||
if key is not None:
|
||||
previous_source = seen_keys.get(key)
|
||||
if previous_source is not None and previous_source != source:
|
||||
deduped += 1
|
||||
continue
|
||||
seen_ids.add(prop_id)
|
||||
storage_key = prop_id
|
||||
else:
|
||||
key = _dedup_key(prop)
|
||||
if key in seen_keys:
|
||||
if key is not None and key in seen_keys:
|
||||
deduped += 1
|
||||
continue
|
||||
seen_keys.add(key)
|
||||
storage_key = f"{source}:{len(merged)}"
|
||||
if key is not None:
|
||||
seen_keys.setdefault(key, source)
|
||||
merged[storage_key] = prop
|
||||
counts[source] += 1
|
||||
|
||||
|
|
@ -260,38 +274,24 @@ def _store_properties(
|
|||
return len(selected)
|
||||
|
||||
|
||||
def _record_error(
|
||||
errors: list[str], source: str, outcode: str, exc: Exception
|
||||
) -> None:
|
||||
def _exception_detail(exc: BaseException, max_length: int = 300) -> str:
    """Format an exception as a single-line ``"TypeName: detail"`` string.

    Args:
        exc: The exception to describe.
        max_length: Maximum number of detail characters kept before the text
            is cut and ``"..."`` is appended.

    Returns:
        The exception's class name, a colon, and its message with every run
        of whitespace (including newlines) collapsed to one space. When the
        message is empty, ``repr(exc)`` is used as the detail instead.
    """
    detail = " ".join(str(exc).split())
    if not detail:
        detail = repr(exc)
    if len(detail) > max_length:
        detail = f"{detail[:max_length]}..."
    return f"{type(exc).__name__}: {detail}"
|
||||
|
||||
|
||||
def _record_error(
    errors: list[str], source: str, outcode: str, exc: BaseException
) -> None:
    """Record a scrape failure in ``errors`` and log it as a warning.

    Args:
        errors: List the formatted message is appended to (mutated in place).
        source: Name of the source that failed; first word of the message.
        outcode: Outcode being processed when the failure happened.
        exc: The failure; accepts ``BaseException`` so that exceptions not
            derived from ``Exception`` can be recorded too.
    """
    detail = _exception_detail(exc)
    message = f"{source} {outcode}: {detail}"
    errors.append(message)
    log.warning(message)
|
||||
|
||||
|
||||
def _launch_zoopla_with_retries(attempts: int = 3):
|
||||
last_error: Exception | None = None
|
||||
for attempt in range(1, attempts + 1):
|
||||
try:
|
||||
return launch_zoopla_browser()
|
||||
except TurnstileError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
last_error = exc
|
||||
log.warning(
|
||||
"Zoopla browser launch failed (%d/%d): %s",
|
||||
attempt,
|
||||
attempts,
|
||||
exc,
|
||||
)
|
||||
time.sleep(5)
|
||||
|
||||
assert last_error is not None
|
||||
raise last_error
|
||||
|
||||
|
||||
def _scrape_rightmove(
|
||||
outcodes: list[str],
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
|
|
@ -349,6 +349,95 @@ def _scrape_rightmove(
|
|||
client.close()
|
||||
|
||||
|
||||
class OutcodeTimeout(BaseException):
    """Raised when a single outcode exceeds the wall-clock budget.

    Inherits BaseException (not Exception) so the SIGALRM-triggered raise can't
    be silently swallowed by any of the broad `except Exception:` handlers
    inside zoopla.py — the signal may fire at any bytecode boundary, including
    inside those handlers.
    """
|
||||
|
||||
|
||||
def _zoopla_outcode_timeout_seconds() -> int:
    """Return the per-outcode Zoopla time budget in seconds.

    The value is read from the ``ZOOPLA_OUTCODE_TIMEOUT_SECONDS`` environment
    variable; when the variable is not set the budget is 300 seconds.

    Raises:
        ValueError: If the variable is set but is not an integer, or is an
            integer smaller than 1.
    """
    raw = os.environ.get("ZOOPLA_OUTCODE_TIMEOUT_SECONDS")
    if raw is None:
        return 300
    try:
        timeout = int(raw)
    except ValueError as exc:
        raise ValueError("ZOOPLA_OUTCODE_TIMEOUT_SECONDS must be an integer") from exc
    if timeout < 1:
        raise ValueError("ZOOPLA_OUTCODE_TIMEOUT_SECONDS must be greater than zero")
    return timeout
|
||||
|
||||
|
||||
@contextmanager
def _wall_clock_timeout(seconds: int, label: str):
    """SIGALRM-based wall-clock guard (POSIX). Raises OutcodeTimeout on expiry.

    Interrupts a hung Playwright IPC by delivering SIGALRM to the main thread;
    socket waits return EINTR and the handler raises into the caller. The
    browser is presumed unhealthy afterwards — caller must relaunch it.

    Args:
        seconds: Budget for the guarded block; a value of 0 or less disables
            the guard and the block runs unprotected.
        label: Text placed at the start of the OutcodeTimeout message.

    On exit the alarm is cancelled and the previous SIGALRM handler is
    restored. If an alarm was already pending on entry it is re-armed with
    the time it had left, so using the guard does not silently cancel a
    timer owned by the caller.
    """
    if seconds <= 0:
        yield
        return

    def _handler(signum, frame):
        raise OutcodeTimeout(f"{label} exceeded {seconds}s budget")

    old_handler = signal.signal(signal.SIGALRM, _handler)
    started = time.monotonic()
    # signal.alarm() replaces any pending alarm and returns how many seconds
    # that alarm had left (0 when none was pending).
    displaced = signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        signal.signal(signal.SIGALRM, old_handler)
        if displaced:
            elapsed = time.monotonic() - started
            # Never re-arm with 0: that would cancel instead of schedule.
            signal.alarm(max(1, round(displaced - elapsed)))
|
||||
|
||||
|
||||
def _launch_zoopla_with_retries(attempts: int = 3):
    """Call ``launch_zoopla_browser`` until it succeeds or attempts run out.

    Args:
        attempts: Maximum number of launch attempts; must be at least 1.

    Returns:
        Whatever ``launch_zoopla_browser`` returns on the first success.

    Raises:
        ValueError: If ``attempts`` is smaller than 1.
        TurnstileError: Re-raised immediately, without retrying.
        Exception: The error from the final attempt when every attempt fails.
    """
    if attempts < 1:
        raise ValueError("attempts must be at least 1")

    for attempt in range(1, attempts + 1):
        try:
            return launch_zoopla_browser()
        except TurnstileError:
            raise
        except Exception as exc:
            log.warning(
                "Zoopla browser launch failed (%d/%d): %s",
                attempt,
                attempts,
                exc,
            )
            if attempt == attempts:
                # Out of attempts: surface the last failure right away rather
                # than sleeping first.
                raise
            time.sleep(5)
|
||||
|
||||
|
||||
def _close_zoopla_browser(browser, label: str) -> None:
    """Close ``browser`` on a best-effort basis; failures are only logged.

    ``browser.close()`` runs under a 15-second ``_wall_clock_timeout``. If it
    raises or times out, a warning is logged and the browser's optional
    ``force_close`` callable is tried instead.

    Args:
        browser: Object exposing ``close()`` and, optionally, ``force_close()``.
        label: Prefix used in log messages and in the timeout label.
    """
    try:
        with _wall_clock_timeout(15, f"{label} browser close"):
            browser.close()
        return
    # OutcodeTimeout derives from BaseException, so it must be named
    # explicitly alongside Exception to be caught here.
    except (OutcodeTimeout, Exception) as exc:
        log.warning(
            "%s browser close failed: %s; force-closing",
            label,
            _exception_detail(exc),
        )

    force_close = getattr(browser, "force_close", None)
    if not callable(force_close):
        log.warning("%s browser has no force-close hook", label)
        return

    try:
        force_close()
    except Exception as exc:
        log.warning("%s browser force-close failed: %s", label, _exception_detail(exc))
|
||||
|
||||
|
||||
def _scrape_zoopla(
|
||||
outcodes: list[str],
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
|
|
@ -364,6 +453,8 @@ def _scrape_zoopla(
|
|||
log.warning("Zoopla skipped: browser launch failed: %s", exc)
|
||||
return
|
||||
|
||||
outcode_timeout = _zoopla_outcode_timeout_seconds()
|
||||
|
||||
try:
|
||||
for outcode in outcodes:
|
||||
if _source_remaining(results, "zoopla", max_properties_per_source) == 0:
|
||||
|
|
@ -372,15 +463,14 @@ def _scrape_zoopla(
|
|||
|
||||
for attempt in range(2):
|
||||
try:
|
||||
# Fetch the outcode page set first; _store_properties applies
|
||||
# the London-ish postcode filter and source cap after transformation.
|
||||
props, _ = zoopla_search_outcode(
|
||||
page,
|
||||
outcode,
|
||||
pc_index,
|
||||
pc_coords,
|
||||
max_properties=None,
|
||||
)
|
||||
with _wall_clock_timeout(outcode_timeout, f"zoopla {outcode}"):
|
||||
props, _ = zoopla_search_outcode(
|
||||
page,
|
||||
outcode,
|
||||
pc_index,
|
||||
pc_coords,
|
||||
max_properties=None,
|
||||
)
|
||||
added = _store_properties(
|
||||
results,
|
||||
"zoopla",
|
||||
|
|
@ -389,27 +479,74 @@ def _scrape_zoopla(
|
|||
)
|
||||
log.info("Zoopla %s: +%d", outcode, added)
|
||||
break
|
||||
except Exception as exc:
|
||||
except (OutcodeTimeout, Exception) as exc:
|
||||
if attempt == 1:
|
||||
_record_error(errors, "zoopla", outcode, exc)
|
||||
if isinstance(exc, TurnstileError):
|
||||
return
|
||||
break
|
||||
|
||||
log.warning("Zoopla %s failed; relaunching browser and retrying", outcode)
|
||||
try:
|
||||
browser.close()
|
||||
except Exception:
|
||||
pass
|
||||
log.warning(
|
||||
"Zoopla %s attempt %d/2 failed: %s; relaunching browser "
|
||||
"and retrying",
|
||||
outcode,
|
||||
attempt + 1,
|
||||
_exception_detail(exc),
|
||||
)
|
||||
_close_zoopla_browser(browser, f"zoopla {outcode}")
|
||||
try:
|
||||
browser, page = _launch_zoopla_with_retries()
|
||||
log.info("Zoopla %s retrying with fresh browser", outcode)
|
||||
except Exception as relaunch_exc:
|
||||
_record_error(errors, "zoopla", outcode, relaunch_exc)
|
||||
return
|
||||
|
||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||
finally:
|
||||
browser.close()
|
||||
_close_zoopla_browser(browser, "zoopla final")
|
||||
|
||||
|
||||
def _scrape_onthemarket(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Search OnTheMarket outcode by outcode and store what is found.

    For each outcode the properties returned by ``onthemarket_search_outcode``
    are handed to ``_store_properties`` under the ``"onthemarket"`` source.
    A failure on one outcode is recorded through ``_record_error`` and the
    loop moves on. The loop ends early once ``_source_remaining`` reports 0
    for the source.

    Args:
        outcodes: Outcodes to search, in order.
        pc_index: Spatial index forwarded to the search function.
        results: Per-source result store, passed to ``_source_remaining`` and
            ``_store_properties``.
        errors: List that failure messages are appended to.
        max_properties_per_source: Cap forwarded to ``_source_remaining`` and
            ``_store_properties``; may be ``None``.
    """
    client = make_client()
    try:
        for outcode in outcodes:
            # Computed once per outcode: used both for the cap check and as
            # the per-search limit.
            remaining = _source_remaining(
                results, "onthemarket", max_properties_per_source
            )
            if remaining == 0:
                log.info("OnTheMarket cap reached")
                return

            try:
                props = onthemarket_search_outcode(
                    client,
                    outcode,
                    pc_index,
                    max_properties=remaining,
                )
                added = _store_properties(
                    results,
                    "onthemarket",
                    props,
                    max_properties_per_source,
                )
                log.info("OnTheMarket %s: +%d", outcode, added)
            except Exception as exc:
                _record_error(errors, "onthemarket", outcode, exc)

            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        # Always release the HTTP client, including on the early return.
        client.close()
|
||||
|
||||
|
||||
def run_scrape(
|
||||
|
|
@ -451,6 +588,15 @@ def run_scrape(
|
|||
max_properties_per_source,
|
||||
)
|
||||
|
||||
if "onthemarket" in selected_sources:
|
||||
_scrape_onthemarket(
|
||||
selected_outcodes,
|
||||
pc_index,
|
||||
results,
|
||||
errors,
|
||||
max_properties_per_source,
|
||||
)
|
||||
|
||||
if "zoopla" in selected_sources:
|
||||
if pc_coords is None:
|
||||
pc_coords = build_postcode_coords()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue