"""Fetch Rightmove outcode→ID mapping for all outcodes in postcode.parquet.""" import argparse import json import time from pathlib import Path import httpx import polars as pl TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead" USER_AGENT = ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) MAX_ATTEMPTS = 4 BACKOFF_BASE_S = 2.0 # Outcodes Rightmove genuinely doesn't know (no listings ever) are tolerable; # more than this fraction missing means we were rate-limited or blocked and the # mapping would silently shrink, so fail the run instead of writing it. MAX_MISS_FRACTION = 0.02 def _fetch_outcode(client: httpx.Client, outcode: str) -> str | None: """Return the Rightmove location ID for an outcode, retrying transient failures with exponential backoff. Returns None only for a definitive no-match answer; raises after MAX_ATTEMPTS on persistent errors.""" last_error: Exception | None = None for attempt in range(MAX_ATTEMPTS): if attempt: time.sleep(BACKOFF_BASE_S * 2 ** (attempt - 1)) try: resp = client.get(TYPEAHEAD_URL, params={"query": outcode, "limit": "5"}) resp.raise_for_status() data = resp.json() except Exception as e: # noqa: BLE001 - retried, re-raised after cap last_error = e continue for m in data.get("matches", []): if m["type"] == "OUTCODE" and m["displayName"].upper().replace( " ", "" ) == outcode.upper().replace(" ", ""): return str(m["id"]) return None raise RuntimeError(f"Rightmove typeahead failed for {outcode}: {last_error}") def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None: df = pl.read_parquet(postcodes_path, columns=["Postcode"]) outcodes = sorted(set(df["Postcode"].str.split(" ").list.first().to_list()) - {""}) print(f"Querying Rightmove typeahead for {len(outcodes)} outcodes...") mapping: dict[str, str] = {} missed: list[str] = [] with httpx.Client(timeout=10, headers={"User-Agent": USER_AGENT}) as client: for i, oc in enumerate(outcodes): rightmove_id = _fetch_outcode(client, oc) if rightmove_id is not None: mapping[oc] = rightmove_id else: missed.append(oc) if (i + 1) % 200 == 0: print(f" {i + 1}/{len(outcodes)} done ({len(mapping)} found)") if missed: print(f"Missed: {missed}") if len(missed) > len(outcodes) * MAX_MISS_FRACTION: raise RuntimeError( f"{len(missed)}/{len(outcodes)} outcodes unresolved " f"(> {MAX_MISS_FRACTION:.0%}); refusing to write a shrunken mapping" ) output.parent.mkdir(parents=True, exist_ok=True) with open(output, "w") as f: json.dump(mapping, f, sort_keys=True) print(f"Wrote {output} ({len(mapping)} outcodes, {len(missed)} missed)") def main() -> None: parser = argparse.ArgumentParser(description="Fetch Rightmove outcode ID mapping") parser.add_argument( "--postcodes", type=Path, required=True, help="postcode.parquet path" ) parser.add_argument( "--output", type=Path, required=True, help="Output JSON file path" ) args = parser.parse_args() fetch_outcode_ids(args.postcodes, args.output) if __name__ == "__main__": main()