perfect-postcode/pipeline/download/rightmove_outcodes.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
Split up
2026-06-12 21:51:37 +01:00

95 lines
3.4 KiB
Python

"""Fetch Rightmove outcode→ID mapping for all outcodes in postcode.parquet."""
import argparse
import json
import time
from pathlib import Path
import httpx
import polars as pl
# --- Endpoint and request identity -----------------------------------------
# Typeahead endpoint queried once per outcode.
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
# Browser-like User-Agent header sent with every typeahead request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# --- Retry policy -----------------------------------------------------------
# Seconds slept before the first retry; the delay doubles on each later retry.
BACKOFF_BASE_S = 2.0
# Total tries per outcode (the initial request plus retries).
MAX_ATTEMPTS = 4

# --- Sanity threshold -------------------------------------------------------
# A handful of outcodes are legitimately unknown to Rightmove (they never had
# listings), so a few misses are acceptable. A miss rate above this fraction
# indicates rate-limiting or blocking; the run must fail rather than write a
# mapping that has silently lost entries.
MAX_MISS_FRACTION = 0.02
def _fetch_outcode(client: httpx.Client, outcode: str) -> str | None:
    """Return the Rightmove location ID for ``outcode``.

    Queries the typeahead endpoint and scans the returned matches for one of
    type ``OUTCODE`` whose display name equals ``outcode`` (compared
    case-insensitively with spaces removed). Failed requests are retried
    with exponential backoff.

    Args:
        client: HTTP client used to issue the GET request.
        outcode: Outcode to resolve.

    Returns:
        The matching ID as a string, or ``None`` when the endpoint answered
        successfully but listed no matching outcode.

    Raises:
        RuntimeError: If all ``MAX_ATTEMPTS`` tries failed (request error,
            error status, or undecodable body). The last underlying error is
            attached as ``__cause__``.
    """
    # Normalise once; every candidate display name is compared against this.
    wanted = outcode.upper().replace(" ", "")
    last_error: Exception | None = None
    for attempt in range(MAX_ATTEMPTS):
        if attempt:
            # Sleep BACKOFF_BASE_S, then 2x, 4x, ... before each retry.
            time.sleep(BACKOFF_BASE_S * 2 ** (attempt - 1))
        try:
            resp = client.get(TYPEAHEAD_URL, params={"query": outcode, "limit": "5"})
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:  # noqa: BLE001 - retried, re-raised after cap
            last_error = e
            continue
        for match in data.get("matches", []):
            # Skip entries that are not outcodes or lack an ID instead of
            # letting a KeyError abort the whole run.
            if match.get("type") != "OUTCODE" or "id" not in match:
                continue
            display_name = str(match.get("displayName") or "")
            if display_name.upper().replace(" ", "") == wanted:
                return str(match["id"])
        # The request succeeded but nothing matched: a definitive miss.
        return None
    # Chain the cause so the original traceback is kept, not just its str().
    raise RuntimeError(
        f"Rightmove typeahead failed for {outcode}: {last_error}"
    ) from last_error
def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:
    """Resolve every outcode in the postcode file and write the ID mapping.

    Reads the ``Postcode`` column, takes the part before the first space as
    the outcode, looks each distinct outcode up via ``_fetch_outcode``, and
    writes the resulting ``{outcode: rightmove_id}`` mapping as JSON.

    Args:
        postcodes_path: Parquet file containing a ``Postcode`` column.
        output: Destination JSON path; parent directories are created.

    Raises:
        RuntimeError: If more than ``MAX_MISS_FRACTION`` of the outcodes had
            no match (nothing is written in that case), or if a lookup kept
            failing inside ``_fetch_outcode``.
    """
    df = pl.read_parquet(postcodes_path, columns=["Postcode"])
    first_parts = df["Postcode"].str.split(" ").list.first().to_list()
    # Null postcodes come back as None; drop them along with empty strings so
    # sorted() never has to compare None with str.
    outcodes = sorted({oc for oc in first_parts if oc})
    print(f"Querying Rightmove typeahead for {len(outcodes)} outcodes...")

    mapping: dict[str, str] = {}
    missed: list[str] = []
    with httpx.Client(timeout=10, headers={"User-Agent": USER_AGENT}) as client:
        for done, oc in enumerate(outcodes, start=1):
            rightmove_id = _fetch_outcode(client, oc)
            if rightmove_id is not None:
                mapping[oc] = rightmove_id
            else:
                missed.append(oc)
            # Progress line every 200 outcodes.
            if done % 200 == 0:
                print(f" {done}/{len(outcodes)} done ({len(mapping)} found)")

    if missed:
        print(f"Missed: {missed}")
    # Too many misses suggests rate-limiting or blocking rather than genuine
    # unknown outcodes; abort before touching the output file.
    if len(missed) > len(outcodes) * MAX_MISS_FRACTION:
        raise RuntimeError(
            f"{len(missed)}/{len(outcodes)} outcodes unresolved "
            f"(> {MAX_MISS_FRACTION:.0%}); refusing to write a shrunken mapping"
        )

    output.parent.mkdir(parents=True, exist_ok=True)
    with open(output, "w", encoding="utf-8") as f:
        json.dump(mapping, f, sort_keys=True)
    print(f"Wrote {output} ({len(mapping)} outcodes, {len(missed)} missed)")
def main() -> None:
    """Parse the command-line options and run the outcode ID fetch.

    Both ``--postcodes`` and ``--output`` are required and parsed as paths.
    """
    cli = argparse.ArgumentParser(description="Fetch Rightmove outcode ID mapping")
    # Both options share the same type and are mandatory; only flag and help differ.
    path_options = (
        ("--postcodes", "postcode.parquet path"),
        ("--output", "Output JSON file path"),
    )
    for flag, help_text in path_options:
        cli.add_argument(flag, type=Path, required=True, help=help_text)
    options = cli.parse_args()
    fetch_outcode_ids(options.postcodes, options.output)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()