95 lines
3.4 KiB
Python
95 lines
3.4 KiB
Python
"""Fetch Rightmove outcode→ID mapping for all outcodes in postcode.parquet."""
|
|
|
|
import argparse
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import polars as pl
|
|
|
|
|
|
# Rightmove location-typeahead endpoint queried once per outcode.
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"

# Browser-style User-Agent header sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Retry policy: total tries per outcode, and the base (in seconds) of the
# exponential pause inserted before each retry.
MAX_ATTEMPTS = 4
BACKOFF_BASE_S = 2.0

# A handful of outcodes that Rightmove has never had listings for may
# legitimately be absent. A miss rate above this fraction instead points to
# rate-limiting or blocking, which would quietly shrink the mapping, so the
# run is failed rather than written.
MAX_MISS_FRACTION = 0.02
|
|
|
|
|
|
def _fetch_outcode(client: httpx.Client, outcode: str) -> str | None:
    """Look up the Rightmove location ID for a single outcode.

    The typeahead endpoint is queried up to MAX_ATTEMPTS times. A failure of
    the request itself (an exception from the GET, an error status reported
    by ``raise_for_status()``, or a body that cannot be decoded as JSON) is
    retried after a pause of ``BACKOFF_BASE_S * 2 ** (attempt - 1)`` seconds.

    Args:
        client: HTTP client used to issue the request.
        outcode: Outcode to resolve. It is compared against each match's
            display name case-insensitively and ignoring spaces.

    Returns:
        The ``id`` of the first match of type ``"OUTCODE"`` whose display
        name equals ``outcode``, converted to ``str``; ``None`` when a
        response was received but contains no such match.

    Raises:
        RuntimeError: If every attempt failed. The last underlying error is
            attached as ``__cause__`` so its traceback is not lost.
    """
    # Normalise once; the value does not change between matches or attempts.
    wanted = outcode.upper().replace(" ", "")
    last_error: Exception | None = None
    for attempt in range(MAX_ATTEMPTS):
        if attempt:
            # No pause before the first try; 1x, 2x, 4x... the base afterwards.
            time.sleep(BACKOFF_BASE_S * 2 ** (attempt - 1))
        try:
            resp = client.get(TYPEAHEAD_URL, params={"query": outcode, "limit": "5"})
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:  # noqa: BLE001 - retried, re-raised after cap
            last_error = e
            continue
        # Outside the try: an unexpected payload shape (e.g. a match missing
        # a key) propagates immediately instead of being retried.
        for m in data.get("matches", []):
            if m["type"] == "OUTCODE" and m["displayName"].upper().replace(" ", "") == wanted:
                return str(m["id"])
        return None
    # Raised outside any ``except`` block, so the cause must be chained
    # explicitly or the original exception would be reduced to its message.
    raise RuntimeError(
        f"Rightmove typeahead failed for {outcode}: {last_error}"
    ) from last_error
|
|
|
|
|
|
def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:
    """Build the outcode -> Rightmove ID mapping and write it as JSON.

    The distinct outcodes are taken as the part of each ``Postcode`` value
    before the first space. Each one is resolved via ``_fetch_outcode`` and
    the results are written to ``output`` as a JSON object with sorted keys.

    Args:
        postcodes_path: Parquet file containing a ``Postcode`` column.
        output: Destination JSON path; missing parent directories are created.

    Raises:
        RuntimeError: If more than ``MAX_MISS_FRACTION`` of the outcodes
            could not be resolved (nothing is written in that case), or if
            ``_fetch_outcode`` gives up on an outcode.
    """
    df = pl.read_parquet(postcodes_path, columns=["Postcode"])
    first_parts = df["Postcode"].str.split(" ").list.first().to_list()
    # Null postcodes surface here as None, which cannot be ordered against
    # strings; drop them together with empty strings before sorting.
    outcodes = sorted({oc for oc in first_parts if oc})
    print(f"Querying Rightmove typeahead for {len(outcodes)} outcodes...")

    mapping: dict[str, str] = {}
    missed: list[str] = []
    with httpx.Client(timeout=10, headers={"User-Agent": USER_AGENT}) as client:
        for i, oc in enumerate(outcodes):
            rightmove_id = _fetch_outcode(client, oc)
            if rightmove_id is not None:
                mapping[oc] = rightmove_id
            else:
                missed.append(oc)

            # Periodic progress report.
            if (i + 1) % 200 == 0:
                print(f"  {i + 1}/{len(outcodes)} done ({len(mapping)} found)")

    if missed:
        print(f"Missed: {missed}")
    # Checked before anything is written, so a bad run never replaces an
    # existing output file.
    if len(missed) > len(outcodes) * MAX_MISS_FRACTION:
        raise RuntimeError(
            f"{len(missed)}/{len(outcodes)} outcodes unresolved "
            f"(> {MAX_MISS_FRACTION:.0%}); refusing to write a shrunken mapping"
        )

    output.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so the file does not depend on the platform locale.
    with open(output, "w", encoding="utf-8") as f:
        json.dump(mapping, f, sort_keys=True)

    print(f"Wrote {output} ({len(mapping)} outcodes, {len(missed)} missed)")
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse the two required paths and run the fetch."""
    cli = argparse.ArgumentParser(description="Fetch Rightmove outcode ID mapping")
    # Both options are required filesystem paths; only flag and help differ.
    for flag, help_text in (
        ("--postcodes", "postcode.parquet path"),
        ("--output", "Output JSON file path"),
    ):
        cli.add_argument(flag, type=Path, required=True, help=help_text)
    options = cli.parse_args()
    fetch_outcode_ids(options.postcodes, options.output)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|