# perfect-postcode/pipeline/download/rightmove_outcodes.py
#
# 74 lines
# 2.1 KiB
# Python

"""Fetch Rightmove outcode→ID mapping for all outcodes in postcode.parquet."""
import argparse
import json
from pathlib import Path
import httpx
import polars as pl
# Rightmove typeahead endpoint; fetch_outcode_ids sends it `query` and `limit`
# parameters and reads the `matches` list from the JSON response.
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
def _normalise_outcode(text: str) -> str:
    """Upper-case *text* and strip spaces so outcodes compare loosely."""
    return text.upper().replace(" ", "")


def _find_outcode_id(outcode: str, payload: object) -> "str | None":
    """Return the ID of the OUTCODE match for *outcode* in a typeahead payload.

    Returns ``None`` when the payload has no usable ``matches`` list or no
    entry of type ``"OUTCODE"`` whose ``displayName`` equals *outcode*
    (ignoring case and spaces). Malformed entries are skipped rather than
    raising, so one odd response cannot abort the whole run.
    """
    if not isinstance(payload, dict):
        return None
    matches = payload.get("matches")
    if not isinstance(matches, list):
        return None

    wanted = _normalise_outcode(outcode)
    for match in matches:
        if not isinstance(match, dict) or match.get("type") != "OUTCODE":
            continue
        display_name = match.get("displayName")
        if not isinstance(display_name, str):
            continue
        if _normalise_outcode(display_name) != wanted:
            continue
        if match.get("id") is None:
            continue
        return str(match["id"])
    return None


def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:
    """Look up the Rightmove ID for every outcode and write them as JSON.

    Reads the ``Postcode`` column of the parquet file at *postcodes_path*,
    takes the text before the first space of each value as the outcode, and
    queries the Rightmove typeahead endpoint once per distinct outcode. The
    resulting ``{outcode: id}`` mapping is written to *output* as a JSON
    object with sorted keys; parent directories are created if needed.

    Lookups are best-effort: an outcode whose request fails, returns an HTTP
    error status, returns invalid JSON, or has no exact OUTCODE match is
    recorded as missed and reported at the end instead of stopping the run.

    Args:
        postcodes_path: Parquet file containing a ``Postcode`` column.
        output: Destination path for the JSON mapping.
    """
    df = pl.read_parquet(postcodes_path, columns=["Postcode"])
    first_parts = df["Postcode"].str.split(" ").list.first().to_list()
    # Drop empty strings and any None entries (null postcodes may yield None);
    # sorting a mix of None and str would raise TypeError.
    outcodes = sorted({part for part in first_parts if part})
    print(f"Querying Rightmove typeahead for {len(outcodes)} outcodes...")

    mapping: dict[str, str] = {}
    missed: list[str] = []
    # The context manager closes the connection pool even if the loop raises.
    with httpx.Client(timeout=10) as client:
        for done, oc in enumerate(outcodes, start=1):
            try:
                resp = client.get(TYPEAHEAD_URL, params={"query": oc, "limit": "5"})
                # Surface 4xx/5xx responses as errors instead of silently
                # treating their bodies as "no match".
                resp.raise_for_status()
                payload = resp.json()
            except (httpx.HTTPError, ValueError) as e:
                # httpx.HTTPError covers transport and status errors;
                # ValueError covers a body that is not valid JSON.
                missed.append(oc)
                print(f" Error for {oc}: {e}")
            else:
                rightmove_id = _find_outcode_id(oc, payload)
                if rightmove_id is None:
                    missed.append(oc)
                else:
                    mapping[oc] = rightmove_id
            if done % 200 == 0:
                print(f" {done}/{len(outcodes)} done ({len(mapping)} found)")

    output.parent.mkdir(parents=True, exist_ok=True)
    with open(output, "w", encoding="utf-8") as f:
        json.dump(mapping, f, sort_keys=True)
    print(f"Wrote {output} ({len(mapping)} outcodes, {len(missed)} missed)")
    if missed:
        print(f"Missed: {missed}")
def main() -> None:
    """Parse the command line and run the outcode ID fetch.

    Both ``--postcodes`` and ``--output`` are required path arguments; they
    are forwarded, in that order, to ``fetch_outcode_ids``.
    """
    cli = argparse.ArgumentParser(
        description="Fetch Rightmove outcode ID mapping"
    )
    # Each option is a required Path; only the flag and help text differ.
    path_options = (
        ("--postcodes", "postcode.parquet path"),
        ("--output", "Output JSON file path"),
    )
    for flag, help_text in path_options:
        cli.add_argument(flag, type=Path, required=True, help=help_text)
    options = cli.parse_args()
    fetch_outcode_ids(options.postcodes, options.output)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()