Lots of improvements
This commit is contained in:
parent
ef921361ec
commit
80a5a2a774
21 changed files with 489 additions and 337 deletions
74
pipeline/download/rightmove_outcodes.py
Normal file
74
pipeline/download/rightmove_outcodes.py
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
"""Fetch Rightmove outcode→ID mapping for all outcodes in postcode.parquet."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
|
||||
|
||||
# Rightmove typeahead endpoint. fetch_outcode_ids() issues GET requests to it
# with `query` and `limit` parameters and reads the "matches" list from the
# JSON response.
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
||||
|
||||
|
||||
def _find_outcode_id(matches, outcode: str) -> "str | None":
    """Return the ID of the OUTCODE entry in *matches* naming *outcode*.

    Comparison ignores case and spaces. Entries lacking "type" or
    "displayName" are skipped rather than aborting the whole lookup.
    Returns None when no entry matches.
    """
    wanted = outcode.upper().replace(" ", "")
    for match in matches:
        if match.get("type") != "OUTCODE":
            continue
        display = match.get("displayName") or ""
        if display.upper().replace(" ", "") == wanted:
            return str(match["id"])
    return None


def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:
    """Look up the Rightmove ID of every outcode and write them as JSON.

    The outcode is taken as the part of each ``Postcode`` value before the
    first space. Each distinct outcode is queried against ``TYPEAHEAD_URL``;
    outcodes with no matching OUTCODE entry, or whose request fails, are
    reported as missed and left out of the output.

    Args:
        postcodes_path: Parquet file containing a ``Postcode`` column.
        output: Destination JSON file (outcode -> ID, keys sorted). Parent
            directories are created if needed.

    Progress, per-outcode errors and the final list of missed outcodes are
    printed to stdout.
    """
    df = pl.read_parquet(postcodes_path, columns=["Postcode"])
    first_tokens = df["Postcode"].str.split(" ").list.first().to_list()
    # Drop empty strings AND None: null postcodes come through as None, and
    # sorted() cannot order None against str.
    outcodes = sorted({oc for oc in first_tokens if oc})
    print(f"Querying Rightmove typeahead for {len(outcodes)} outcodes...")

    mapping: dict[str, str] = {}
    missed: list[str] = []

    # Context manager guarantees the connection pool is released even if the
    # run is interrupted (e.g. Ctrl-C) part-way through.
    with httpx.Client(timeout=10) as client:
        for done, oc in enumerate(outcodes, start=1):
            try:
                resp = client.get(TYPEAHEAD_URL, params={"query": oc, "limit": "5"})
                # Surface 4xx/5xx (e.g. rate limiting) as an explicit error
                # instead of silently treating the body as "no match".
                resp.raise_for_status()
                outcode_id = _find_outcode_id(resp.json().get("matches", []), oc)
            except Exception as e:
                # Best-effort per outcode: record the failure and carry on.
                missed.append(oc)
                print(f" Error for {oc}: {e}")
            else:
                if outcode_id is None:
                    missed.append(oc)
                else:
                    mapping[oc] = outcode_id

            if done % 200 == 0:
                print(f" {done}/{len(outcodes)} done ({len(mapping)} found)")

    output.parent.mkdir(parents=True, exist_ok=True)
    with open(output, "w", encoding="utf-8") as f:
        json.dump(mapping, f, sort_keys=True)

    print(f"Wrote {output} ({len(mapping)} outcodes, {len(missed)} missed)")
    if missed:
        print(f"Missed: {missed}")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and run the outcode ID fetch.

    Both ``--postcodes`` and ``--output`` are required and parsed as paths.
    """
    cli = argparse.ArgumentParser(description="Fetch Rightmove outcode ID mapping")
    cli.add_argument(
        "--postcodes",
        required=True,
        type=Path,
        help="postcode.parquet path",
    )
    cli.add_argument(
        "--output",
        required=True,
        type=Path,
        help="Output JSON file path",
    )
    opts = cli.parse_args()
    fetch_outcode_ids(opts.postcodes, opts.output)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue