This commit is contained in:
Andras Schmelczer 2026-02-15 09:48:30 +00:00
parent 128b3191e7
commit 03445188ea
54 changed files with 596953 additions and 3577 deletions

View file

@ -1,128 +0,0 @@
---
# Continuous integration: lint, build and test the Python, frontend and Rust
# code on every push to main and on every pull request targeting main.
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  # Static checks for the Python code (ruff lint + ruff formatter).
  lint-python:
    name: Lint Python
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4
        with:
          version: "latest"

      - name: Set up Python
        run: uv python install 3.12

      - name: Install dependencies
        run: uv sync --dev

      - name: Run ruff check
        run: uv run ruff check .

      - name: Run ruff format check
        run: uv run ruff format --check .

  # ESLint, Prettier and the TypeScript compiler, all run from frontend/.
  lint-frontend:
    name: Lint Frontend
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: frontend
    steps:
      - uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "npm"
          cache-dependency-path: frontend/package-lock.json

      - name: Install dependencies
        run: npm ci

      - name: Run ESLint
        run: npm run lint

      - name: Run Prettier check
        run: npm run format:check

      - name: Run TypeScript check
        run: npm run typecheck

  # Production build of the frontend; only runs once linting has passed.
  build-frontend:
    name: Build Frontend
    runs-on: ubuntu-latest
    needs: [lint-frontend]
    defaults:
      run:
        working-directory: frontend
    steps:
      - uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "npm"
          cache-dependency-path: frontend/package-lock.json

      - name: Install dependencies
        run: npm ci

      - name: Build
        run: npm run build

  # clippy (warnings are errors) and rustfmt for the Rust server.
  lint-rust:
    name: Lint Rust
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: server-rs
    steps:
      - uses: actions/checkout@v4

      - name: Install Rust toolchain
        uses: dtolnay/rust-toolchain@stable
        with:
          components: clippy, rustfmt

      - name: Cache cargo
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: server-rs

      - name: Run clippy
        run: cargo clippy -- -D warnings

      - name: Check formatting
        run: cargo fmt --check

  # Rust unit/integration tests; only runs once linting has passed.
  test-rust:
    name: Test Rust
    runs-on: ubuntu-latest
    needs: [lint-rust]
    defaults:
      run:
        working-directory: server-rs
    steps:
      - uses: actions/checkout@v4

      - name: Install Rust toolchain
        uses: dtolnay/rust-toolchain@stable

      - name: Cache cargo
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: server-rs

      - name: Run tests
        run: cargo test

View file

@ -1,49 +0,0 @@
---
# Builds the repository's Docker image on every push to main and publishes it
# to the GitHub Container Registry.
name: Docker

on:
  push:
    branches: [main]

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    # GITHUB_TOKEN scopes: read the repository, write packages (image push).
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Produces the tag list (latest + short commit SHA) and OCI labels.
      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=raw,value=latest
            type=sha,prefix=sha-,format=short

      # Layer cache is stored in the GitHub Actions cache backend.
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

View file

@ -1,28 +0,0 @@
---
# Runs the ruff linter and formatter check on pushes to main and on pull
# requests targeting main.
name: Lint

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install

      - name: Install dependencies
        run: uv sync

      - name: Check linting
        run: uv run ruff check .

      - name: Check formatting
        run: uv run ruff format --check .

594829
analyses/rightmove_buy.ipynb Normal file

File diff suppressed because one or more lines are too long

View file

@ -31,7 +31,7 @@ services:
OLLAMA_URL: http://host.docker.internal:11434
OLLAMA_MODEL: gpt-oss:20b
PUBLIC_URL: https://perfectpostcodes.schmelczer.dev
R5_URL: http://r5:8003
# Secret: never commit the literal key. Compose substitutes it from the host
# environment or an untracked .env file and aborts with this message if unset.
# The key that was previously committed here must be revoked and rotated.
GOOGLE_MAPS_API_KEY: "${GOOGLE_MAPS_API_KEY:?GOOGLE_MAPS_API_KEY must be set}"
depends_on:
pocketbase:
@ -141,27 +141,6 @@ services:
condition: service_healthy
restart: unless-stopped
r5:
  # Run an init process as PID 1 inside the container.
  init: true
  build: ./r5-java
  ports:
    # host:container — the service listens on 8003 inside the container.
    - "8004:8003"
  networks:
    - dev-network
  volumes:
    - r5-network:/data/network
    - ./property-data/transit:/data/transit:ro
    - ./property-data/transit/raw:/data/transit-raw:ro
  environment:
    DATA_DIR: /data/transit
    OSM_DIR: /data/transit-raw
    NETWORK_CACHE_DIR: /data/network
  healthcheck:
    test: ["CMD", "curl", "-f", "http://localhost:8003/health"]
    interval: 10s
    timeout: 5s
    retries: 5
    # Failures during the first 5 minutes do not count towards `retries`.
    start_period: 300s
volumes:
pb-data:
@ -169,7 +148,6 @@ volumes:
cargo-target:
frontend-node-modules:
screenshot-cache:
r5-network:
gluetun-cache-v2:
gluetun-auth:

View file

@ -6,6 +6,6 @@ WORKDIR /app
COPY pyproject.toml ./
RUN uv pip install --system -r pyproject.toml
COPY main.py ./
COPY *.py ./
CMD ["python3", "main.py"]

56
finder/constants.py Normal file
View file

@ -0,0 +1,56 @@
import os
from pathlib import Path
# Path of the postcode parquet file; overridable through the ARCGIS_PATH
# environment variable.
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
# Directory for scraper output files.
DATA_DIR = Path("/app/data")
# Step by which the search `index` parameter advances from one page to the next.
PAGE_SIZE = 24
# Pause lengths in seconds (passed to time.sleep).
DELAY_BETWEEN_PAGES = 1.0
DELAY_BETWEEN_OUTCODES = 2.0
# fetch_with_retry makes at most MAX_RETRIES attempts, backing off
# RETRY_BASE_DELAY * 2**attempt seconds plus up to 1 s of random jitter.
MAX_RETRIES = 3
RETRY_BASE_DELAY = 2.0
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
# Fixed seed for random ordering — NOTE(review): presumably the outcode shuffle; confirm in scraper.py.
SEED = 42
# Rightmove endpoints: typeahead resolves an outcode to Rightmove's internal
# ID, search returns the listings for a location identifier.
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
# Maps a property sub-type string to one of the canonical types
# "Detached", "Semi-Detached", "Terraced", "Flat" or "Other".
# NOTE(review): keys presumably mirror Rightmove's `propertySubType` values —
# confirm against the consumer in transform.py.
PROPERTY_TYPE_MAP = {
"Detached": "Detached",
"Semi-Detached": "Semi-Detached",
"Terraced": "Terraced",
"End of Terrace": "Terraced",
"Mid Terrace": "Terraced",
"Flat": "Flat",
"Maisonette": "Flat",
"Studio": "Flat",
"Apartment": "Flat",
"Penthouse": "Flat",
"Ground Flat": "Flat",
"Detached Bungalow": "Detached",
"Semi-Detached Bungalow": "Semi-Detached",
"Town House": "Terraced",
"Link Detached": "Detached",
"Link Detached House": "Detached",
"Bungalow": "Other",
"Cottage": "Other",
"Park Home": "Other",
"Land": "Other",
"Farm / Barn": "Other",
"House": "Detached",
"Not Specified": "Other",
"Chalet": "Other",
"Barn Conversion": "Other",
"Coach House": "Other",
"Character Property": "Other",
"Cluster House": "Other",
"Retirement Property": "Flat",
"Plot": "Other",
"Garages": "Other",
"Mews": "Terraced",
}
# One entry per scraped channel; search_outcode forwards the three values
# verbatim as the `channel`, `transactionType` and `sortType` query parameters.
CHANNELS = [
{"channel": "BUY", "transactionType": "BUY", "sortType": "2"},
{"channel": "RENT", "transactionType": "LETTING", "sortType": "6"},
]

126
finder/http_client.py Normal file
View file

@ -0,0 +1,126 @@
import logging
import random
import threading
import time
import httpx
from fake_useragent import UserAgent
from constants import MAX_RETRIES, RETRY_BASE_DELAY
from metrics import http_errors_total, http_requests_total, ip_rotations_total
log = logging.getLogger("rightmove")
_ua = UserAgent(browsers=["Chrome", "Edge"], os=["Windows", "Mac OS X"], min_version=120.0)
def _endpoint_label(url: str) -> str:
if "typeahead" in url:
return "typeahead"
if "search" in url:
return "search"
return "other"
def _status_label(code: int) -> str:
if code >= 500:
return "5xx"
return str(code)
# Gluetun control API — runs on port 8000 inside the gluetun container.
# Since finder uses network_mode: service:gluetun, localhost IS gluetun.
GLUETUN_API = "http://127.0.0.1:8000"
_ip_rotate_lock = threading.Lock()
def rotate_ip() -> bool:
    """Restart the gluetun VPN connection and wait for a new public IP.

    The VPN is stopped and started again through the gluetun control API, then
    the public-IP endpoint is polled (up to 30 times, 2 s apart) until it
    reports an address different from the one seen before the restart. Calls
    are serialized by ``_ip_rotate_lock``.

    Returns:
        True if a different public IP was observed; False if a control-API
        call failed or the IP had not changed when polling ended. Every
        outcome is counted in ``ip_rotations_total`` with
        ``result="success"`` or ``result="failure"``.
    """
    with _ip_rotate_lock:
        log.info("Rotating VPN IP via gluetun...")
        try:
            with httpx.Client(timeout=10) as ctl:
                # Remember the current IP so a change can be detected afterwards.
                old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
                old_ip = old_ip_resp.json().get("public_ip", "unknown") if old_ip_resp.status_code == 200 else "unknown"
                log.info("Current IP: %s", old_ip)

                # Trigger a reconnect: stop the VPN, pause briefly, start it again.
                resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"})
                if resp.status_code != 200:
                    log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text)
                    ip_rotations_total.labels(result="failure").inc()
                    return False

                time.sleep(2)
                resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"})
                if resp.status_code != 200:
                    log.error("Failed to start VPN: %d %s", resp.status_code, resp.text)
                    ip_rotations_total.labels(result="failure").inc()
                    return False

            # Wait for reconnection
            for _ in range(30):
                time.sleep(2)
                try:
                    with httpx.Client(timeout=10) as poll:
                        new_ip_resp = poll.get(f"{GLUETUN_API}/v1/publicip/ip")
                        if new_ip_resp.status_code == 200:
                            new_ip = new_ip_resp.json().get("public_ip", "")
                            if new_ip and new_ip != old_ip:
                                log.info("IP rotated: %s → %s", old_ip, new_ip)
                                ip_rotations_total.labels(result="success").inc()
                                return True
                except Exception as e:
                    # Best effort: errors are expected while the VPN reconnects.
                    log.debug("Public IP not available yet: %s", e)

            log.warning("IP rotation timed out (may still be same IP)")
            ip_rotations_total.labels(result="failure").inc()
            return False
        except Exception as e:
            log.error("IP rotation failed: %s", e)
            ip_rotations_total.labels(result="failure").inc()
            return False
def make_client() -> httpx.Client:
    """Create the HTTP client used for scraping.

    The client has a 30 s timeout, follows redirects, asks for JSON and sends
    a User-Agent drawn from ``_ua`` once, at creation time.
    """
    default_headers = {"User-Agent": _ua.random, "Accept": "application/json"}
    return httpx.Client(timeout=30, headers=default_headers, follow_redirects=True)
def _sleep_before_retry(what: str, url: str, attempt: int) -> None:
    """Log a retryable failure and back off, unless this was the final attempt."""
    if attempt + 1 >= MAX_RETRIES:
        # No further attempt follows, so sleeping would only delay giving up.
        log.warning("%s from %s, attempt %d/%d failed", what, url, attempt + 1, MAX_RETRIES)
        return
    # Exponential backoff with up to one second of jitter.
    delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
    log.warning("%s from %s, retry %d/%d in %.1fs", what, url, attempt + 1, MAX_RETRIES, delay)
    time.sleep(delay)


def fetch_with_retry(
    client: httpx.Client, url: str, params: dict | None = None, on_403: bool = True
) -> dict | None:
    """GET ``url`` and return the decoded JSON body.

    Up to ``MAX_RETRIES`` attempts are made. HTTP 429/500/502/503/504 and
    connect/read/write/pool timeouts are retried with exponential backoff;
    any other non-200 status fails immediately.

    Args:
        client: HTTP client used for the request.
        url: Absolute URL to fetch.
        params: Optional query parameters.
        on_403: When True, an HTTP 403 triggers one IP rotation followed by a
            fresh call with ``on_403=False``; when False, a 403 is treated as
            non-retryable.

    Returns:
        The parsed JSON on HTTP 200, or None on permanent failure (retries
        exhausted, non-retryable status, or a failed IP rotation).
    """
    endpoint = _endpoint_label(url)
    for attempt in range(MAX_RETRIES):
        try:
            resp = client.get(url, params=params)
            http_requests_total.labels(status=_status_label(resp.status_code), endpoint=endpoint).inc()
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code == 403 and on_403:
                log.warning("HTTP 403 — IP likely blocked, rotating...")
                if rotate_ip():
                    # Retry once with new IP (but don't recurse on 403 again)
                    return fetch_with_retry(client, url, params, on_403=False)
                log.error("IP rotation failed, giving up on %s", url)
                return None
            if resp.status_code in (429, 500, 502, 503, 504):
                _sleep_before_retry(f"HTTP {resp.status_code}", url, attempt)
                continue
            log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
            return None
        except (httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout, httpx.PoolTimeout) as e:
            http_errors_total.labels(type=type(e).__name__).inc()
            _sleep_before_retry(type(e).__name__, url, attempt)
    http_errors_total.labels(type="retry_exhausted").inc()
    log.error("All %d retries exhausted for %s", MAX_RETRIES, url)
    return None

View file

@ -1,17 +1,21 @@
import logging
import math
import os
import random
import re
import threading
import time
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
import httpx
import polars as pl
from flask import Flask, jsonify, send_from_directory
from flask import Flask, Response, jsonify, send_from_directory
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
from constants import DATA_DIR
from rightmove import outcode_cache
from scraper import (
_sync_gauges,
build_postcode_index,
load_outcodes,
run_scrape,
status,
status_lock,
)
# ---------------------------------------------------------------------------
# Logging
@ -33,615 +37,6 @@ log.setLevel(logging.DEBUG)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
DATA_DIR = Path("/app/data")
PAGE_SIZE = 24
MAX_PAGES_PER_OUTCODE = 42 # 24*42 = 1008, safety cap per outcode
DELAY_BETWEEN_PAGES = 1.0
DELAY_BETWEEN_OUTCODES = 2.0
MAX_RETRIES = 3
RETRY_BASE_DELAY = 2.0
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
SEED = 42
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
PROPERTY_TYPE_MAP = {
"Detached": "Detached",
"Semi-Detached": "Semi-Detached",
"Terraced": "Terraced",
"End of Terrace": "Terraced",
"Mid Terrace": "Terraced",
"Flat": "Flat",
"Maisonette": "Flat",
"Studio": "Flat",
"Apartment": "Flat",
"Penthouse": "Flat",
"Ground Flat": "Flat",
"Detached Bungalow": "Detached",
"Semi-Detached Bungalow": "Semi-Detached",
"Town House": "Terraced",
"Link Detached": "Detached",
"Link Detached House": "Detached",
"Bungalow": "Other",
"Cottage": "Other",
"Park Home": "Other",
"Land": "Other",
"Farm / Barn": "Other",
"House": "Detached",
"Not Specified": "Other",
"Chalet": "Other",
"Barn Conversion": "Other",
"Coach House": "Other",
"Character Property": "Other",
"Cluster House": "Other",
"Retirement Property": "Flat",
"Plot": "Other",
"Garages": "Other",
"Mews": "Terraced",
}
CHANNELS = [
{"channel": "BUY", "transactionType": "BUY", "sortType": "2"},
{"channel": "RENT", "transactionType": "LETTING", "sortType": "6"},
]
# ---------------------------------------------------------------------------
# Postcode spatial index
# ---------------------------------------------------------------------------
class PostcodeSpatialIndex:
    """Grid-based spatial index over arcgis postcodes for nearest-lookup.

    Postcodes are bucketed into square cells of ``GRID_CELL_SIZE`` degrees.
    A lookup only inspects the query's cell and its eight neighbours, so a
    point with no postcode in that 3x3 block yields None.
    """

    def __init__(self, lats: list[float], lngs: list[float], postcodes: list[str]):
        """Bucket every (lat, lng, postcode) triple into its grid cell."""
        self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(list)
        for lat, lng, pcd in zip(lats, lngs, postcodes):
            self.grid[self._cell(lat, lng)].append((lat, lng, pcd))
        log.info("Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats))

    @staticmethod
    def _cell(lat: float, lng: float) -> tuple[int, int]:
        """Return the (x, y) grid cell containing the given coordinate."""
        return int(math.floor(lng / GRID_CELL_SIZE)), int(math.floor(lat / GRID_CELL_SIZE))

    def nearest(self, lat: float, lng: float) -> str | None:
        """Return the postcode closest to (lat, lng), or None if none is nearby.

        Longitude differences are scaled by cos(latitude) so that east-west
        and north-south offsets are compared as ground distance rather than
        raw degrees (a degree of longitude is shorter than a degree of
        latitude away from the equator).
        """
        gx, gy = self._cell(lat, lng)
        lng_scale = math.cos(math.radians(lat))
        best_dist = float("inf")
        best_pcd = None
        for dx in range(-1, 2):
            for dy in range(-1, 2):
                # .get() avoids creating empty buckets in the defaultdict.
                for plat, plng, pcd in self.grid.get((gx + dx, gy + dy), ()):
                    d = (plat - lat) ** 2 + ((plng - lng) * lng_scale) ** 2
                    if d < best_dist:
                        best_dist = d
                        best_pcd = pcd
        return best_pcd
# ---------------------------------------------------------------------------
# Scrape status
# ---------------------------------------------------------------------------
# Mutable progress record for one scrape run. run_scrape updates it while
# holding status_lock.
@dataclass
class ScrapeStatus:
state: str = "idle" # idle | running | done | error
# Channel ("BUY"/"RENT") and outcode currently being processed.
channel: str = ""
outcode: str = ""
# Progress within the current channel; reset to 0 when a channel starts.
outcodes_done: int = 0
outcodes_total: int = 0
# Number of unique properties collected per channel.
properties_buy: int = 0
properties_rent: int = 0
# Human-readable error messages collected during the run.
errors: list[str] = field(default_factory=list)
# time.time() timestamps; 0.0 until set.
started_at: float = 0.0
finished_at: float = 0.0
status = ScrapeStatus()
status_lock = threading.Lock()
debug_data: dict = {"last_response": None, "outcode_cache": {}}
# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------
USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)
# Gluetun control API — runs on port 8000 inside the gluetun container.
# Since finder uses network_mode: service:gluetun, localhost IS gluetun.
GLUETUN_API = "http://127.0.0.1:8000"
_ip_rotate_lock = threading.Lock()
def rotate_ip() -> bool:
"""Ask gluetun to reconnect to a different VPN server, getting a new IP.
Returns True if the IP changed successfully."""
with _ip_rotate_lock:
log.info("Rotating VPN IP via gluetun...")
try:
# Get current IP
with httpx.Client(timeout=10) as ctl:
old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
old_ip = old_ip_resp.json().get("public_ip", "unknown") if old_ip_resp.status_code == 200 else "unknown"
log.info("Current IP: %s", old_ip)
# Trigger server change — PUT with empty JSON body picks a random server
resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"})
if resp.status_code != 200:
log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text)
return False
time.sleep(2)
resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"})
if resp.status_code != 200:
log.error("Failed to start VPN: %d %s", resp.status_code, resp.text)
return False
# Wait for reconnection
for _ in range(30):
time.sleep(2)
try:
with httpx.Client(timeout=10) as ctl:
new_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
if new_ip_resp.status_code == 200:
new_ip = new_ip_resp.json().get("public_ip", "")
if new_ip and new_ip != old_ip:
log.info("IP rotated: %s%s", old_ip, new_ip)
return True
except Exception:
pass # VPN still reconnecting
log.warning("IP rotation timed out (may still be same IP)")
return False
except Exception as e:
log.error("IP rotation failed: %s", e)
return False
def make_client() -> httpx.Client:
return httpx.Client(
timeout=30,
headers={"User-Agent": USER_AGENT, "Accept": "application/json"},
follow_redirects=True,
)
def fetch_with_retry(
client: httpx.Client, url: str, params: dict | None = None, on_403: bool = True
) -> dict | None:
"""GET JSON with retries on 429/5xx/connection errors. Returns None on permanent failure.
On 403, triggers IP rotation and retries once."""
for attempt in range(MAX_RETRIES):
try:
resp = client.get(url, params=params)
if resp.status_code == 200:
return resp.json()
if resp.status_code == 403 and on_403:
log.warning("HTTP 403 — IP likely blocked, rotating...")
if rotate_ip():
# Retry once with new IP (but don't recurse on 403 again)
return fetch_with_retry(client, url, params, on_403=False)
log.error("IP rotation failed, giving up on %s", url)
return None
if resp.status_code in (429, 500, 502, 503, 504):
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
log.warning("HTTP %d from %s, retry %d/%d in %.1fs", resp.status_code, url, attempt + 1, MAX_RETRIES, delay)
time.sleep(delay)
continue
log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
return None
except (httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout, httpx.PoolTimeout) as e:
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
log.warning("%s from %s, retry %d/%d in %.1fs", type(e).__name__, url, attempt + 1, MAX_RETRIES, delay)
time.sleep(delay)
log.error("All %d retries exhausted for %s", MAX_RETRIES, url)
return None
# ---------------------------------------------------------------------------
# Rightmove API
# ---------------------------------------------------------------------------
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
if outcode in debug_data["outcode_cache"]:
return debug_data["outcode_cache"][outcode]
data = fetch_with_retry(client, TYPEAHEAD_URL, {"query": outcode, "limit": "10", "exclude": "STREET"})
if not data:
return None
for match in data.get("matches", []):
if match.get("type") == "OUTCODE" and match.get("displayName") == outcode:
rid = str(match["id"])
debug_data["outcode_cache"][outcode] = rid
return rid
log.debug("Outcode %s not found in typeahead results", outcode)
return None
def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Paginate through search results for one outcode+channel.

    Pages are requested in steps of ``PAGE_SIZE`` until a request fails, a
    page comes back empty, the reported ``resultCount`` is reached (or cannot
    be parsed), or ``MAX_PAGES_PER_OUTCODE`` pages were fetched.

    Args:
        client: HTTP client used for the requests.
        outcode_id: Rightmove's internal ID for the outcode.
        outcode: The outcode itself; used for logging and stored on each row.
        channel_cfg: Entry of ``CHANNELS`` providing ``channel``,
            ``transactionType`` and ``sortType``.
        pc_index: Spatial index handed to ``transform_property``.

    Returns:
        The transformed properties gathered so far; properties that
        ``transform_property`` rejects are skipped.
    """
    properties = []
    index = 0

    for page in range(MAX_PAGES_PER_OUTCODE):
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }
        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning("Failed to fetch page %d for %s/%s", page, outcode, channel_cfg["channel"])
            break

        debug_data["last_response"] = data

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        for prop in raw_props:
            transformed = transform_property(prop, outcode, pc_index)
            if transformed:
                properties.append(transformed)

        # Check if there are more pages. resultCount is normally a string such
        # as "1,234"; coerce defensively so an unexpected value ends pagination
        # instead of raising and losing the pages already collected.
        try:
            result_count = int(str(data.get("resultCount", "0")).replace(",", ""))
        except ValueError:
            log.warning(
                "Unparseable resultCount %r for %s/%s, stopping pagination",
                data.get("resultCount"), outcode, channel_cfg["channel"],
            )
            break
        index += PAGE_SIZE
        if index >= result_count:
            break

        if page < MAX_PAGES_PER_OUTCODE - 1:
            time.sleep(DELAY_BETWEEN_PAGES)

    return properties
# ---------------------------------------------------------------------------
# Property transformation
# ---------------------------------------------------------------------------
def parse_display_size(display_size: str | None) -> float | None:
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
if not display_size:
return None
# Try sq. ft. first
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
if m:
sqft = float(m.group(1).replace(",", ""))
return round(sqft * 0.092903, 1)
# Try sq. m.
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
if m:
return round(float(m.group(1).replace(",", "")), 1)
return None
def map_property_type(sub_type: str | None) -> str:
"""Map propertySubType to canonical type."""
if not sub_type:
return "Other"
canonical = PROPERTY_TYPE_MAP.get(sub_type)
if canonical:
return canonical
log.warning("Unknown propertySubType: %r — mapping to Other", sub_type)
return "Other"
def extract_tenure(tenure_obj: dict | None) -> str | None:
"""Extract tenure string from tenure object."""
if not tenure_obj:
return None
tt = tenure_obj.get("tenureType", "")
if tt == "FREEHOLD":
return "Freehold"
if tt == "LEASEHOLD":
return "Leasehold"
return None
def fix_coords(lat: float, lng: float) -> tuple[float, float]:
    """Return (lat, lng), swapping the pair if it looks reversed.

    The expected box is latitude 49 to 56 and longitude -7 to 2. A pair
    already inside the box is returned unchanged; a pair that only fits once
    swapped is returned swapped; anything else is returned unchanged after a
    warning.
    """

    def _in_bounds(latitude: float, longitude: float) -> bool:
        return 49 <= latitude <= 56 and -7 <= longitude <= 2

    if _in_bounds(lat, lng):
        return lat, lng
    if _in_bounds(lng, lat):
        log.debug("Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f", lat, lng, lng, lat)
        return lng, lat
    log.warning("Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f", lat, lng)
    return lat, lng
def normalize_price(amount: int, frequency: str) -> int:
"""Normalize price to monthly for rentals (weekly × 52/12, yearly ÷ 12)."""
if frequency == "weekly":
return round(amount * 52 / 12)
if frequency == "yearly":
return round(amount / 12)
return amount
def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex) -> dict | None:
    """Transform a raw Rightmove property dict into our output schema.

    Nested fields that are absent *or* explicitly null in the JSON (``price``,
    ``displayPrices``, ``keyFeatures``, ``listingUpdate``, ``propertyUrl``)
    are treated as empty, so a single sparse listing cannot raise and abort
    the whole outcode.

    Args:
        prop: One element of the search response's ``properties`` list.
        outcode: Outcode the listing was found under; copied into the row.
        pc_index: Spatial index used to assign the nearest postcode.

    Returns:
        A dict with the columns written by ``write_parquet``, or None when
        the listing has no location, no coordinates or no price amount.
    """
    loc = prop.get("location")
    if not loc:
        return None

    raw_lat = loc.get("latitude")
    raw_lng = loc.get("longitude")
    if raw_lat is None or raw_lng is None:
        return None

    lat, lng = fix_coords(raw_lat, raw_lng)

    price_obj = prop.get("price") or {}
    amount = price_obj.get("amount")
    if amount is None:
        return None

    frequency = price_obj.get("frequency", "")
    price = normalize_price(int(amount), frequency)

    display_prices = price_obj.get("displayPrices") or []
    price_qualifier = display_prices[0].get("displayPriceQualifier", "") if display_prices else ""

    sub_type = prop.get("propertySubType", "")
    # `or 0` also covers an explicit null in the payload.
    bedrooms = prop.get("bedrooms", 0) or 0
    bathrooms = prop.get("bathrooms", 0) or 0

    # Keep only features that carry a non-empty description.
    key_features = [
        kf.get("description", "")
        for kf in (prop.get("keyFeatures") or [])
        if kf.get("description")
    ]

    listing_update = prop.get("listingUpdate") or {}
    update_date = listing_update.get("listingUpdateDate", "")

    postcode = pc_index.nearest(lat, lng)

    return {
        "id": prop.get("id"),
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "total_rooms": bedrooms + bathrooms,
        "longitude": lng,
        "latitude": lat,
        "postcode": postcode,
        "address": prop.get("displayAddress", ""),
        "tenure": extract_tenure(prop.get("tenure")),
        "property_type": map_property_type(sub_type),
        "property_sub_type": sub_type or "Unknown",
        "price": price,
        "price_frequency": frequency,
        "price_qualifier": price_qualifier,
        "floorspace_sqm": parse_display_size(prop.get("displaySize")),
        "url": RIGHTMOVE_BASE + (prop.get("propertyUrl") or ""),
        "features": key_features,
        "first_visible_date": prop.get("firstVisibleDate", ""),
        "update_date": update_date,
        "outcode": outcode,
        "house_share": sub_type == "House Share",
    }
# ---------------------------------------------------------------------------
# Parquet writing
# ---------------------------------------------------------------------------
def write_parquet(properties: list[dict], path: Path) -> None:
    """Persist the scraped properties as a parquet file at ``path``.

    Each key of the schema below becomes one column, filled from the
    same-named key of every property dict. Nothing is written (only a warning
    is logged) when ``properties`` is empty.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    # Column name -> Polars dtype, in output column order.
    column_types = {
        "id": pl.Int64,
        "bedrooms": pl.Int32,
        "bathrooms": pl.Int32,
        "total_rooms": pl.Int32,
        "longitude": pl.Float64,
        "latitude": pl.Float64,
        "postcode": pl.Utf8,
        "address": pl.Utf8,
        "tenure": pl.Utf8,
        "property_type": pl.Utf8,
        "property_sub_type": pl.Utf8,
        "price": pl.Int64,
        "price_frequency": pl.Utf8,
        "price_qualifier": pl.Utf8,
        "floorspace_sqm": pl.Float64,
        "url": pl.Utf8,
        "features": pl.List(pl.Utf8),
        "first_visible_date": pl.Utf8,
        "update_date": pl.Utf8,
        "outcode": pl.Utf8,
        "house_share": pl.Boolean,
    }
    columns = {name: [row[name] for row in properties] for name in column_types}

    frame = pl.DataFrame(columns, schema=column_types)
    frame.write_parquet(path)
    log.info("Wrote %d properties to %s", len(frame), path)
# ---------------------------------------------------------------------------
# Scrape orchestration
# ---------------------------------------------------------------------------
def load_outcodes() -> list[str]:
    """Load the sorted, unique England-only outcodes from the arcgis parquet.

    Rows are restricted to country code ``E92000001``; the outcode is the
    leading part of ``pcd`` matched by ``^([A-Z]{1,2}\\d[A-Z0-9]?)``, and
    postcodes that do not match are dropped.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    # Only the postcode and country columns are needed here; the coordinate
    # columns are left on disk.
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry"])
    england = df.filter(pl.col("ctry") == "E92000001")
    log.info("England postcodes: %d", len(england))

    outcodes = (
        england.select(pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode"))
        .drop_nulls()
        .get_column("outcode")
        .unique()
        .sort()
        .to_list()
    )
    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes
def build_postcode_index() -> PostcodeSpatialIndex:
    """Create the postcode spatial index from the arcgis parquet.

    Only rows with country code ``E92000001`` and non-null ``lat``/``long``
    are indexed.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    frame = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    is_england = pl.col("ctry") == "E92000001"
    located = frame.filter(is_england).drop_nulls(subset=["lat", "long"])

    lats = located.get_column("lat").to_list()
    lngs = located.get_column("long").to_list()
    pcds = located.get_column("pcd").to_list()
    return PostcodeSpatialIndex(lats, lngs, pcds)
def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Main scrape loop — runs in background thread.

    For every channel in ``CHANNELS`` all outcodes are visited in a fixed
    pseudo-random order, their listings are de-duplicated by property id and
    written to ``DATA_DIR / rightmove_<buy|rent>.parquet``. Progress and
    errors are published through the module-level ``status`` object, always
    under ``status_lock``.

    Args:
        outcodes: Outcodes to scrape; the list itself is not modified.
        pc_index: Spatial index used to assign postcodes to listings.

    A failure for a single outcode is recorded in ``status.errors`` and the
    loop moves on; any other exception sets ``status.state`` to "error". The
    HTTP client is always closed.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0

    # Shuffle for geographic diversity. A private Random instance yields the
    # same deterministic order as seeding the module-level generator, without
    # also making the retry jitter drawn from `random` predictable.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)

    client = make_client()

    try:
        for channel_cfg in CHANNELS:
            channel_name = channel_cfg["channel"]
            file_suffix = "buy" if channel_name == "BUY" else "rent"
            all_properties: dict[int, dict] = {}  # dedup by id

            with status_lock:
                status.channel = channel_name
                status.outcodes_done = 0
                status.outcodes_total = len(shuffled)

            log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))

            for i, outcode in enumerate(shuffled):
                with status_lock:
                    status.outcode = outcode
                    status.outcodes_done = i

                log.debug("Outcode %s (%d/%d) — %d properties so far",
                          outcode, i + 1, len(shuffled), len(all_properties))

                try:
                    outcode_id = resolve_outcode_id(client, outcode)
                    if not outcode_id:
                        log.debug("No Rightmove ID for outcode %s, skipping", outcode)
                        continue

                    props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
                    for p in props:
                        pid = p["id"]
                        # First occurrence wins when a listing shows up under several outcodes.
                        if pid not in all_properties:
                            all_properties[pid] = p

                    with status_lock:
                        if channel_name == "BUY":
                            status.properties_buy = len(all_properties)
                        else:
                            status.properties_rent = len(all_properties)

                    log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))

                except Exception as e:
                    # Per-outcode failures are recorded but must not stop the run.
                    msg = f"Error scraping {outcode}/{channel_name}: {e}"
                    log.error(msg)
                    with status_lock:
                        status.errors.append(msg)

                if i < len(shuffled) - 1:
                    time.sleep(DELAY_BETWEEN_OUTCODES)

            # Write parquet
            deduped = list(all_properties.values())
            output_path = DATA_DIR / f"rightmove_{file_suffix}.parquet"
            write_parquet(deduped, output_path)

            with status_lock:
                if channel_name == "BUY":
                    status.properties_buy = len(deduped)
                else:
                    status.properties_rent = len(deduped)
                status.outcodes_done = len(shuffled)

            log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))

        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            elapsed = status.finished_at - status.started_at
            log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
                     elapsed, status.properties_buy, status.properties_rent)

    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
    finally:
        client.close()
# ---------------------------------------------------------------------------
# Startup: load data
# ---------------------------------------------------------------------------
@ -693,12 +88,18 @@ def get_status():
@app.route("/debug")
def get_debug():
return jsonify({
"last_response": debug_data["last_response"],
"outcode_cache_size": len(debug_data["outcode_cache"]),
"outcode_cache_sample": dict(list(debug_data["outcode_cache"].items())[:20]),
"outcode_cache_size": len(outcode_cache),
"outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
})
# Prometheus scrape endpoint. The gauges are refreshed via _sync_gauges()
# while status_lock is held, then the registry is rendered with
# generate_latest() and served with the Prometheus content type.
@app.route("/metrics")
def metrics():
with status_lock:
_sync_gauges()
return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST)
@app.route("/data/<filename>")
def serve_data(filename):
if not filename.endswith(".parquet"):

59
finder/metrics.py Normal file
View file

@ -0,0 +1,59 @@
from prometheus_client import Counter, Gauge
# ---------------------------------------------------------------------------
# Gauges — current scrape state, updated after each outcode
# ---------------------------------------------------------------------------
# One time series per value of the `state` label; the active one is set to 1.
scrape_state = Gauge(
"scrape_state",
"Current scrape state as a labeled gauge (1 = active)",
["state"],
)
# Progress counters for the channel currently being scraped.
scrape_outcodes_done = Gauge(
"scrape_outcodes_done",
"Outcodes processed in current channel",
)
scrape_outcodes_total = Gauge(
"scrape_outcodes_total",
"Total outcodes in current channel",
)
# One time series per `channel` label value.
scrape_properties_total = Gauge(
"scrape_properties_total",
"Properties found so far",
["channel"],
)
scrape_elapsed_seconds = Gauge(
"scrape_elapsed_seconds",
"Seconds since scrape started",
)
# ---------------------------------------------------------------------------
# Counters — monotonically increasing
# ---------------------------------------------------------------------------
# Incremented by fetch_with_retry once per response received; `status` is the
# exact code, or "5xx" for codes >= 500, and `endpoint` is
# "typeahead", "search" or "other".
http_requests_total = Counter(
"http_requests_total",
"HTTP requests made by the scraper",
["status", "endpoint"],
)
# `type` is the httpx exception class name, or "retry_exhausted" when
# fetch_with_retry gives up.
http_errors_total = Counter(
"http_errors_total",
"HTTP connection/timeout errors",
["type"],
)
# `result` is "success" or "failure"; incremented by rotate_ip.
ip_rotations_total = Counter(
"ip_rotations_total",
"VPN IP rotation attempts",
["result"],
)
# NOTE(review): presumably incremented by the scrape loop in scraper.py — confirm.
scrape_errors_total = Counter(
"scrape_errors_total",
"Per-outcode scrape errors",
)

View file

@ -6,4 +6,6 @@ dependencies = [
"flask",
"httpx",
"polars",
"fake-useragent>=2.2.0",
"prometheus-client",
]

86
finder/rightmove.py Normal file
View file

@ -0,0 +1,86 @@
import logging
import time
import httpx
from constants import (
PAGE_SIZE,
DELAY_BETWEEN_PAGES,
SEARCH_URL,
TYPEAHEAD_URL,
)
from http_client import fetch_with_retry
from spatial import PostcodeSpatialIndex
from transform import transform_property
log = logging.getLogger("rightmove")
# Memo of outcode → Rightmove internal location ID, filled lazily on successful lookups.
outcode_cache: dict[str, str] = {}


def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Return Rightmove's internal location ID for ``outcode``, or None if unknown.

    Successful lookups are memoised in ``outcode_cache``; failed lookups are not,
    so they are retried on the next call.

    Args:
        client: HTTP client handed through to ``fetch_with_retry``.
        outcode: Outcode to resolve; must equal the typeahead ``displayName`` exactly.

    Returns:
        The ID as a string, or None when the request failed or no match of type
        ``OUTCODE`` carries this display name.
    """
    try:
        return outcode_cache[outcode]
    except KeyError:
        pass

    query = {"query": outcode, "limit": "10", "exclude": "STREET"}
    payload = fetch_with_retry(client, TYPEAHEAD_URL, query)
    if payload:
        for candidate in payload.get("matches", []):
            is_outcode = candidate.get("type") == "OUTCODE"
            if is_outcode and candidate.get("displayName") == outcode:
                resolved = str(candidate["id"])
                outcode_cache[outcode] = resolved
                return resolved
        log.debug("Outcode %s not found in typeahead results", outcode)
    return None
def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Paginate through search results for one outcode+channel.

    Pagination is best-effort: a failed page fetch or an unparseable result
    count stops paging and returns whatever was collected so far.

    Args:
        client: HTTP client handed through to ``fetch_with_retry``.
        outcode_id: Rightmove internal ID used in the ``OUTCODE^<id>`` location identifier.
        outcode: Outcode, used for logging and passed to ``transform_property``.
        channel_cfg: Mapping providing ``sortType``, ``channel`` and ``transactionType``.
        pc_index: Spatial index passed to ``transform_property``.

    Returns:
        The transformed properties; listings for which ``transform_property``
        returns a falsy value are dropped.
    """
    properties: list[dict] = []
    index = 0

    while True:
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }
        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning("Failed to fetch index %d for %s/%s", index, outcode, channel_cfg["channel"])
            break

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        for prop in raw_props:
            transformed = transform_property(prop, outcode, pc_index)
            if transformed:
                properties.append(transformed)

        # Check if there are more pages. resultCount may arrive as "1,234", as a
        # bare number, or as null; go through str() so none of these raise and
        # throw away the pages already collected.
        raw_count = data.get("resultCount")
        try:
            result_count = int(str(raw_count or 0).replace(",", ""))
        except ValueError:
            log.warning(
                "Unparseable resultCount %r for %s/%s; stopping pagination",
                raw_count, outcode, channel_cfg["channel"],
            )
            break

        index += PAGE_SIZE
        if index >= result_count:
            break

        time.sleep(DELAY_BETWEEN_PAGES)

    return properties

191
finder/scraper.py Normal file
View file

@ -0,0 +1,191 @@
import logging
import random
import threading
import time
from dataclasses import dataclass, field
import polars as pl
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
from http_client import make_client
from metrics import (
scrape_elapsed_seconds,
scrape_errors_total,
scrape_outcodes_done,
scrape_outcodes_total,
scrape_properties_total,
scrape_state,
)
from rightmove import resolve_outcode_id, search_outcode
from spatial import PostcodeSpatialIndex
from storage import write_parquet
log = logging.getLogger("rightmove")
@dataclass
class ScrapeStatus:
    """Progress snapshot of the scrape, mutated by ``run_scrape`` under ``status_lock``."""

    state: str = "idle"  # idle | running | done | error
    # Channel currently being scraped (the "channel" value of the active CHANNELS entry).
    channel: str = ""
    # Outcode currently being processed.
    outcode: str = ""
    # Outcodes already processed in the current channel; reset to 0 per channel.
    outcodes_done: int = 0
    # Number of outcodes in the current channel.
    outcodes_total: int = 0
    # Unique properties collected so far for the BUY channel / the other channel.
    properties_buy: int = 0
    properties_rent: int = 0
    # Human-readable error messages collected during the current run.
    errors: list[str] = field(default_factory=list)
    # time.time() timestamps; 0.0 means "not set" (checked by truthiness in _sync_gauges).
    started_at: float = 0.0
    finished_at: float = 0.0


# Single shared status instance; access it while holding status_lock.
status = ScrapeStatus()
status_lock = threading.Lock()
def _sync_gauges() -> None:
    """Mirror the shared ``status`` object into the Prometheus gauges.

    The caller must already hold ``status_lock``.
    """
    current = status.state
    for name in ("idle", "running", "done", "error"):
        # Exactly one state series is 1; the rest are 0.
        scrape_state.labels(state=name).set(int(name == current))

    scrape_outcodes_done.set(status.outcodes_done)
    scrape_outcodes_total.set(status.outcodes_total)

    per_channel = (("buy", status.properties_buy), ("rent", status.properties_rent))
    for channel, count in per_channel:
        scrape_properties_total.labels(channel=channel).set(count)

    if not status.started_at:
        scrape_elapsed_seconds.set(0)
        return

    # A finished run reports its final duration; a live run reports time so far.
    end = status.finished_at or time.time()
    scrape_elapsed_seconds.set(end - status.started_at)
def load_outcodes() -> list[str]:
    """Return the sorted, de-duplicated England outcodes found in the arcgis parquet.

    Rows are kept when ``ctry`` equals ``E92000001``; the outcode is the leading
    part of ``pcd`` captured by the regex below, and rows where it does not
    match are dropped.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    frame = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])

    is_england = pl.col("ctry") == "E92000001"
    england = frame.filter(is_england)
    log.info("England postcodes: %d", len(england))

    outcode_expr = pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode")
    outcode_series = england.select(outcode_expr).drop_nulls().get_column("outcode")
    outcodes = outcode_series.unique().sort().to_list()

    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes
def build_postcode_index() -> PostcodeSpatialIndex:
    """Create the postcode spatial index from the England rows of the arcgis parquet.

    Rows with a null ``lat`` or ``long`` are excluded.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    postcodes = (
        pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
        .filter(pl.col("ctry") == "E92000001")
        .drop_nulls(subset=["lat", "long"])
    )
    lats, lngs, pcds = (
        postcodes.get_column(column).to_list() for column in ("lat", "long", "pcd")
    )
    return PostcodeSpatialIndex(lats, lngs, pcds)
def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Main scrape loop — runs in background thread.

    For every entry in ``CHANNELS`` this visits each outcode (in a
    deterministic shuffled order), collects properties de-duplicated by ``id``,
    and writes them to ``rightmove_<buy|rent>.parquet`` under ``DATA_DIR``.
    Progress is published through the shared ``status`` object and the
    Prometheus gauges.

    A failure on a single outcode is logged, counted and recorded in
    ``status.errors`` without aborting the run; any other exception marks the
    run as ``error``. The HTTP client is always closed.

    Args:
        outcodes: Outcodes to scrape; the list itself is not modified.
        pc_index: Spatial index handed to ``search_outcode``.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        # Clear the previous run's finish time: _sync_gauges treats a non-zero
        # finished_at as "run is over" and would otherwise report a negative
        # elapsed time for the whole of a second run.
        status.finished_at = 0.0
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0
        _sync_gauges()

    # Shuffle for geographic diversity. A private Random instance yields the
    # same order as seeding the module-level generator, without making every
    # other random.* call in the process deterministic.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)

    client = make_client()

    try:
        for channel_cfg in CHANNELS:
            channel_name = channel_cfg["channel"]
            file_suffix = "buy" if channel_name == "BUY" else "rent"
            all_properties: dict[int, dict] = {}  # dedup by id

            with status_lock:
                status.channel = channel_name
                status.outcodes_done = 0
                status.outcodes_total = len(shuffled)

            log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))

            for i, outcode in enumerate(shuffled):
                with status_lock:
                    status.outcode = outcode
                    status.outcodes_done = i

                log.debug("Outcode %s (%d/%d) — %d properties so far",
                          outcode, i + 1, len(shuffled), len(all_properties))

                try:
                    outcode_id = resolve_outcode_id(client, outcode)
                    if not outcode_id:
                        log.debug("No Rightmove ID for outcode %s, skipping", outcode)
                        continue

                    props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
                    for p in props:
                        pid = p["id"]
                        # First occurrence wins when a listing shows up under several outcodes.
                        if pid not in all_properties:
                            all_properties[pid] = p

                    with status_lock:
                        if channel_name == "BUY":
                            status.properties_buy = len(all_properties)
                        else:
                            status.properties_rent = len(all_properties)
                        _sync_gauges()

                    log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))

                except Exception as e:
                    # Best-effort: record the failure and carry on with the next outcode.
                    msg = f"Error scraping {outcode}/{channel_name}: {e}"
                    log.error(msg)
                    scrape_errors_total.inc()
                    with status_lock:
                        status.errors.append(msg)

                if i < len(shuffled) - 1:
                    time.sleep(DELAY_BETWEEN_OUTCODES)

            # Write parquet
            deduped = list(all_properties.values())
            output_path = DATA_DIR / f"rightmove_{file_suffix}.parquet"
            write_parquet(deduped, output_path)

            with status_lock:
                if channel_name == "BUY":
                    status.properties_buy = len(deduped)
                else:
                    status.properties_rent = len(deduped)
                status.outcodes_done = len(shuffled)
                _sync_gauges()

            log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))

        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            _sync_gauges()

        elapsed = status.finished_at - status.started_at
        log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
                 elapsed, status.properties_buy, status.properties_rent)

    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
            _sync_gauges()
    finally:
        client.close()

33
finder/spatial.py Normal file
View file

@ -0,0 +1,33 @@
import logging
import math
from collections import defaultdict
from constants import GRID_CELL_SIZE
log = logging.getLogger("rightmove")
class PostcodeSpatialIndex:
    """Grid-based spatial index over arcgis postcodes for nearest-lookup.

    Points are bucketed into square cells of ``GRID_CELL_SIZE`` degrees keyed
    by ``(floor(lng / size), floor(lat / size))``.
    """

    def __init__(self, lats: list[float], lngs: list[float], postcodes: list[str]):
        """Bucket every ``(lat, lng, postcode)`` triple into its grid cell.

        Args:
            lats: Latitudes in degrees.
            lngs: Longitudes in degrees, parallel to ``lats``.
            postcodes: Postcode strings, parallel to ``lats``.
        """
        self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(list)
        for lat, lng, pcd in zip(lats, lngs, postcodes):
            self.grid[self._cell(lat, lng)].append((lat, lng, pcd))
        log.info("Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats))

    @staticmethod
    def _cell(lat: float, lng: float) -> tuple[int, int]:
        """Return the ``(gx, gy)`` grid key containing the given coordinate."""
        return math.floor(lng / GRID_CELL_SIZE), math.floor(lat / GRID_CELL_SIZE)

    def nearest(self, lat: float, lng: float) -> str | None:
        """Return the postcode closest to ``(lat, lng)``.

        Only the query's cell and its eight neighbours are searched, so None is
        returned when none of those cells holds a postcode.
        """
        gx, gy = self._cell(lat, lng)
        # A degree of longitude is shorter than a degree of latitude by a factor
        # of cos(latitude). Without this scale, east-west offsets are
        # over-weighted and the wrong postcode can be chosen.
        lng_scale = math.cos(math.radians(lat))

        best_dist = float("inf")
        best_pcd = None
        for dx in range(-1, 2):
            for dy in range(-1, 2):
                # .get() rather than indexing so a lookup never adds empty cells
                # to the defaultdict.
                for plat, plng, pcd in self.grid.get((gx + dx, gy + dy), []):
                    d = (plat - lat) ** 2 + ((plng - lng) * lng_scale) ** 2
                    if d < best_dist:
                        best_dist = d
                        best_pcd = pcd
        return best_pcd

65
finder/storage.py Normal file
View file

@ -0,0 +1,65 @@
import logging
from pathlib import Path
import polars as pl
log = logging.getLogger("rightmove")
def write_parquet(properties: list[dict], path: Path) -> None:
    """Write the property records to ``path`` as a parquet file via Polars.

    Nothing is written (a warning is logged instead) when ``properties`` is
    empty. Every record must contain all keys named in the schema below.

    Args:
        properties: Records to persist.
        path: Destination parquet file.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    # Column name → dtype. Insertion order is the column order of the file.
    schema = {
        "id": pl.Int64,
        "bedrooms": pl.Int32,
        "bathrooms": pl.Int32,
        "total_rooms": pl.Int32,
        "longitude": pl.Float64,
        "latitude": pl.Float64,
        "postcode": pl.Utf8,
        "address": pl.Utf8,
        "tenure": pl.Utf8,
        "property_type": pl.Utf8,
        "property_sub_type": pl.Utf8,
        "price": pl.Int64,
        "price_frequency": pl.Utf8,
        "price_qualifier": pl.Utf8,
        "floorspace_sqm": pl.Float64,
        "url": pl.Utf8,
        "features": pl.List(pl.Utf8),
        "first_visible_date": pl.Utf8,
        "update_date": pl.Utf8,
        "outcode": pl.Utf8,
        "house_share": pl.Boolean,
    }
    # Pivot the row-oriented records into one list per schema column.
    columns = {name: [record[name] for record in properties] for name in schema}

    df = pl.DataFrame(columns, schema=schema)
    df.write_parquet(path)
    log.info("Wrote %d properties to %s", len(df), path)

124
finder/transform.py Normal file
View file

@ -0,0 +1,124 @@
import logging
import re
from constants import PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
from spatial import PostcodeSpatialIndex
log = logging.getLogger("rightmove")
def parse_display_size(display_size: str | None) -> float | None:
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
if not display_size:
return None
# Try sq. ft. first
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
if m:
sqft = float(m.group(1).replace(",", ""))
return round(sqft * 0.092903, 1)
# Try sq. m.
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
if m:
return round(float(m.group(1).replace(",", "")), 1)
return None
def map_property_type(sub_type: str | None) -> str:
"""Map propertySubType to canonical type."""
if not sub_type:
return "Other"
canonical = PROPERTY_TYPE_MAP.get(sub_type)
if canonical:
return canonical
log.warning("Unknown propertySubType: %r — mapping to Other", sub_type)
return "Other"
def extract_tenure(tenure_obj: dict | None) -> str | None:
"""Extract tenure string from tenure object."""
if not tenure_obj:
return None
tt = tenure_obj.get("tenureType", "")
if tt == "FREEHOLD":
return "Freehold"
if tt == "LEASEHOLD":
return "Leasehold"
return None
def fix_coords(lat: float, lng: float) -> tuple[float, float]:
    """Return ``(lat, lng)``, swapping the pair when it looks reversed.

    The plausibility box used is latitude 49 to 56 and longitude -7 to 2.
    If the pair fits as given it is returned unchanged; if it only fits when
    swapped, the swapped pair is returned. If it fits neither way, a warning
    is logged and the input is returned unchanged.
    """

    def in_bounds(latitude: float, longitude: float) -> bool:
        return 49 <= latitude <= 56 and -7 <= longitude <= 2

    if in_bounds(lat, lng):
        return lat, lng
    if in_bounds(lng, lat):
        log.debug("Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f", lat, lng, lng, lat)
        return lng, lat
    log.warning("Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f", lat, lng)
    return lat, lng
def normalize_price(amount: int, frequency: str) -> int:
    """Express a price per month: weekly amounts × 52/12, yearly amounts ÷ 12.

    Any other frequency leaves the amount untouched.
    """
    monthly = amount
    if frequency == "weekly":
        monthly = round(amount * 52 / 12)
    elif frequency == "yearly":
        monthly = round(amount / 12)
    return monthly
def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex) -> dict | None:
    """Transform a raw Rightmove property dict into our output schema.

    Nested objects are read with ``.get(key) or <empty>`` because the ``.get``
    default only applies when the key is missing. An explicit JSON null would
    otherwise raise and abort the whole page of results.

    Args:
        prop: Raw property record from the search response.
        outcode: Outcode the record was found under; copied to the output.
        pc_index: Spatial index used to assign the nearest postcode.

    Returns:
        The output record, or None when the listing has no location, no
        latitude/longitude, or no price amount.
    """
    loc = prop.get("location")
    if not loc:
        return None

    raw_lat = loc.get("latitude")
    raw_lng = loc.get("longitude")
    if raw_lat is None or raw_lng is None:
        return None

    lat, lng = fix_coords(raw_lat, raw_lng)

    price_obj = prop.get("price") or {}
    amount = price_obj.get("amount")
    if amount is None:
        return None

    frequency = price_obj.get("frequency", "")
    # NOTE(review): price is normalised to monthly below while price_frequency
    # keeps the source frequency — confirm consumers expect that combination.
    price = normalize_price(int(amount), frequency)

    display_prices = price_obj.get("displayPrices") or []
    first_display = (display_prices[0] or {}) if display_prices else {}
    price_qualifier = first_display.get("displayPriceQualifier", "") if display_prices else ""

    sub_type = prop.get("propertySubType", "")
    bedrooms = prop.get("bedrooms", 0) or 0
    bathrooms = prop.get("bathrooms", 0) or 0

    # Keep only features that carry a non-empty description.
    key_features = [
        kf.get("description", "")
        for kf in (prop.get("keyFeatures") or [])
        if kf and kf.get("description")
    ]

    listing_update = prop.get("listingUpdate") or {}
    update_date = listing_update.get("listingUpdateDate", "")

    postcode = pc_index.nearest(lat, lng)

    return {
        "id": prop.get("id"),
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "total_rooms": bedrooms + bathrooms,
        "longitude": lng,
        "latitude": lat,
        "postcode": postcode,
        "address": prop.get("displayAddress", ""),
        "tenure": extract_tenure(prop.get("tenure")),
        "property_type": map_property_type(sub_type),
        "property_sub_type": sub_type or "Unknown",
        "price": price,
        "price_frequency": frequency,
        "price_qualifier": price_qualifier,
        "floorspace_sqm": parse_display_size(prop.get("displaySize")),
        "url": RIGHTMOVE_BASE + (prop.get("propertyUrl") or ""),
        "features": key_features,
        "first_visible_date": prop.get("firstVisibleDate", ""),
        "update_date": update_date,
        "outcode": outcode,
        "house_share": sub_type == "House Share",
    }

View file

@ -21,6 +21,7 @@
"pocketbase": "^0.26.8",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"react-joyride": "^2.9.3",
"react-map-gl": "^7.1.0"
},
"devDependencies": {
@ -2033,6 +2034,11 @@
"integrity": "sha512-aGTxbpbg8/b5JfU1HXSrbH3wXZuLPJcNEcZQFMxLs3oSzgtVu6nFPkbbGGUvBcUjKV2YyB9Wxxabo+HEH9tcRQ==",
"license": "MIT"
},
"node_modules/@gilbarbara/deep-equal": {
"version": "0.3.1",
"resolved": "https://registry.npmjs.org/@gilbarbara/deep-equal/-/deep-equal-0.3.1.tgz",
"integrity": "sha512-I7xWjLs2YSVMc5gGx1Z3ZG1lgFpITPndpi8Ku55GeEIKpACCPQNS/OTqQbxgTCfq0Ncvcc+CrFov96itVh6Qvw=="
},
"node_modules/@humanwhocodes/config-array": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.13.0.tgz",
@ -4623,7 +4629,6 @@
"version": "15.7.15",
"resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.15.tgz",
"integrity": "sha512-F6bEyamV9jKGAFBEmlQnesRPGOQqS2+Uwi0Em15xenOxHaf2hv6L8YCVn3rPdPJOiJfPiCnLIRyvwVaqMY3MIw==",
"devOptional": true,
"license": "MIT"
},
"node_modules/@types/qs": {
@ -4644,7 +4649,6 @@
"version": "18.3.27",
"resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.27.tgz",
"integrity": "sha512-cisd7gxkzjBKU2GgdYrTdtQx1SORymWyaAFhaxQPK9bYO9ot3Y5OikQRvY0VYQtvwjeQnizCINJAenh/V7MK2w==",
"devOptional": true,
"license": "MIT",
"dependencies": {
"@types/prop-types": "*",
@ -6772,7 +6776,6 @@
"version": "3.2.3",
"resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz",
"integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==",
"devOptional": true,
"license": "MIT"
},
"node_modules/data-uri-to-buffer": {
@ -6856,6 +6859,12 @@
}
}
},
"node_modules/deep-diff": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/deep-diff/-/deep-diff-1.0.2.tgz",
"integrity": "sha512-aWS3UIVH+NPGCD1kki+DCU9Dua032iSsO43LqQpcs4R3+dVv7tX0qBGjiVHJHjplsoUM2XRO/KB92glqc68awg==",
"deprecated": "Package no longer supported. Contact Support at https://www.npmjs.com/support for more info."
},
"node_modules/deep-is": {
"version": "0.1.4",
"resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.4.tgz",
@ -6875,6 +6884,14 @@
"node": ">=0.10.0"
}
},
"node_modules/deepmerge": {
"version": "4.3.1",
"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz",
"integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/default-browser": {
"version": "5.4.0",
"resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.4.0.tgz",
@ -9693,6 +9710,11 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/is-lite": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/is-lite/-/is-lite-1.2.1.tgz",
"integrity": "sha512-pgF+L5bxC+10hLBgf6R2P4ZZUBOQIIacbdo8YvuCP8/JvsWxG7aZ9p10DYuLtifFci4l3VITphhMlMV4Y+urPw=="
},
"node_modules/is-map": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/is-map/-/is-map-2.0.3.tgz",
@ -10710,7 +10732,6 @@
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
"integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">=0.10.0"
@ -11253,6 +11274,16 @@
"resolved": "https://registry.npmjs.org/pocketbase/-/pocketbase-0.26.8.tgz",
"integrity": "sha512-aQ/ewvS7ncvAE8wxoW10iAZu6ElgbeFpBhKPnCfvRovNzm2gW8u/sQNPGN6vNgVEagz44kK//C61oKjfa+7Low=="
},
"node_modules/popper.js": {
"version": "1.16.1",
"resolved": "https://registry.npmjs.org/popper.js/-/popper.js-1.16.1.tgz",
"integrity": "sha512-Wb4p1J4zyFTbM+u6WuO4XstYx4Ky9Cewe4DWrel7B0w6VVICvPwdOpotjzcf6eD8TsckVnIMNONQyPIUFOUbCQ==",
"deprecated": "You can find the new Popper v2 at @popperjs/core, this package is dedicated to the legacy v1",
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/popperjs"
}
},
"node_modules/possible-typed-array-names": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.1.0.tgz",
@ -11608,7 +11639,6 @@
"version": "15.8.1",
"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz",
"integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==",
"dev": true,
"license": "MIT",
"dependencies": {
"loose-envify": "^1.4.0",
@ -11854,13 +11884,89 @@
"react": "^18.3.1"
}
},
"node_modules/react-floater": {
"version": "0.7.9",
"resolved": "https://registry.npmjs.org/react-floater/-/react-floater-0.7.9.tgz",
"integrity": "sha512-NXqyp9o8FAXOATOEo0ZpyaQ2KPb4cmPMXGWkx377QtJkIXHlHRAGer7ai0r0C1kG5gf+KJ6Gy+gdNIiosvSicg==",
"dependencies": {
"deepmerge": "^4.3.1",
"is-lite": "^0.8.2",
"popper.js": "^1.16.0",
"prop-types": "^15.8.1",
"tree-changes": "^0.9.1"
},
"peerDependencies": {
"react": "15 - 18",
"react-dom": "15 - 18"
}
},
"node_modules/react-floater/node_modules/@gilbarbara/deep-equal": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/@gilbarbara/deep-equal/-/deep-equal-0.1.2.tgz",
"integrity": "sha512-jk+qzItoEb0D0xSSmrKDDzf9sheQj/BAPxlgNxgmOaA3mxpUa6ndJLYGZKsJnIVEQSD8zcTbyILz7I0HcnBCRA=="
},
"node_modules/react-floater/node_modules/is-lite": {
"version": "0.8.2",
"resolved": "https://registry.npmjs.org/is-lite/-/is-lite-0.8.2.tgz",
"integrity": "sha512-JZfH47qTsslwaAsqbMI3Q6HNNjUuq6Cmzzww50TdP5Esb6e1y2sK2UAaZZuzfAzpoI2AkxoPQapZdlDuP6Vlsw=="
},
"node_modules/react-floater/node_modules/tree-changes": {
"version": "0.9.3",
"resolved": "https://registry.npmjs.org/tree-changes/-/tree-changes-0.9.3.tgz",
"integrity": "sha512-vvvS+O6kEeGRzMglTKbc19ltLWNtmNt1cpBoSYLj/iEcPVvpJasemKOlxBrmZaCtDJoF+4bwv3m01UKYi8mukQ==",
"dependencies": {
"@gilbarbara/deep-equal": "^0.1.1",
"is-lite": "^0.8.2"
}
},
"node_modules/react-innertext": {
"version": "1.1.5",
"resolved": "https://registry.npmjs.org/react-innertext/-/react-innertext-1.1.5.tgz",
"integrity": "sha512-PWAqdqhxhHIv80dT9znP2KvS+hfkbRovFp4zFYHFFlOoQLRiawIic81gKb3U1wEyJZgMwgs3JoLtwryASRWP3Q==",
"peerDependencies": {
"@types/react": ">=0.0.0 <=99",
"react": ">=0.0.0 <=99"
}
},
"node_modules/react-is": {
"version": "16.13.1",
"resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz",
"integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==",
"dev": true,
"license": "MIT"
},
"node_modules/react-joyride": {
"version": "2.9.3",
"resolved": "https://registry.npmjs.org/react-joyride/-/react-joyride-2.9.3.tgz",
"integrity": "sha512-1+Mg34XK5zaqJ63eeBhqdbk7dlGCFp36FXwsEvgpjqrtyywX2C6h9vr3jgxP0bGHCw8Ilsp/nRDzNVq6HJ3rNw==",
"dependencies": {
"@gilbarbara/deep-equal": "^0.3.1",
"deep-diff": "^1.0.2",
"deepmerge": "^4.3.1",
"is-lite": "^1.2.1",
"react-floater": "^0.7.9",
"react-innertext": "^1.1.5",
"react-is": "^16.13.1",
"scroll": "^3.0.1",
"scrollparent": "^2.1.0",
"tree-changes": "^0.11.2",
"type-fest": "^4.27.0"
},
"peerDependencies": {
"react": "15 - 18",
"react-dom": "15 - 18"
}
},
"node_modules/react-joyride/node_modules/type-fest": {
"version": "4.41.0",
"resolved": "https://registry.npmjs.org/type-fest/-/type-fest-4.41.0.tgz",
"integrity": "sha512-TeTSQ6H5YHvpqVwBRcnLDCBnDOHWYu7IvGbHT6N8AOymcr9PJGjc1GTtiWZTYg0NCgYwvnYWEkVChQAr9bjfwA==",
"engines": {
"node": ">=16"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/react-map-gl": {
"version": "7.1.9",
"resolved": "https://registry.npmjs.org/react-map-gl/-/react-map-gl-7.1.9.tgz",
@ -12457,6 +12563,16 @@
"url": "https://opencollective.com/webpack"
}
},
"node_modules/scroll": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/scroll/-/scroll-3.0.1.tgz",
"integrity": "sha512-pz7y517OVls1maEzlirKO5nPYle9AXsFzTMNJrRGmT951mzpIBy7sNHOg5o/0MQd/NqliCiWnAi0kZneMPFLcg=="
},
"node_modules/scrollparent": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/scrollparent/-/scrollparent-2.1.0.tgz",
"integrity": "sha512-bnnvJL28/Rtz/kz2+4wpBjHzWoEzXhVg/TE8BeVGJHUqE8THNIRnDxDWMktwM+qahvlRdvlLdsQfYe+cuqfZeA=="
},
"node_modules/select-hose": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/select-hose/-/select-hose-2.0.0.tgz",
@ -13754,6 +13870,15 @@
"node": ">=0.6"
}
},
"node_modules/tree-changes": {
"version": "0.11.3",
"resolved": "https://registry.npmjs.org/tree-changes/-/tree-changes-0.11.3.tgz",
"integrity": "sha512-r14mvDZ6tqz8PRQmlFKjhUVngu4VZ9d92ON3tp0EGpFBE6PAHOq8Bx8m8ahbNoGE3uI/npjYcJiqVydyOiYXag==",
"dependencies": {
"@gilbarbara/deep-equal": "^0.3.1",
"is-lite": "^1.2.1"
}
},
"node_modules/tree-dump": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/tree-dump/-/tree-dump-1.1.0.tgz",

View file

@ -26,6 +26,7 @@
"pocketbase": "^0.26.8",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"react-joyride": "^2.9.3",
"react-map-gl": "^7.1.0"
},
"devDependencies": {

View file

@ -4,6 +4,7 @@ import PricingPage from './components/pricing/PricingPage';
import HomePage from './components/home/HomePage';
import SavedSearchesPage from './components/saved-searches/SavedSearchesPage';
import LearnPage from './components/learn/LearnPage';
import AccountPage from './components/account/AccountPage';
import Header, { type Page } from './components/ui/Header';
import AuthModal from './components/ui/AuthModal';
import SaveSearchModal from './components/ui/SaveSearchModal';
@ -32,6 +33,8 @@ case 'saved-searches':
return '/learn';
case 'pricing':
return '/pricing';
case 'account':
return '/account';
default:
return '/';
}
@ -42,6 +45,7 @@ function pathToPage(pathname: string): Page | null {
if (pathname === '/saved') return 'saved-searches';
if (pathname === '/learn') return 'learn';
if (pathname === '/pricing') return 'pricing';
if (pathname === '/account') return 'account';
if (pathname === '/') return 'home';
return null;
}
@ -92,6 +96,7 @@ export default function App() {
register,
logout,
requestPasswordReset,
refreshAuth,
clearError,
} = useAuth();
const [showAuthModal, setShowAuthModal] = useState(false);
@ -233,6 +238,8 @@ export default function App() {
<PricingPage onOpenDashboard={() => navigateTo('dashboard')} />
) : activePage === 'learn' ? (
<LearnPage />
) : activePage === 'account' && user ? (
<AccountPage user={user} onRefreshAuth={refreshAuth} />
) : activePage === 'saved-searches' ? (
<SavedSearchesPage
searches={savedSearches.searches}

View file

@ -0,0 +1,131 @@
import { useState } from 'react';
import type { AuthUser } from '../../hooks/useAuth';
import { apiUrl, authHeaders, assertOk } from '../../lib/api';
import { SpinnerIcon } from '../ui/icons/SpinnerIcon';
import { CheckIcon } from '../ui/icons/CheckIcon';
// Subscription tiers offered in the admin <select>, in display order.
const SUBSCRIPTION_OPTIONS = ['free', 'rental', 'buyer'] as const;
// Human-readable label for each tier, keyed by the raw tier value.
const SUBSCRIPTION_LABELS: Record<string, string> = {
  free: 'Free',
  rental: 'Rental',
  buyer: 'Buyer',
};
/**
 * Account page: shows the signed-in user's email, verification status and
 * subscription tier. Users flagged `isAdmin` also get a control that PATCHes
 * a new tier to the `subscription` API endpoint.
 *
 * @param user - The authenticated user to display.
 * @param onRefreshAuth - Awaited after a successful update so the caller can reload auth state.
 */
export default function AccountPage({
  user,
  onRefreshAuth,
}: {
  user: AuthUser;
  onRefreshAuth: () => Promise<void>;
}) {
  // A missing/empty subscription counts as the free tier. The same effective
  // value drives the initial selection, the badge and the Save-disabled check
  // so the three can never disagree.
  const currentSubscription = user.subscription || 'free';

  const [selectedSubscription, setSelectedSubscription] = useState(currentSubscription);
  const [saving, setSaving] = useState(false);
  const [saved, setSaved] = useState(false);
  const [error, setError] = useState<string | null>(null);

  const handleSave = async () => {
    setSaving(true);
    setError(null);
    setSaved(false);
    try {
      const res = await fetch(apiUrl('subscription'), {
        method: 'PATCH',
        ...authHeaders({
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({ subscription: selectedSubscription }),
        }),
      });
      // NOTE(review): assumes assertOk throws synchronously on a non-OK
      // response; if it returns a Promise it must be awaited — confirm in lib/api.
      assertOk(res, 'Update subscription');
      await onRefreshAuth();
      setSaved(true);
      // Show the "Saved" confirmation briefly, then revert the button label.
      setTimeout(() => setSaved(false), 2000);
    } catch (err) {
      const msg = err instanceof Error ? err.message : 'Failed to update subscription';
      setError(msg);
    } finally {
      setSaving(false);
    }
  };

  const badgeColor =
    currentSubscription === 'buyer'
      ? 'bg-teal-100 text-teal-700 dark:bg-teal-900/30 dark:text-teal-400'
      : currentSubscription === 'rental'
        ? 'bg-blue-100 text-blue-700 dark:bg-blue-900/30 dark:text-blue-400'
        : 'bg-warm-100 text-warm-600 dark:bg-warm-700 dark:text-warm-300';

  return (
    <div className="flex-1 overflow-y-auto bg-warm-50 dark:bg-navy-950">
      <div className="max-w-lg mx-auto px-6 py-16">
        <h1 className="text-2xl font-bold text-navy-950 dark:text-warm-100 mb-8">Account</h1>
        <div className="bg-white dark:bg-warm-800 rounded-xl border border-warm-200 dark:border-warm-700 divide-y divide-warm-200 dark:divide-warm-700">
          {/* Email */}
          <div className="px-5 py-4 flex items-center justify-between">
            <div>
              <p className="text-sm text-warm-500 dark:text-warm-400">Email</p>
              <p className="text-navy-950 dark:text-warm-100 font-medium">{user.email}</p>
            </div>
            <span
              className={`text-xs font-medium px-2 py-0.5 rounded-full ${
                user.verified
                  ? 'bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-400'
                  : 'bg-amber-100 text-amber-700 dark:bg-amber-900/30 dark:text-amber-400'
              }`}
            >
              {user.verified ? 'Verified' : 'Unverified'}
            </span>
          </div>
          {/* Subscription */}
          <div className="px-5 py-4 flex items-center justify-between">
            <div>
              <p className="text-sm text-warm-500 dark:text-warm-400">Subscription</p>
              <span className={`inline-block text-sm font-medium px-2.5 py-0.5 rounded-full mt-1 ${badgeColor}`}>
                {SUBSCRIPTION_LABELS[currentSubscription] || currentSubscription}
              </span>
            </div>
          </div>
          {/* Admin section */}
          {user.isAdmin && (
            <div className="px-5 py-4">
              <p className="text-sm text-warm-500 dark:text-warm-400 mb-3">
                Admin: Change subscription
              </p>
              <div className="flex items-center gap-3">
                <select
                  value={selectedSubscription}
                  onChange={(e) => setSelectedSubscription(e.target.value)}
                  className="flex-1 px-3 py-2 rounded-lg border border-warm-200 dark:border-warm-700 bg-white dark:bg-warm-900 text-navy-950 dark:text-warm-200 text-sm"
                >
                  {SUBSCRIPTION_OPTIONS.map((opt) => (
                    <option key={opt} value={opt}>
                      {SUBSCRIPTION_LABELS[opt]}
                    </option>
                  ))}
                </select>
                <button
                  onClick={handleSave}
                  disabled={saving || selectedSubscription === currentSubscription}
                  className="px-4 py-2 rounded-lg bg-teal-600 hover:bg-teal-700 text-white text-sm font-medium disabled:opacity-50 disabled:cursor-not-allowed flex items-center gap-2"
                >
                  {saving ? (
                    <SpinnerIcon className="w-4 h-4 animate-spin" />
                  ) : saved ? (
                    <CheckIcon className="w-4 h-4" />
                  ) : null}
                  {saved ? 'Saved' : 'Save'}
                </button>
              </div>
              {error && (
                <p className="mt-2 text-sm text-red-600 dark:text-red-400">{error}</p>
              )}
            </div>
          )}
        </div>
      </div>
    </div>
  );
}

View file

@ -1,131 +1,63 @@
import { useRef, useEffect } from 'react';
import { useMemo } from 'react';
const HEX_COUNT = 70;
const TAU = Math.PI * 2;
const HEX_COUNT = 50;
interface Hex {
x: number;
y: number;
baseY: number;
interface HexConfig {
size: number;
opacity: number;
speed: number;
phase: number;
top: number;
driftDuration: number;
bobDuration: number;
bobAmount: number;
delay: number;
reverse: boolean;
}
function initHexes(w: number, h: number): Hex[] {
const hexes: Hex[] = [];
function generateHexes(): HexConfig[] {
const hexes: HexConfig[] = [];
for (let i = 0; i < HEX_COUNT; i++) {
const y = Math.random() * h;
const side = Math.random() < 0.5 ? 'left' : 'right';
const x = side === 'left' ? Math.random() * w * 0.3 : w * 0.7 + Math.random() * w * 0.3;
const driftDuration = 18 + Math.random() * 35;
hexes.push({
x,
y,
baseY: y,
size: 8 + Math.random() * 20,
opacity: 0.08 + Math.random() * 0.15,
speed: 6 + Math.random() * 14,
phase: Math.random() * TAU,
size: 10 + Math.random() * 32,
opacity: 0.06 + Math.random() * 0.18,
top: Math.random() * 100,
driftDuration,
bobDuration: 3 + Math.random() * 5,
bobAmount: 8 + Math.random() * 30,
delay: -Math.random() * driftDuration,
reverse: Math.random() < 0.3,
});
}
return hexes;
}
function drawHex(ctx: CanvasRenderingContext2D, cx: number, cy: number, r: number) {
ctx.beginPath();
for (let i = 0; i < 6; i++) {
const angle = (TAU / 6) * i - Math.PI / 6;
const px = cx + r * Math.cos(angle);
const py = cy + r * Math.sin(angle);
if (i === 0) ctx.moveTo(px, py);
else ctx.lineTo(px, py);
}
ctx.closePath();
}
export default function HexCanvas({ isDark = false }: { isDark?: boolean }) {
const canvasRef = useRef<HTMLCanvasElement>(null);
const hexesRef = useRef<Hex[]>([]);
const animRef = useRef(0);
const isDarkRef = useRef(isDark);
isDarkRef.current = isDark;
useEffect(() => {
const canvas = canvasRef.current;
if (!canvas) return;
const ctx = canvas.getContext('2d');
if (!ctx) return;
let w = 0;
let h = 0;
function resize() {
const dpr = window.devicePixelRatio || 1;
const rect = canvas!.parentElement!.getBoundingClientRect();
w = rect.width;
h = rect.height;
canvas!.width = w * dpr;
canvas!.height = h * dpr;
canvas!.style.width = `${w}px`;
canvas!.style.height = `${h}px`;
ctx!.setTransform(dpr, 0, 0, dpr, 0, 0);
hexesRef.current = initHexes(w, h);
}
resize();
const ro = new ResizeObserver(resize);
ro.observe(canvas.parentElement!);
let prev = performance.now();
function frame(now: number) {
const dt = (now - prev) / 1000;
prev = now;
ctx!.clearRect(0, 0, w, h);
for (const hex of hexesRef.current) {
hex.x += hex.speed * dt * 0.3;
if (hex.x > w * 0.3 + hex.size && hex.x < w * 0.7 - hex.size) {
hex.x = w * 0.7 + hex.size;
}
if (hex.x > w + hex.size * 2) {
hex.x = -hex.size * 2;
hex.y = Math.random() * h;
hex.baseY = hex.y;
}
const bob = Math.sin(now / 1000 + hex.phase) * 8;
hex.y = hex.baseY + bob;
const dark = isDarkRef.current;
ctx!.globalAlpha = hex.opacity * (dark ? 0.6 : 1);
ctx!.fillStyle = dark ? '#058172' : '#00a28c';
drawHex(ctx!, hex.x, hex.y, hex.size);
ctx!.fill();
ctx!.globalAlpha = hex.opacity * 0.5 * (dark ? 0.6 : 1);
ctx!.strokeStyle = dark ? '#0a665b' : '#05c9aa';
ctx!.lineWidth = 1;
drawHex(ctx!, hex.x, hex.y, hex.size);
ctx!.stroke();
}
animRef.current = requestAnimationFrame(frame);
}
animRef.current = requestAnimationFrame(frame);
return () => {
cancelAnimationFrame(animRef.current);
ro.disconnect();
};
}, []);
const hexes = useMemo(generateHexes, []);
return (
<canvas
ref={canvasRef}
className="absolute inset-0 pointer-events-none"
style={{ zIndex: 0 }}
/>
<div className="absolute inset-0 overflow-hidden pointer-events-none" style={{ zIndex: 0 }}>
{hexes.map((hex, i) => (
<div
key={i}
className="absolute"
style={{
top: `${hex.top}%`,
animation: `hex-drift ${hex.driftDuration}s linear ${hex.delay}s infinite${hex.reverse ? ' reverse' : ''}`,
}}
>
<div
className="bg-teal-500"
style={{
width: hex.size,
height: hex.size,
opacity: hex.opacity * (isDark ? 0.6 : 1),
clipPath: 'polygon(50% 0%, 100% 25%, 100% 75%, 50% 100%, 0% 75%, 0% 25%)',
animation: `hex-bob ${hex.bobDuration}s ease-in-out infinite`,
'--bob': `${hex.bobAmount}px`,
} as React.CSSProperties}
/>
</div>
))}
</div>
);
}

View file

@ -43,7 +43,7 @@ export default function HomePage({
<HexCanvas isDark={theme === 'dark'} />
{/* Radial teal glow */}
<div className="absolute top-1/2 left-1/2 -translate-x-1/2 -translate-y-1/2 w-[600px] h-[400px] bg-teal-500/[0.07] rounded-full blur-3xl pointer-events-none" />
<div className="relative z-10 max-w-4xl mx-auto px-6">
<div className="relative z-10 max-w-4xl mx-auto px-6 md:px-10 py-6 backdrop-blur-sm bg-navy-950/30 rounded-2xl">
<p className="text-teal-400 font-semibold tracking-wide uppercase text-sm mb-4">
Browsing listings is not a strategy. Knowing what you want is.
</p>

View file

@ -119,7 +119,37 @@ export default memo(function Filters({
onAiFilterSubmit,
}: FiltersProps) {
const availableFeatures = features.filter((f) => !enabledFeatures.has(f.name));
const enabledFeatureList = features.filter((f) => enabledFeatures.has(f.name));
const enabledFeatureList = features.filter(
(f) => enabledFeatures.has(f.name) && f.name !== 'Listing status'
);
const listingToggles = useMemo(() => {
const val = filters['Listing status'] as string[] | undefined;
if (!val) return { historical: true, buy: true, rent: true };
return {
historical: val.includes('Historical sale'),
buy: val.includes('For sale'),
rent: val.includes('For rent'),
};
}, [filters]);
const handleListingToggle = useCallback(
(key: 'historical' | 'buy' | 'rent') => {
const next = { ...listingToggles, [key]: !listingToggles[key] };
const allOn = next.historical && next.buy && next.rent;
const allOff = !next.historical && !next.buy && !next.rent;
if (allOn || allOff) {
onRemoveFilter('Listing status');
return;
}
const values: string[] = [];
if (next.historical) values.push('Historical sale');
if (next.buy) values.push('For sale');
if (next.rent) values.push('For rent');
onFilterChange('Listing status', values);
},
[listingToggles, onFilterChange, onRemoveFilter]
);
const containerRef = useRef<HTMLDivElement>(null);
const [showPhilosophy, setShowPhilosophy] = useState(false);
@ -155,7 +185,8 @@ export default memo(function Filters({
return scales;
}, [features]);
const badgeCount = enabledFeatureList.length + activeModes.length;
const hasListingFilter = !listingToggles.historical || !listingToggles.buy || !listingToggles.rent;
const badgeCount = enabledFeatureList.length + activeModes.length + (hasListingFilter ? 1 : 0);
return (
<div ref={containerRef} className="flex flex-col bg-white dark:bg-navy-950 overflow-y-auto md:overflow-hidden h-full">
@ -171,6 +202,17 @@ export default memo(function Filters({
</button>
</div>
</div>
<div className="shrink-0 flex items-center gap-2 px-3 py-2 border-b border-warm-200 dark:border-navy-700">
<span className="text-xs font-medium text-warm-500 dark:text-warm-400">Show</span>
<PillGroup>
<PillToggle label="Historical" active={listingToggles.historical}
onClick={() => handleListingToggle('historical')} size="xs" />
<PillToggle label="Buy" active={listingToggles.buy}
onClick={() => handleListingToggle('buy')} size="xs" />
<PillToggle label="Rent" active={listingToggles.rent}
onClick={() => handleListingToggle('rent')} size="xs" />
</PillGroup>
</div>
<div className="shrink-0 md:shrink md:min-h-0 flex flex-col md:basis-[40%]">
<div className="shrink-0 flex items-center justify-between px-3 py-2 border-b border-warm-200 dark:border-navy-700">
<div className="flex items-center gap-2">

View file

@ -118,7 +118,7 @@ export default function LocationSearch({
}
return (
<div ref={containerRef} className="absolute top-3 left-3 z-10 flex flex-col">
<div ref={containerRef} data-tutorial="search" className="absolute top-3 left-3 z-10 flex flex-col">
<div className="flex items-center shadow-lg rounded overflow-hidden bg-white dark:bg-warm-800">
<SearchIcon className="w-4 h-4 text-warm-400 dark:text-warm-500 ml-3 shrink-0" />
<PlaceSearchInput

View file

@ -37,7 +37,7 @@ interface MapProps {
features: FeatureMeta[];
selectedHexagonId: string | null;
hoveredHexagonId: string | null;
onHexagonClick: (id: string, isPostcode?: boolean) => void;
onHexagonClick: (id: string, isPostcode?: boolean, geometry?: PostcodeGeometry) => void;
onHexagonHover: (h3: string | null, x?: number, y?: number) => void;
initialViewState?: ViewState;
theme?: 'light' | 'dark';

View file

@ -1,5 +1,5 @@
import { useState, useEffect, useMemo, useCallback } from 'react';
import type { FeatureMeta, FeatureFilters, POICategoryGroup, ViewState } from '../../types';
import type { FeatureMeta, FeatureFilters, POICategoryGroup, ViewState, PostcodeGeometry } from '../../types';
import type { SearchedLocation } from './LocationSearch';
import type { Page } from '../ui/Header';
import Map from './Map';
@ -18,6 +18,9 @@ import { usePaneResize } from '../../hooks/usePaneResize';
import { useAiFilters } from '../../hooks/useAiFilters';
import { useAreaSummary } from '../../hooks/useAreaSummary';
import { useUrlSync } from '../../hooks/useUrlSync';
import { useTutorial } from '../../hooks/useTutorial';
import { getTutorialStyles } from '../../lib/tutorial-styles';
import Joyride from 'react-joyride';
import {
useTravelTime,
TRANSPORT_MODES,
@ -191,8 +194,8 @@ export default function MapPage({
// On mobile, open drawer and switch tab when hexagon is clicked
const { handleHexagonClick } = selection;
const handleMobileHexagonClick = useCallback(
(id: string, isPostcode?: boolean) => {
handleHexagonClick(id, isPostcode);
(id: string, isPostcode?: boolean, geometry?: PostcodeGeometry) => {
handleHexagonClick(id, isPostcode, geometry);
if (id) {
setMobileDrawerOpen(true);
}
@ -225,6 +228,9 @@ export default function MapPage({
mapData.resolution,
]);
// Tutorial
const tutorial = useTutorial(initialLoading, isMobile);
// AI area summary
const aiSummary = useAreaSummary({
stats: selection.areaStats,
@ -551,8 +557,20 @@ export default function MapPage({
</div>
)}
<Joyride
steps={tutorial.steps}
run={tutorial.run}
continuous
showProgress
showSkipButton
callback={tutorial.handleCallback}
styles={getTutorialStyles(theme)}
disableScrolling
/>
{/* Left Pane */}
<div
data-tutorial="filters"
className="flex bg-white dark:bg-navy-950 shadow-lg overflow-hidden"
style={{ width: leftPaneWidth }}
>
@ -566,7 +584,7 @@ export default function MapPage({
</div>
{/* Map */}
<div className="flex-1 relative">
<div data-tutorial="map" className="flex-1 relative">
<Map
data={mapData.data}
postcodeData={mapData.postcodeData}
@ -599,6 +617,7 @@ export default function MapPage({
)}
{/* Floating POI button */}
<button
data-tutorial="poi-button"
onClick={() => setPoiPaneOpen((p) => !p)}
className={`absolute bottom-4 right-4 z-10 p-2 rounded-lg shadow-lg bg-white dark:bg-warm-800 ${poiPaneOpen ? 'text-teal-600 dark:text-teal-400' : 'text-warm-500 dark:text-warm-400 hover:text-teal-600 dark:hover:text-teal-400'}`}
>
@ -614,6 +633,7 @@ export default function MapPage({
{/* Right Pane */}
<div
data-tutorial="right-pane"
className="flex bg-white dark:bg-navy-950 shadow-lg z-10"
style={{ width: rightPaneWidth }}
>

View file

@ -13,7 +13,7 @@ import { SpinnerIcon } from './icons/SpinnerIcon';
import UserMenu from './UserMenu';
import MobileMenu from './MobileMenu';
export type Page = 'home' | 'dashboard' | 'saved-searches' | 'learn' | 'pricing';
export type Page = 'home' | 'dashboard' | 'saved-searches' | 'learn' | 'pricing' | 'account';
export default function Header({
activePage,
@ -200,7 +200,7 @@ export default function Header({
{!isMobile && (
<>
{user ? (
<UserMenu user={user} onLogout={onLogout} />
<UserMenu user={user} onLogout={onLogout} onPageChange={onPageChange} />
) : (
<>
<button

View file

@ -83,6 +83,7 @@ export default function MobileMenu({
{user && mobileNavItem('saved-searches', 'Saved')}
{mobileNavItem('learn', 'Learn')}
{mobileNavItem('pricing', 'Pricing')}
{user && mobileNavItem('account', 'Account')}
{/* Dashboard actions */}
{activePage === 'dashboard' && (

View file

@ -1,7 +1,16 @@
import { useState, useRef, useEffect } from 'react';
import type { AuthUser } from '../../hooks/useAuth';
import type { Page } from './Header';
export default function UserMenu({ user, onLogout }: { user: AuthUser; onLogout: () => void }) {
export default function UserMenu({
user,
onLogout,
onPageChange,
}: {
user: AuthUser;
onLogout: () => void;
onPageChange: (page: Page) => void;
}) {
const [open, setOpen] = useState(false);
const menuRef = useRef<HTMLDivElement>(null);
@ -37,6 +46,15 @@ export default function UserMenu({ user, onLogout }: { user: AuthUser; onLogout:
</p>
</div>
<div className="p-1">
<button
onClick={() => {
setOpen(false);
onPageChange('account');
}}
className="w-full text-left px-3 py-2 text-sm text-warm-700 dark:text-warm-300 hover:bg-warm-50 dark:hover:bg-warm-700 rounded"
>
Account
</button>
<button
onClick={() => {
setOpen(false);

View file

@ -5,6 +5,8 @@ export interface AuthUser {
id: string;
email: string;
verified: boolean;
isAdmin: boolean;
subscription: string;
}
function recordToUser(record: { id: string; [key: string]: unknown }): AuthUser {
@ -15,6 +17,8 @@ function recordToUser(record: { id: string; [key: string]: unknown }): AuthUser
id: record.id,
email: record.email,
verified: typeof record.verified === 'boolean' ? record.verified : false,
isAdmin: typeof record.is_admin === 'boolean' ? record.is_admin : false,
subscription: typeof record.subscription === 'string' ? record.subscription : 'free',
};
}
@ -110,6 +114,11 @@ export function useAuth() {
}
}, []);
const refreshAuth = useCallback(async () => {
const result = await pb.collection('users').authRefresh();
setUser(recordToUser(result.record));
}, []);
const clearError = useCallback(() => {
setError(null);
}, []);
@ -123,6 +132,7 @@ export function useAuth() {
loginWithOAuth,
logout,
requestPasswordReset,
refreshAuth,
clearError,
};
}

View file

@ -1,4 +1,4 @@
import { useCallback, useRef, useState, useMemo } from 'react';
import { useCallback, useRef, useState, useMemo, useEffect } from 'react';
import { H3HexagonLayer } from '@deck.gl/geo-layers';
import { GeoJsonLayer, IconLayer, TextLayer, ScatterplotLayer } from '@deck.gl/layers';
import type { PickingInfo } from '@deck.gl/core';
@ -18,6 +18,7 @@ import {
type TransportMode,
type TravelTimeEntries,
} from './useTravelTime';
import { MarchingAntsExtension } from '../lib/MarchingAntsExtension';
/** Convert POI id (e.g. "n12345") to OpenStreetMap URL */
function osmIdToUrl(id: string): string | null {
@ -40,7 +41,7 @@ interface UseDeckLayersProps {
features: FeatureMeta[];
selectedHexagonId: string | null;
hoveredHexagonId: string | null;
onHexagonClick: (id: string, isPostcode?: boolean) => void;
onHexagonClick: (id: string, isPostcode?: boolean, geometry?: PostcodeGeometry) => void;
onHexagonHover: (h3: string | null, x?: number, y?: number) => void;
theme: 'light' | 'dark';
selectedPostcodeGeometry?: PostcodeGeometry | null;
@ -89,9 +90,18 @@ export function useDeckLayers({
}: UseDeckLayersProps) {
const [popupInfo, setPopupInfo] = useState<PopupInfo | null>(null);
const [hoverPosition, setHoverPosition] = useState<{ x: number; y: number } | null>(null);
const [selectedPostcode, setSelectedPostcode] = useState<string | null>(null);
const [hoveredPostcode, setHoveredPostcode] = useState<string | null>(null);
// Marching ants animation
const [marchTime, setMarchTime] = useState(0);
const hasPostcodeGeometry = selectedPostcodeGeometry != null;
useEffect(() => {
if (!hasPostcodeGeometry) return;
setMarchTime(0);
const id = setInterval(() => setMarchTime((t) => t + 0.3), 50);
return () => clearInterval(id);
}, [hasPostcodeGeometry]);
const isDark = theme === 'dark';
const densityGradient = isDark ? DENSITY_GRADIENT_DARK : DENSITY_GRADIENT;
@ -110,8 +120,6 @@ export function useDeckLayers({
selectedHexagonIdRef.current = selectedHexagonId;
const hoveredHexagonIdRef = useRef(hoveredHexagonId);
hoveredHexagonIdRef.current = hoveredHexagonId;
const selectedPostcodeRef = useRef(selectedPostcode);
selectedPostcodeRef.current = selectedPostcode;
const hoveredPostcodeRef = useRef(hoveredPostcode);
hoveredPostcodeRef.current = hoveredPostcode;
@ -233,8 +241,7 @@ export function useDeckLayers({
const handlePostcodeClick = useCallback((info: PickingInfo<any>) => {
const pc = info.object?.properties?.postcode;
if (pc) {
setSelectedPostcode((prev) => (prev === pc ? null : pc));
onHexagonClickRef.current(pc, true);
onHexagonClickRef.current(pc, true, info.object?.geometry);
}
}, []);
@ -265,7 +272,7 @@ export function useDeckLayers({
}, [travelTimeEntries, travelTimeColorRanges]);
const colorTrigger = `${viewFeature}|${colorRange?.[0]}|${colorRange?.[1]}|${filterRange?.[0]}|${filterRange?.[1]}|${countRange.min}|${countRange.max}|${selectedHexagonId}|${hoveredHexagonId}|${theme}|${ttTrigger}`;
const postcodeColorTrigger = `${viewFeature}|${colorRange?.[0]}|${colorRange?.[1]}|${filterRange?.[0]}|${filterRange?.[1]}|${postcodeCountRange.min}|${postcodeCountRange.max}|${selectedPostcode}|${hoveredPostcode}|${theme}|${ttTrigger}`;
const postcodeColorTrigger = `${viewFeature}|${colorRange?.[0]}|${colorRange?.[1]}|${filterRange?.[0]}|${filterRange?.[1]}|${postcodeCountRange.min}|${postcodeCountRange.max}|${hoveredPostcode}|${theme}|${ttTrigger}`;
// --- Layers ---
const hexLayer = useMemo(
@ -423,8 +430,6 @@ export function useDeckLayers({
getLineColor: (f) => {
const pc = f.properties.postcode;
const dark = isDarkRef.current;
if (pc === selectedPostcodeRef.current)
return [255, 255, 255, 255] as [number, number, number, number];
if (pc === hoveredPostcodeRef.current)
return [29, 228, 195, 200] as [number, number, number, number];
return (dark ? [180, 170, 160, 100] : [100, 100, 100, 150]) as [
@ -436,7 +441,6 @@ export function useDeckLayers({
},
getLineWidth: (f) => {
const pc = f.properties.postcode;
if (pc === selectedPostcodeRef.current) return 3;
if (pc === hoveredPostcodeRef.current) return 2;
return 1;
},
@ -500,37 +504,28 @@ export function useDeckLayers({
[pois, stablePoiHover]
);
// Check if the selected postcode has data (passes current filters)
const selectedPostcodeHasData = useMemo(() => {
if (!selectedPostcodeGeometry || !selectedHexagonId) return false;
return postcodeData.some((f) => f.properties.postcode === selectedHexagonId);
}, [selectedPostcodeGeometry, selectedHexagonId, postcodeData]);
// Highlight layer for selected postcode (from search)
const selectedPostcodeHighlightLayer = useMemo(() => {
// Marching ants highlight layer for selected postcode (click or search)
const marchingAntsLayer = useMemo(() => {
if (!selectedPostcodeGeometry) return null;
const hasData = selectedPostcodeHasData;
const feature = {
type: 'Feature' as const,
geometry: selectedPostcodeGeometry,
properties: {},
};
return new GeoJsonLayer({
id: 'searched-postcode-highlight',
data: [feature],
getFillColor: hasData
? [29, 228, 195, 40] // teal tint when has data
: [255, 180, 0, 30], // orange tint when filtered out
getLineColor: hasData
? [29, 228, 195, 255] // solid teal when has data
: [255, 180, 0, 200], // orange when filtered out (no matching properties)
getLineWidth: hasData ? 4 : 3,
lineWidthUnits: 'pixels',
id: 'marching-ants',
data: [
{
type: 'Feature' as const,
geometry: selectedPostcodeGeometry,
properties: {},
},
],
filled: false,
stroked: true,
filled: true,
getLineColor: [29, 228, 195, 255],
getLineWidth: 3,
lineWidthUnits: 'pixels' as const,
pickable: false,
marchTime,
extensions: [new MarchingAntsExtension()],
});
}, [selectedPostcodeGeometry, selectedPostcodeHasData]);
}, [selectedPostcodeGeometry, marchTime]);
// Destination markers: one red dot per mode with a destination
const destinationMarkerData = useMemo(() => {
@ -566,7 +561,7 @@ export function useDeckLayers({
const baseLayers: any[] = usePostcodeView
? [postcodeLayer, postcodeLabelsLayer, poiLayer]
: [hexLayer, poiLayer];
if (selectedPostcodeHighlightLayer) baseLayers.push(selectedPostcodeHighlightLayer);
if (marchingAntsLayer) baseLayers.push(marchingAntsLayer);
if (destinationMarkerLayer) baseLayers.push(destinationMarkerLayer);
return baseLayers;
}, [
@ -575,7 +570,7 @@ export function useDeckLayers({
postcodeLayer,
postcodeLabelsLayer,
poiLayer,
selectedPostcodeHighlightLayer,
marchingAntsLayer,
destinationMarkerLayer,
]);
@ -594,7 +589,6 @@ export function useDeckLayers({
postcodeCountRange,
colorFeatureMeta,
handleMouseLeave,
selectedPostcode,
hoveredPostcode,
primaryTravelMode,
};

View file

@ -99,15 +99,16 @@ export function useHexagonSelection({ filters, features, resolution }: UseHexago
);
const handleHexagonClick = useCallback(
(id: string, isPostcode = false) => {
setSelectedPostcodeGeometry(null);
(id: string, isPostcode = false, geometry?: PostcodeGeometry) => {
if (selectedHexagon?.id === id) {
setSelectedHexagon(null);
setProperties([]);
setAreaStats(null);
setSelectedPostcodeGeometry(null);
} else {
const type = isPostcode ? 'postcode' : 'hexagon';
setSelectedHexagon({ id, type, resolution });
setSelectedPostcodeGeometry(isPostcode && geometry ? geometry : null);
setProperties([]);
setPropertiesTotal(0);
setPropertiesOffset(0);

View file

@ -0,0 +1,86 @@
import { useState, useCallback, useMemo } from 'react';
import type { Step, CallBackProps } from 'react-joyride';
import { ACTIONS, EVENTS, STATUS } from 'react-joyride';
const STORAGE_KEY = 'tutorial_completed';
const STEPS: Step[] = [
{
target: '[data-tutorial="filters"]',
title: 'Filter Properties',
content:
'Use filters to narrow down properties by price, energy rating, floor area, and more. Pin a filter to colour the map by that feature.',
placement: 'right',
disableBeacon: true,
},
{
target: '[data-tutorial="map"]',
title: 'Explore the Map',
content:
'Pan and zoom to explore property data across the UK. Click any hexagon to see detailed stats and individual properties.',
placement: 'bottom',
disableBeacon: true,
},
{
target: '[data-tutorial="search"]',
title: 'Search Locations',
content:
'Search for a place name or postcode to jump directly to that area on the map.',
placement: 'bottom',
disableBeacon: true,
},
{
target: '[data-tutorial="right-pane"]',
title: 'Area Stats & Properties',
content:
'After clicking a hexagon, view aggregated area statistics or browse individual properties in this pane.',
placement: 'left',
disableBeacon: true,
},
{
target: '[data-tutorial="poi-button"]',
title: 'Points of Interest',
content:
'Toggle points of interest like schools, shops, and transport stops to see what amenities are nearby.',
placement: 'left',
disableBeacon: true,
},
];
/**
 * Whether the "tutorial completed" flag is present in localStorage.
 * Storage access can throw (e.g. when the browser blocks site data); in that
 * case the tutorial is treated as completed so it does not reappear on every
 * visit without any way to remember that it was dismissed.
 */
function hasCompletedTutorial(): boolean {
  try {
    return Boolean(localStorage.getItem(STORAGE_KEY));
  } catch {
    return true;
  }
}

/** Persist the "tutorial completed" flag; storage failures are ignored (best effort). */
function markTutorialCompleted(): void {
  try {
    localStorage.setItem(STORAGE_KEY, '1');
  } catch {
    // Storage unavailable or full: the tour still stops for this session.
  }
}

/** Remove the "tutorial completed" flag; storage failures are ignored (best effort). */
function clearTutorialCompleted(): void {
  try {
    localStorage.removeItem(STORAGE_KEY);
  } catch {
    // Storage unavailable: nothing to clear.
  }
}

/**
 * State for the guided tour rendered by react-joyride.
 *
 * @param initialLoading - while true the tour is held back (`run` stays false).
 * @param isMobile - the tour never runs on mobile; a mobile first render also
 *   leaves the internal flag off until `resetTutorial` is called.
 * @returns a memoised object with the tour `steps`, the effective `run` flag,
 *   the Joyride `handleCallback`, and `resetTutorial` to replay the tour.
 */
export function useTutorial(initialLoading: boolean, isMobile: boolean) {
  const [run, setRun] = useState(() => {
    if (isMobile) return false;
    return !hasCompletedTutorial();
  });

  const shouldRun = run && !initialLoading && !isMobile;

  const handleCallback = useCallback((data: CallBackProps) => {
    const { status, action, type } = data;
    const finishedOrSkipped = status === STATUS.FINISHED || status === STATUS.SKIPPED;
    // Closing a tooltip via the X button also ends the tour for good.
    const closedByUser = action === ACTIONS.CLOSE && type === EVENTS.STEP_AFTER;
    if (finishedOrSkipped || closedByUser) {
      markTutorialCompleted();
      setRun(false);
    }
  }, []);

  const resetTutorial = useCallback(() => {
    clearTutorialCompleted();
    setRun(true);
  }, []);

  return useMemo(
    () => ({
      steps: STEPS,
      run: shouldRun,
      handleCallback,
      resetTutorial,
    }),
    [shouldRun, handleCallback, resetTutorial]
  );
}

View file

@ -40,6 +40,17 @@ h3 {
color 0.2s ease;
}
/* Hexagon background animations */
/* Horizontal sweep: translates the element from -5vw to 105vw along X. */
@keyframes hex-drift {
from { transform: translateX(-5vw); }
to { transform: translateX(105vw); }
}
/* Vertical bob: oscillates between +var(--bob) and -var(--bob) along Y.
   The animated element must define the --bob custom property (a length). */
@keyframes hex-bob {
0%, 100% { transform: translateY(var(--bob)); }
50% { transform: translateY(calc(var(--bob) * -1)); }
}
/* Fade-in animation for homepage sections */
.fade-in-section {
opacity: 0;

View file

@ -0,0 +1,53 @@
import { LayerExtension } from '@deck.gl/core';
/**
 * Animates a marching-ants border on PathLayer sublayers (alternating white/green dashes).
 *
 * The extension adds a numeric `marchTime` layer prop (default 0) and forwards
 * it to the fragment shader, where it offsets the dash pattern along the path.
 * Changing `marchTime` over time is what makes the dashes appear to move.
 */
export class MarchingAntsExtension extends LayerExtension {
static extensionName = 'MarchingAntsExtension';
static defaultProps = {
marchTime: { type: 'number', value: 0 },
};
/** Only layers whose state holds a `pathTesselator` are affected; all others are left untouched. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
isEnabled(layer: any): boolean {
return 'pathTesselator' in layer.state;
}
/**
 * Returns the shader module injected into enabled layers, or null otherwise.
 * NOTE(review): `this` is used as the layer here and the extension arrives as
 * the argument — confirm against the deck.gl LayerExtension contract.
 *
 * The injected colour filter splits `vPathPosition.y` (shifted by `marchTime`)
 * into repeating segments of length 4.0: the first half of each 8.0-long
 * period is painted white, the second half teal (0.114, 0.894, 0.765).
 * The incoming alpha is preserved in both cases.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
getShaders(extension: any): any {
if (!extension.isEnabled(this)) return null;
return {
modules: [
{
name: 'marchingAnts',
inject: {
'fs:#decl': `\
uniform marchingAntsUniforms {
float marchTime;
} marchingAnts;`,
'fs:DECKGL_FILTER_COLOR': `\
float marchSegLen = 4.0;
float marchPos = mod(vPathPosition.y - marchingAnts.marchTime, marchSegLen * 2.0);
if (marchPos < marchSegLen) {
color = vec4(1.0, 1.0, 1.0, color.a);
} else {
color = vec4(0.114, 0.894, 0.765, color.a);
}`,
},
uniformTypes: {
marchTime: 'f32',
},
},
],
};
}
/** Copies the layer's current `marchTime` prop (0 when unset) into the `marchingAnts` shader module props. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
updateState(_params: any, extension: any): void {
if (!extension.isEnabled(this)) return;
// @ts-expect-error setShaderModuleProps exists on Layer
this.setShaderModuleProps({
// @ts-expect-error marchTime is a custom prop from this extension
marchingAnts: { marchTime: this.props.marchTime || 0 },
});
}
}

View file

@ -0,0 +1,52 @@
import type { Styles } from 'react-joyride';
/**
 * Build the react-joyride style overrides for the given colour theme.
 *
 * Sizes, radii and the primary colour are shared by both themes; only the
 * surface, overlay and text colours are swapped between light and dark.
 */
export function getTutorialStyles(theme: 'light' | 'dark'): Partial<Styles> {
  const palette =
    theme === 'dark'
      ? {
          surface: '#292524',
          overlay: 'rgba(10,14,26,0.75)',
          text: '#d6d3d1',
          title: '#f5f5f4',
          muted: '#a8a29e',
          faint: '#78716c',
        }
      : {
          surface: '#ffffff',
          overlay: 'rgba(0,0,0,0.5)',
          text: '#44403c',
          title: '#0a0e1a',
          muted: '#78716c',
          faint: '#a8a29e',
        };

  return {
    options: {
      // Arrow and tooltip body share the same surface colour.
      arrowColor: palette.surface,
      backgroundColor: palette.surface,
      overlayColor: palette.overlay,
      primaryColor: '#00a28c',
      textColor: palette.text,
      zIndex: 1000,
    },
    tooltip: { borderRadius: 8, padding: 16 },
    tooltipTitle: { color: palette.title, fontSize: 15, fontWeight: 600 },
    tooltipContent: { fontSize: 13, lineHeight: 1.5, padding: '8px 0 0' },
    buttonNext: { borderRadius: 6, fontSize: 13, fontWeight: 500, padding: '6px 14px' },
    buttonBack: { color: palette.muted, fontSize: 13, fontWeight: 500, marginRight: 8 },
    buttonSkip: { color: palette.faint, fontSize: 12 },
    buttonClose: { color: palette.muted },
    spotlight: { borderRadius: 8 },
  };
}

View file

@ -16,7 +16,6 @@ from shapely import wkb
from shapely.geometry import MultiPolygon, Polygon
from tqdm import tqdm
from .pois import download_pbf
MIN_AREA_SQM = 5_000 # ~70m x 70m — skip pocket parks and small ponds
@ -103,12 +102,7 @@ def main():
)
args = parser.parse_args()
if args.pbf.exists():
pbf_file = args.pbf
print(f"Using existing PBF: {pbf_file}")
else:
download_pbf(args.pbf)
pbf_file = args.pbf
print("Extracting greenspace/water areas from PBF (two-pass area assembly)...")
with tqdm(
unit=" areas", unit_scale=True, desc="Processing", smoothing=0.05

View file

@ -1,121 +0,0 @@
"""Shared utilities for price index, price estimate, and renovation premium scripts."""
import numpy as np
import polars as pl
CURRENT_YEAR = 2025
TERRACE_TYPES = [
"Mid-Terrace",
"End-Terrace",
"Enclosed Mid-Terrace",
"Enclosed End-Terrace",
"Terraced",
]
FLAT_TYPES = ["Flats/Maisonettes", "Flat", "Maisonette"]
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]
SHRINKAGE_K = 50
def type_group_expr():
    """Polars expression mapping "Property type" to a coarse "type_group".

    Values in TERRACE_TYPES become "Terraced", values in FLAT_TYPES become
    "Flats", "Bungalow" stays "Bungalow", and "Detached" / "Semi-Detached"
    pass through unchanged. Anything else maps to null.
    """
    prop_type = pl.col("Property type")
    is_terrace = prop_type.is_in(TERRACE_TYPES)
    is_flat = prop_type.is_in(FLAT_TYPES)
    is_bungalow = prop_type == "Bungalow"
    is_house = prop_type.is_in(["Detached", "Semi-Detached"])

    grouped = (
        pl.when(is_terrace)
        .then(pl.lit("Terraced"))
        .when(is_flat)
        .then(pl.lit("Flats"))
        .when(is_bungalow)
        .then(pl.lit("Bungalow"))
        .when(is_house)
        .then(prop_type)
        .otherwise(pl.lit(None))
    )
    return grouped.alias("type_group")
def sector_expr():
    """Polars expression deriving "sector" from "Postcode".

    The last two characters of the postcode are dropped and the remainder is
    stripped of surrounding whitespace.
    """
    postcode = pl.col("Postcode")
    kept_chars = postcode.str.len_chars() - 2
    without_unit = postcode.str.slice(0, kept_chars)
    return without_unit.str.strip_chars().alias("sector")
def hierarchy_keys(sector: str) -> tuple[str, str]:
    """Split a sector string into its parent keys.

    Returns ``(district, area)``: the district is everything before the last
    space (or the whole string when there is no space), and the area is the
    leading run of alphabetic characters of the district.
    """
    head, sep, _tail = sector.rpartition(" ")
    district = head if sep else sector

    # Length of the alphabetic prefix: index of the first non-letter, if any.
    prefix_len = next(
        (pos for pos, ch in enumerate(district) if not ch.isalpha()),
        len(district),
    )
    return district, district[:prefix_len]
AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]
AGE_LABELS = [
"pre-1900",
"1900-1929",
"1930-1949",
"1950-1966",
"1967-1982",
"1983-1999",
"2000-2009",
"2010+",
]
HEDONIC_COLUMNS = [
"Last known price",
"Date of last transaction",
"Property type",
"Total floor area (sqm)",
"Postcode",
]
def age_band_expr():
    """Polars expression mapping "Construction age" (a year) to an "age_band" label.

    Null ages stay null. Otherwise the first break in AGE_BREAKS that the year
    falls below selects the matching AGE_LABELS entry; years at or above the
    final break get the last label.
    """
    age = pl.col("Construction age")
    banded = pl.when(age.is_null()).then(pl.lit(None))
    # AGE_LABELS has one more entry than AGE_BREAKS; the extra one is the fallback.
    for upper_bound, label in zip(AGE_BREAKS, AGE_LABELS):
        banded = banded.when(age < upper_bound).then(pl.lit(label))
    newest_label = AGE_LABELS[-1]
    return banded.otherwise(pl.lit(newest_label)).alias("age_band")
NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"]
def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:
    """Build the hedonic feature matrix from a DataFrame with a type_group column.

    Columns (5 total): log(floor_area), then one 0/1 dummy per entry of
    NON_REF_TYPES (reference category: Detached). Floor areas below 1 are
    clamped to 1 before taking the log.

    Sector fixed effects do the heavy lifting; additional property features
    (EPC, rooms, age) add no predictive value after sector demeaning.
    """
    floor_area = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
    log_area_col = np.log(np.maximum(floor_area, 1.0)).reshape(-1, 1)

    groups = df["type_group"].to_numpy()
    dummy_cols = [
        (groups == name).astype(np.float32).reshape(-1, 1) for name in NON_REF_TYPES
    ]
    return np.hstack([log_area_col, *dummy_cols])
def extract_centroids(input_path) -> dict[str, tuple[float, float]]:
    """Return ``{sector: (mean_lat, mean_lon)}`` computed from the parquet at input_path.

    Rows with a null Postcode or null lat are excluded before averaging.
    Progress is reported on stdout.
    """
    print("Computing sector centroids...")
    located = (
        pl.scan_parquet(input_path)
        .select("Postcode", "lat", "lon")
        .filter(pl.col("Postcode").is_not_null(), pl.col("lat").is_not_null())
    )
    per_sector = (
        located.with_columns(sector_expr())
        .group_by("sector")
        .agg(pl.col("lat").mean(), pl.col("lon").mean())
        .collect()
    )
    centroids = {
        rec["sector"]: (rec["lat"], rec["lon"])
        for rec in per_sector.iter_rows(named=True)
    }
    print(f" {len(centroids):,} sector centroids")
    return centroids

View file

@ -1,300 +0,0 @@
"""Cross-Sectional Hedonic Model (Per-Type)
Trains separate OLS models per property type on recent sales (last 5 years)
with sector fixed effects via Frisch-Waugh-Lovell demeaning:
log(price) = beta_type * log(floor_area) + alpha_sector_type + epsilon
Each type gets its own floor area elasticity and sector intercepts, capturing
that detached houses (beta=0.74) have higher price sensitivity to size than
terraced houses (beta=0.60), and a sector's value differs by property type.
Sector intercepts are hierarchically shrunk (sector -> district -> area -> national)
and spatially smoothed via KD-tree nearest neighbors.
Output: hedonic_model.json with per-type betas and sector intercepts.
"""
import argparse
import json
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import KDTree
from pipeline.transform._price_utils import (
CURRENT_YEAR,
HEDONIC_COLUMNS,
SHRINKAGE_K,
TYPE_GROUPS,
extract_centroids,
hierarchy_keys,
sector_expr,
type_group_expr,
)
TRAINING_YEARS = 5
SPATIAL_NEIGHBORS = 5
SPATIAL_BLEND_K = 30
def load_training_data(input_path: Path) -> pl.DataFrame:
    """Load the sales used to train the hedonic models.

    Reads HEDONIC_COLUMNS from the parquet at input_path and keeps rows that
    have a price, a positive floor area and a postcode, whose property type
    maps to a type group, and whose sale year lies in
    [CURRENT_YEAR - TRAINING_YEARS, CURRENT_YEAR]. Adds the derived columns
    sale_year, type_group and sector. Progress is reported on stdout.
    """
    min_year = CURRENT_YEAR - TRAINING_YEARS
    print(f"Loading training data (sales {min_year}-{CURRENT_YEAR})...")

    price = pl.col("Last known price")
    floor_area = pl.col("Total floor area (sqm)")
    sale_year = pl.col("sale_year")

    complete = (
        pl.scan_parquet(input_path)
        .select(*HEDONIC_COLUMNS)
        .filter(
            price.is_not_null(),
            floor_area.is_not_null(),
            floor_area > 0,
            pl.col("Postcode").is_not_null(),
        )
    )
    derived = complete.with_columns(
        pl.col("Date of last transaction").dt.year().alias("sale_year"),
        type_group_expr(),
        sector_expr(),
    )
    recent = derived.filter(
        pl.col("type_group").is_not_null(),
        sale_year.is_not_null(),
        sale_year >= min_year,
        sale_year <= CURRENT_YEAR,
    )
    df = recent.collect()
    print(f" {len(df):,} complete cases")
    return df
def train_type_model(
    df: pl.DataFrame, type_group: str
) -> tuple[float, dict[str, float], dict[str, int], float]:
    """Fit the single-type hedonic model with sector fixed effects.

    Rows of df whose type_group matches are regressed as
    log(price) ~ log(floor_area) after subtracting per-sector means from both
    sides; the sector intercepts are then recovered from those means. A
    one-line fit summary (n, beta, R-squared, sector count) is printed.

    Returns (beta_fa, sector_intercepts, sector_counts, national_intercept),
    where national_intercept is the unweighted mean of the sector intercepts.
    """
    subset = df.filter(pl.col("type_group") == type_group)
    log_price = np.log(subset["Last known price"].to_numpy().astype(np.float64))
    floor_area = subset["Total floor area (sqm)"].to_numpy().astype(np.float64)
    # Single-column design matrix: log floor area, clamped at 1 sqm.
    design = np.log(np.maximum(floor_area, 1.0)).reshape(-1, 1)
    sectors = subset["sector"].to_list()

    # Row positions belonging to each sector, in first-seen order.
    rows_by_sector: dict[str, list[int]] = {}
    for row, sector in enumerate(sectors):
        if sector not in rows_by_sector:
            rows_by_sector[sector] = []
        rows_by_sector[sector].append(row)

    # Within-sector demeaning of both the regressor and the target.
    design_within = np.empty_like(design)
    price_within = np.empty_like(log_price)
    mean_design: dict[str, np.ndarray] = {}
    mean_price: dict[str, float] = {}
    sector_counts: dict[str, int] = {}
    for sector, rows in rows_by_sector.items():
        take = np.array(rows)
        mean_design[sector] = design[take].mean(axis=0)
        mean_price[sector] = log_price[take].mean()
        design_within[take] = design[take] - mean_design[sector]
        price_within[take] = log_price[take] - mean_price[sector]
        sector_counts[sector] = len(rows)

    # OLS on the demeaned data gives the floor-area elasticity.
    coef = np.linalg.lstsq(design_within, price_within, rcond=None)[0]
    beta_fa = float(coef[0])

    # Each sector's intercept is its mean log price net of the floor-area effect.
    sector_intercepts = {
        sector: float(mean_price[sector] - beta_fa * mean_design[sector][0])
        for sector in rows_by_sector
    }
    national_intercept = float(np.mean(list(sector_intercepts.values())))

    # R-squared of the full model (slope plus sector intercepts).
    offsets = np.array([sector_intercepts[sector] for sector in sectors])
    fitted = design[:, 0] * beta_fa + offsets
    ss_res = np.sum((log_price - fitted) ** 2)
    ss_tot = np.sum((log_price - log_price.mean()) ** 2)
    r2 = 1 - ss_res / ss_tot

    print(
        f" {type_group:<15s}: n={len(subset):>9,} β_fa={beta_fa:.4f} "
        f"R²={r2:.4f} sectors={len(sector_intercepts):,}"
    )
    return beta_fa, sector_intercepts, sector_counts, national_intercept
def _pooled_intercepts(
    sector_intercepts: dict[str, float],
    sector_counts: dict[str, int],
    group_of: dict[str, str],
) -> tuple[dict[str, float], dict[str, int]]:
    """Count-weighted mean intercept and total count for each group of sectors.

    ``group_of`` maps every sector to the group (district or area) it belongs
    to. A group whose sectors all have a zero count falls back to the plain
    mean of its sector intercepts.
    """
    members: dict[str, list[tuple[float, int]]] = {}
    for sector, value in sector_intercepts.items():
        members.setdefault(group_of[sector], []).append(
            (value, sector_counts.get(sector, 0))
        )

    means: dict[str, float] = {}
    totals: dict[str, int] = {}
    for group, entries in members.items():
        total_n = sum(n for _, n in entries)
        if total_n > 0:
            means[group] = sum(v * n for v, n in entries) / total_n
        else:
            means[group] = sum(v for v, _ in entries) / len(entries)
        totals[group] = total_n
    return means, totals


def _shrink_toward(value: float, parent: float, n: int) -> float:
    """Blend ``value`` with ``parent`` using the weight ``n / (n + SHRINKAGE_K)``."""
    w = n / (n + SHRINKAGE_K)
    return w * value + (1 - w) * parent


def shrink_intercepts(
    sector_intercepts: dict[str, float],
    sector_counts: dict[str, int],
) -> dict[str, float]:
    """Hierarchical shrinkage: sector -> district -> area -> national.

    Each level is pulled towards its (already shrunk) parent with weight
    ``n / (n + SHRINKAGE_K)``, so levels with few observations lean on the
    parent and levels with many keep their own estimate.

    Args:
        sector_intercepts: Raw intercept per sector.
        sector_counts: Number of observations per sector; sectors missing from
            this mapping count as 0.

    Returns:
        A new dict with one shrunk intercept per input sector. Empty input
        yields an empty dict.
    """
    if not sector_intercepts:
        # Avoid np.mean([]) (NaN plus a RuntimeWarning); nothing to shrink.
        return {}

    national = float(np.mean(list(sector_intercepts.values())))

    sector_to_dist: dict[str, str] = {}
    dist_to_area: dict[str, str] = {}
    for sector in sector_intercepts:
        district, area = hierarchy_keys(sector)
        sector_to_dist[sector] = district
        dist_to_area[district] = area
    sector_to_area = {s: dist_to_area[d] for s, d in sector_to_dist.items()}

    area_means, area_counts = _pooled_intercepts(
        sector_intercepts, sector_counts, sector_to_area
    )
    dist_means, dist_counts = _pooled_intercepts(
        sector_intercepts, sector_counts, sector_to_dist
    )

    # Top-down: each level shrinks towards the shrunk value of its parent.
    area_shrunk = {
        a: _shrink_toward(val, national, area_counts[a])
        for a, val in area_means.items()
    }
    dist_shrunk = {
        d: _shrink_toward(
            val, area_shrunk.get(dist_to_area[d], national), dist_counts[d]
        )
        for d, val in dist_means.items()
    }
    return {
        s: _shrink_toward(
            val,
            dist_shrunk.get(sector_to_dist[s], national),
            sector_counts.get(s, 0),
        )
        for s, val in sector_intercepts.items()
    }
def spatial_smooth_intercepts(
    sector_intercepts: dict[str, float],
    centroids: dict[str, tuple[float, float]],
    sector_counts: dict[str, int],
) -> dict[str, float]:
    """Blend sparse sector intercepts with their K nearest neighbours.

    A sector keeps the share ``n / (n + SPATIAL_BLEND_K)`` of its own
    intercept; the remainder is an inverse-distance-weighted mean of the
    intercepts of its ``SPATIAL_NEIGHBORS`` nearest sectors. Sectors whose own
    share exceeds 0.95, sectors without a centroid, and sectors whose
    neighbours all sit at distance zero are left unchanged.

    Args:
        sector_intercepts: Intercept per sector (not modified).
        centroids: Sector -> coordinate pair. The first component is treated
            as latitude and the second is scaled by cos(mean latitude).
        sector_counts: Observation count per sector; missing sectors count as 0.

    Returns:
        A new dict in every case, including the early exit, so callers can
        mutate the result without touching the input.
    """
    result = dict(sector_intercepts)
    located = [s for s in sector_intercepts if s in centroids]
    if len(located) < SPATIAL_NEIGHBORS + 1:
        return result

    coords = np.array([centroids[s] for s in located])
    # Shrink the second coordinate so Euclidean distance approximates ground
    # distance at the data's mean latitude.
    lon_scale = np.cos(np.radians(np.mean(coords[:, 0])))
    scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * lon_scale])

    # Only sectors that are not already dominated by their own data are smoothed.
    sparse_rows: list[int] = []
    self_weights: list[float] = []
    for row, sector in enumerate(located):
        n = sector_counts.get(sector, 0)
        self_w = n / (n + SPATIAL_BLEND_K)
        if self_w > 0.95:
            continue
        sparse_rows.append(row)
        self_weights.append(self_w)
    if not sparse_rows:
        return result

    # One batched query instead of one KD-tree query per sector.
    tree = KDTree(scaled_coords)
    all_dists, all_idxs = tree.query(
        scaled_coords[sparse_rows], k=SPATIAL_NEIGHBORS + 1
    )

    for row, self_w, dists, idxs in zip(sparse_rows, self_weights, all_dists, all_idxs):
        # The first hit is dropped as the query point itself; neighbours at
        # distance zero are dropped because they have no finite inverse distance.
        neighbours = [
            (1.0 / d, sector_intercepts[located[j]])
            for d, j in zip(dists[1:], idxs[1:])
            if d > 0
        ]
        if not neighbours:
            continue
        total_inv = sum(inv_d for inv_d, _ in neighbours)
        nbr_w = 1.0 - self_w
        sector = located[row]
        blended = self_w * sector_intercepts[sector]
        for inv_d, value in neighbours:
            blended += nbr_w * (inv_d / total_inv) * value
        result[sector] = float(blended)
    return result
def main():
    """CLI entry point: train one hedonic model per type group and save as JSON.

    For every entry of ``TYPE_GROUPS`` the raw sector intercepts are fitted,
    shrunk through the hierarchy, spatially smoothed, and stored together with
    the floor-area slope and the national intercept under ``type_models`` in
    the output file.
    """
    parser = argparse.ArgumentParser(description="Train cross-sectional hedonic model")
    for flag, help_text in (
        ("--input", "Path to wide.parquet"),
        ("--output", "Output hedonic_model.json"),
    ):
        parser.add_argument(flag, type=Path, required=True, help=help_text)
    args = parser.parse_args()

    df = load_training_data(args.input)
    centroids = extract_centroids(args.input)

    print("\nTraining per-type models...")
    type_models = {}
    for tg in TYPE_GROUPS:
        beta_fa, raw_intercepts, sector_counts, national = train_type_model(df, tg)
        smoothed = spatial_smooth_intercepts(
            shrink_intercepts(raw_intercepts, sector_counts),
            centroids,
            sector_counts,
        )
        type_models[tg] = {
            "beta_fa": beta_fa,
            "sector_intercepts": smoothed,
            "national_intercept": national,
        }
    total_sectors = sum(len(m["sector_intercepts"]) for m in type_models.values())

    # Output
    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, "w") as f:
        json.dump({"type_models": type_models}, f, indent=2)

    size_kb = args.output.stat().st_size / 1024
    print(f"\nWrote {args.output} ({size_kb:.0f} KB)")
    print(f" {len(TYPE_GROUPS)} type models, {total_sectors:,} total sector intercepts")


if __name__ == "__main__":
    main()

View file

@ -1,385 +0,0 @@
"""Backtesting: Evaluate price index model on held-out recent sales.
Test set: properties with 2+ sales where the last sale is 2022-2025.
Uses the second-to-last sale as input, predicts the last sale price.
Compares index-based prediction against a naive baseline (raw input price).
Uses type-stratified index when available, falling back to "All" type.
Output: backtest_results.parquet with predictions vs actuals.
"""
import argparse
import json
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform._price_utils import (
CURRENT_YEAR,
HEDONIC_COLUMNS,
sector_expr,
type_group_expr,
)
TEST_YEAR_MIN = 2022
def extract_test_set(
    input_path: Path, include_hedonic_cols: bool = False
) -> pl.DataFrame:
    """Build the backtest pairs from properties with at least two sales.

    The second-to-last entry of ``historical_prices`` becomes the model input
    (``input_year`` / ``input_price``) and the last entry becomes the ground
    truth (``actual_year`` / ``actual_price``). Pairs are kept when the last
    sale is in ``TEST_YEAR_MIN`` or later, both prices are positive, and the
    last sale is in a strictly later year than the input sale.

    Args:
        input_path: Parquet file to scan.
        include_hedonic_cols: Also load every column named in
            ``HEDONIC_COLUMNS`` (needed for hedonic blending).

    Returns:
        One row per test pair, including the ``sector_expr()`` and
        ``type_group_expr()`` columns.
    """
    print("Loading test set...")

    wanted = ["Postcode", "historical_prices", "Property type"]
    if include_hedonic_cols:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        wanted = list(dict.fromkeys(wanted + list(HEDONIC_COLUMNS)))

    history = pl.col("historical_prices")
    last_sale = history.list.last()
    prior_sale = history.list.get(-2)

    # NOTE(review): the log line below reports an upper bound of CURRENT_YEAR,
    # but no filter enforces it - confirm whether later years can occur.
    pairs = (
        pl.scan_parquet(input_path)
        .select(wanted)
        .filter(
            pl.col("Postcode").is_not_null(),
            history.list.len() >= 2,
        )
        .with_columns(
            sector_expr(),
            type_group_expr(),
            # Ground truth: the most recent sale.
            last_sale.struct.field("year").alias("actual_year"),
            last_sale.struct.field("price").alias("actual_price"),
            # Model input: the sale before it.
            prior_sale.struct.field("year").alias("input_year"),
            prior_sale.struct.field("price").alias("input_price"),
        )
        .filter(
            pl.col("actual_year") >= TEST_YEAR_MIN,
            pl.col("input_price") > 0,
            pl.col("actual_price") > 0,
            pl.col("actual_year") > pl.col("input_year"),
        )
        .collect()
    )
    print(f" {len(pairs):,} test pairs (last sale {TEST_YEAR_MIN}-{CURRENT_YEAR})")
    return pairs
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
    """Roll each input price forward with the price index.

    Adds ``log_index_input`` and ``log_index_actual`` (the index at the input
    and actual sale years) and ``predicted``, which is
    ``input_price * exp(log_index_actual - log_index_input)``. When either
    index value is missing the prediction falls back to the raw input price.

    If the index has a ``type_group`` column, the type-specific value is
    preferred and the ``"All"`` series fills the gaps.
    """

    def attach(frame, source, keys, year_col, alias):
        # Left-join `source`'s log_index onto `frame` for the given sale year.
        lookup = source.select(*keys, "year", pl.col("log_index").alias(alias))
        return frame.join(
            lookup,
            left_on=[*keys, year_col],
            right_on=[*keys, "year"],
            how="left",
        )

    if "type_group" in index.columns:
        typed = index.filter(pl.col("type_group") != "All")
        pooled = index.filter(pl.col("type_group") == "All")
        typed_keys = ["sector", "type_group"]
        pooled_keys = ["sector"]

        test = attach(test, typed, typed_keys, "input_year", "li_in_typed")
        test = attach(test, pooled, pooled_keys, "input_year", "li_in_all")
        test = attach(test, typed, typed_keys, "actual_year", "li_act_typed")
        test = attach(test, pooled, pooled_keys, "actual_year", "li_act_all")

        # NOTE(review): the fallback is decided per year independently, so a
        # row can pair a typed value at one year with an "All" value at the
        # other - confirm the two series are comparable.
        test = test.with_columns(
            pl.col("li_in_typed")
            .fill_null(pl.col("li_in_all"))
            .alias("log_index_input"),
            pl.col("li_act_typed")
            .fill_null(pl.col("li_act_all"))
            .alias("log_index_actual"),
        )
    else:
        # Unstratified index: a single series per sector.
        test = attach(test, index, ["sector"], "input_year", "log_index_input")
        test = attach(test, index, ["sector"], "actual_year", "log_index_actual")

    input_price = pl.col("input_price").cast(pl.Float64)
    growth = (pl.col("log_index_actual") - pl.col("log_index_input")).exp()
    return test.with_columns(
        (input_price * growth).fill_null(input_price).alias("predicted"),
    )
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
    """Summarise prediction accuracy over the usable rows.

    A row is usable when both values are finite and the actual price is
    positive; everything else is dropped before any metric is computed.

    Args:
        actual: Observed prices.
        predicted: Predicted prices, same length as ``actual``.

    Returns:
        A dict with the median absolute percentage error, the share of rows
        within 10/20/30 % of the actual price, the mean absolute error, the
        mean signed error (predicted - actual) and the usable row count ``n``.
        With no usable rows every metric is NaN and ``n`` is 0.
    """
    actual = np.asarray(actual, dtype=np.float64)
    predicted = np.asarray(predicted, dtype=np.float64)

    usable = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)
    actual = actual[usable]
    predicted = predicted[usable]

    if actual.size == 0:
        # Same values NumPy would produce for empty slices, but without the
        # "Mean of empty slice" RuntimeWarnings.
        nan = float("nan")
        return {
            "MdAPE (%)": nan,
            "% within 10%": nan,
            "% within 20%": nan,
            "% within 30%": nan,
            "MAE (£)": nan,
            "Mean signed error (£)": nan,
            "n": 0,
        }

    signed_err = predicted - actual
    ape = np.abs(signed_err) / actual
    return {
        "MdAPE (%)": float(np.median(ape) * 100),
        "% within 10%": float(np.mean(ape <= 0.10) * 100),
        "% within 20%": float(np.mean(ape <= 0.20) * 100),
        "% within 30%": float(np.mean(ape <= 0.30) * 100),
        "MAE (£)": float(np.mean(np.abs(signed_err))),
        "Mean signed error (£)": float(np.mean(signed_err)),
        "n": int(len(actual)),
    }
def print_metrics_table(metrics_by_stage: dict):
    """Print the backtest metrics as a fixed-width table, one column per stage.

    Args:
        metrics_by_stage: Stage name -> metrics dict as returned by
            ``compute_metrics``. Column order follows the dict's key order.

    The label column is 25 characters wide and every stage column 15, so the
    horizontal rules grow with the number of stages (never below the
    historical 55 characters).
    """
    label_width = 25
    cell_width = 15  # one separating space plus 14 characters of content
    stages = list(metrics_by_stage.keys())
    rule_width = max(55, label_width + cell_width * len(stages))

    print("\n" + "=" * rule_width)
    print("BACKTEST RESULTS")
    print("=" * rule_width)

    metric_names = [
        "MdAPE (%)",
        "% within 10%",
        "% within 20%",
        "% within 30%",
        "MAE (£)",
        "Mean signed error (£)",
        "n",
    ]

    header = f"{'Metric':<25s}"
    for stage in stages:
        header += f" {stage:>14s}"
    print(header)
    print("-" * rule_width)

    for metric in metric_names:
        row = f"{metric:<25s}"
        for stage in stages:
            val = metrics_by_stage[stage][metric]
            if metric == "n":
                row += f" {val:>14,d}"
            elif "£" in metric:
                # 14 wide so currency cells line up with the other columns.
                row += f" {val:>14,.0f}"
            else:
                # 13 digits plus the trailing percent sign.
                row += f" {val:>13.1f}%"
        print(row)
    print("=" * rule_width)
def main():
    """CLI entry point for the backtest.

    Loads the price index, builds the test pairs, scores the naive (raw input
    price) and index-based predictions, optionally blends the index prediction
    with the hedonic model, prints the metrics table and writes the per-row
    results to the output parquet file.

    When a hedonic model is given, several ``tau`` values are tried and the
    one with the lowest MdAPE on the eligible test rows is used for the
    "Blended" column, so that column's score is selected in-sample.
    """
    parser = argparse.ArgumentParser(description="Backtest price estimation model")
    for flag, help_text in (
        ("--input", "Path to wide.parquet"),
        ("--index", "Path to price_index.parquet"),
        ("--output", "Output backtest_results.parquet"),
    ):
        parser.add_argument(flag, type=Path, required=True, help=help_text)
    parser.add_argument(
        "--hedonic-model",
        type=Path,
        default=None,
        help="Path to hedonic_model.json (optional)",
    )
    args = parser.parse_args()

    index = pl.read_parquet(args.index)
    summary = f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors"
    if "type_group" in index.columns:
        summary += f", {index['type_group'].n_unique()} type groups"
    print(summary)

    has_hedonic = args.hedonic_model is not None
    test = extract_test_set(args.input, include_hedonic_cols=has_hedonic)

    print("\nPredicting with price index...")
    test = predict(test, index)

    # Baseline metrics: naive carries the input price forward unchanged.
    actual = test["actual_price"].to_numpy().astype(np.float64)
    naive_prices = test["input_price"].to_numpy().astype(np.float64)
    index_prices = test["predicted"].to_numpy().astype(np.float64)
    metrics = {
        "Naive": compute_metrics(actual, naive_prices),
        "Index": compute_metrics(actual, index_prices),
    }

    if has_hedonic:
        print("\nApplying hedonic blending...")
        with open(args.hedonic_model) as f:
            type_models = json.load(f)["type_models"]

        # Rows that can get a hedonic estimate: positive floor area and a type.
        floor_area = pl.col("Total floor area (sqm)")
        eligible_mask = test.select(
            floor_area.is_not_null()
            & (floor_area > 0)
            & pl.col("type_group").is_not_null()
        ).to_series()
        eligible = test.filter(eligible_mask)

        if len(eligible) > 0:
            log_fa = np.log(
                np.maximum(
                    eligible["Total floor area (sqm)"].to_numpy().astype(np.float64),
                    1.0,
                )
            )

            # Per-row hedonic prediction; NaN marks rows without a type model.
            log_hedonic = np.empty(len(eligible))
            row_keys = zip(
                eligible["sector"].to_list(), eligible["type_group"].to_list()
            )
            for i, (sector, type_name) in enumerate(row_keys):
                tm = type_models.get(type_name)
                if tm is None:
                    log_hedonic[i] = np.nan
                else:
                    alpha = tm["sector_intercepts"].get(
                        sector, tm["national_intercept"]
                    )
                    log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
            valid = np.isfinite(log_hedonic)

            # Holding period between the two sales drives the blend weight.
            input_years = eligible["input_year"].to_numpy().astype(np.float64)
            actual_years = eligible["actual_year"].to_numpy().astype(np.float64)
            hold_years = np.maximum(actual_years - input_years, 0.0)

            log_index_pred = np.log(
                np.maximum(eligible["predicted"].to_numpy().astype(np.float64), 1.0)
            )

            def blend(tau):
                # Log-space mix; rows without a hedonic value keep the index.
                blend_w = hold_years / (hold_years + tau)
                log_blended = np.where(
                    valid,
                    (1 - blend_w) * log_index_pred + blend_w * log_hedonic,
                    log_index_pred,
                )
                return np.exp(log_blended)

            actual_eligible = eligible["actual_price"].to_numpy().astype(np.float64)
            best_tau = 15.0
            best_mdape = float("inf")
            print(f"\n tau sweep ({valid.sum():,} eligible properties):")
            for tau in [5.0, 10.0, 15.0, 20.0, 30.0]:
                m = compute_metrics(actual_eligible, blend(tau))
                marker = ""
                if m["MdAPE (%)"] < best_mdape:
                    best_mdape = m["MdAPE (%)"]
                    best_tau = tau
                    marker = " <-- best"
                print(
                    f" tau={tau:>4.0f}: MdAPE={m['MdAPE (%)']:>5.1f}%, "
                    f"within 10%={m['% within 10%']:>5.1f}%{marker}"
                )
            print(f"\n Best tau = {best_tau}")

            # Non-eligible rows keep the plain index prediction.
            blended_all = test["predicted"].to_numpy().astype(np.float64).copy()
            blended_all[eligible_mask.arg_true().to_numpy()] = blend(best_tau)

            test = test.with_columns(
                pl.Series("blended", blended_all, dtype=pl.Float64),
            )
            metrics["Blended"] = compute_metrics(actual, blended_all)

    print_metrics_table(metrics)

    # Save results
    result_cols = [
        "Postcode",
        "sector",
        "input_year",
        "input_price",
        "actual_year",
        "actual_price",
        "predicted",
    ]
    if "blended" in test.columns:
        result_cols.append("blended")
    result = test.select(result_cols)
    result.write_parquet(args.output)

    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
    print(f" {len(result):,} rows")


if __name__ == "__main__":
    main()

View file

@ -1,414 +0,0 @@
"""Augment wide.parquet with an estimated current price column.
Joins the precomputed repeat-sales price index (from price_index.py) with each
property's last known sale to produce an inflation-adjusted current price estimate.
Uses type-stratified index when available, falling back to "All" type.
Optionally applies renovation premiums from renovation_premium.py: for properties
with post-sale renovation events, the estimated price is adjusted upward based on
data-driven per-area premiums with time decay.
Modifies wide.parquet in-place, adding the "Estimated current price" column.
"""
import argparse
import json
import math
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform._price_utils import (
CURRENT_YEAR,
sector_expr,
type_group_expr,
)
HALF_LIFE = 10.0
DECAY_RATE = math.log(2) / HALF_LIFE
def _attach_index_columns(df: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
    """Join the price index onto ``df`` as ``_log_index_sale`` / ``_log_index_current``.

    Expects the helper columns ``_sector``, ``_sale_year`` and ``_type_group``
    on ``df``. With a stratified index (one that has a ``type_group`` column)
    the type-specific value is preferred and the ``"All"`` series fills gaps.
    """
    log_index = pl.col("log_index")

    if "type_group" not in index.columns:
        at_sale = index.select("sector", "year", log_index.alias("_log_index_sale"))
        at_current = index.filter(pl.col("year") == CURRENT_YEAR).select(
            "sector", log_index.alias("_log_index_current")
        )
        return df.join(
            at_sale,
            left_on=["_sector", "_sale_year"],
            right_on=["sector", "year"],
            how="left",
        ).join(at_current, left_on="_sector", right_on="sector", how="left")

    idx_typed = index.filter(pl.col("type_group") != "All")
    idx_all = index.filter(pl.col("type_group") == "All")

    # Index at the sale year: type-specific, then "All".
    df = df.join(
        idx_typed.select(
            "sector", "type_group", "year", log_index.alias("log_idx_sale_typed")
        ),
        left_on=["_sector", "_type_group", "_sale_year"],
        right_on=["sector", "type_group", "year"],
        how="left",
    ).join(
        idx_all.select("sector", "year", log_index.alias("log_idx_sale_all")),
        left_on=["_sector", "_sale_year"],
        right_on=["sector", "year"],
        how="left",
    )

    # Index at the current year: type-specific, then "All".
    df = df.join(
        idx_typed.filter(pl.col("year") == CURRENT_YEAR).select(
            "sector", "type_group", log_index.alias("log_idx_cur_typed")
        ),
        left_on=["_sector", "_type_group"],
        right_on=["sector", "type_group"],
        how="left",
    ).join(
        idx_all.filter(pl.col("year") == CURRENT_YEAR).select(
            "sector", log_index.alias("log_idx_cur_all")
        ),
        left_on="_sector",
        right_on="sector",
        how="left",
    )

    # NOTE(review): the fallback is decided per year independently, so a row
    # can pair a typed sale-year value with an "All" current-year value (or
    # vice versa) - confirm the two series are comparable.
    return df.with_columns(
        pl.col("log_idx_sale_typed")
        .fill_null(pl.col("log_idx_sale_all"))
        .alias("_log_index_sale"),
        pl.col("log_idx_cur_typed")
        .fill_null(pl.col("log_idx_cur_all"))
        .alias("_log_index_current"),
    )


def _blend_with_hedonic(df: pl.DataFrame, model: dict, has_price: pl.Expr) -> pl.DataFrame:
    """Blend the index estimate with the per-type hedonic prediction in log space.

    The hedonic weight is ``hold_years / (hold_years + tau)``, so the longer
    ago the last sale, the more the hedonic model counts. Rows that are not
    eligible, or whose type has no model, keep their index estimate.
    """
    type_models = model["type_models"]
    tau = model.get("tau", 15.0)
    print(f" tau = {tau}, {len(type_models)} type models")

    floor_area = pl.col("Total floor area (sqm)")
    eligible_mask = df.select(
        has_price
        & pl.col("Estimated current price").is_not_null()
        & floor_area.is_not_null()
        & (floor_area > 0)
        & pl.col("_type_group").is_not_null()
    ).to_series()
    eligible = df.filter(eligible_mask)
    if len(eligible) == 0:
        print(" No eligible properties for hedonic blending")
        return df

    log_fa = np.log(
        np.maximum(
            eligible["Total floor area (sqm)"].to_numpy().astype(np.float64), 1.0
        )
    )

    # Per-row hedonic prediction; NaN marks rows whose type has no model.
    log_hedonic = np.full(len(eligible), np.nan)
    row_keys = zip(eligible["_sector"].to_list(), eligible["_type_group"].to_list())
    for i, (sector, type_name) in enumerate(row_keys):
        tm = type_models.get(type_name)
        if tm is None:
            continue
        alpha = tm["sector_intercepts"].get(sector, tm["national_intercept"])
        log_hedonic[i] = tm["beta_fa"] * log_fa[i] + alpha
    valid = np.isfinite(log_hedonic)

    sale_years = eligible["_sale_year"].to_numpy().astype(np.float64)
    hold_years = np.maximum(CURRENT_YEAR - sale_years, 0.0)
    blend_w = hold_years / (hold_years + tau)

    log_index_est = np.log(
        eligible["Estimated current price"].to_numpy().astype(np.float64)
    )
    log_blended = np.where(
        valid,
        (1 - blend_w) * log_index_est + blend_w * log_hedonic,
        log_index_est,
    )

    # Write back only the eligible rows. The column round-trips through NumPy,
    # where nulls become NaN, so NaN is turned back into null afterwards;
    # otherwise "no estimate" rows would pass later is_not_null() checks.
    price_arr = df["Estimated current price"].to_numpy().astype(np.float64)
    price_arr[eligible_mask.arg_true().to_numpy()] = np.exp(log_blended)
    df = df.with_columns(
        pl.Series("Estimated current price", price_arr, dtype=pl.Float64).fill_nan(
            None
        ),
    )

    n_blended = int(valid.sum())
    avg_w = float(np.mean(blend_w[valid])) if n_blended else float("nan")
    print(
        f" {n_blended:,} properties with hedonic blending (avg blend weight: {avg_w:.3f})"
    )
    return df


def _apply_renovation_premiums(df: pl.DataFrame, reno_prem: pl.DataFrame) -> pl.DataFrame:
    """Scale estimates by time-decayed premiums for post-sale renovation events.

    For each property the most recent event of every type after the last sale
    is taken, its log premium (type-specific, else "All", else 0) is decayed
    with ``DECAY_RATE`` per year since the event, and the decayed log premiums
    are summed and applied multiplicatively to the estimate.
    """
    has_reno = (
        pl.col("renovation_history").is_not_null()
        & (pl.col("renovation_history").list.len() > 0)
        & pl.col("Estimated current price").is_not_null()
    )

    # Number the rows before filtering so premiums can be joined straight back.
    df = df.with_row_index("_row_idx")
    events = (
        df.lazy()
        .filter(has_reno)
        .select("_row_idx", "_sector", "_type_group", "_sale_year", "renovation_history")
        .explode("renovation_history")
        .with_columns(
            pl.col("renovation_history").struct.field("year").alias("_event_year"),
            pl.col("renovation_history").struct.field("event").alias("_event_type"),
        )
        .filter(pl.col("_event_year") > pl.col("_sale_year"))
        .collect()
    )
    if len(events) == 0:
        print(" No properties with post-sale renovation events")
        return df

    # Most recent event per (property, event type), with its decay factor.
    latest = (
        events.group_by("_row_idx", "_event_type", "_sector", "_type_group")
        .agg(pl.col("_event_year").max().alias("_event_year"))
        .with_columns(
            (-DECAY_RATE * (CURRENT_YEAR - pl.col("_event_year")).cast(pl.Float64))
            .exp()
            .alias("_decay"),
        )
    )

    # Premium lookup: type-specific first, then "All", then no premium.
    rp_typed = reno_prem.filter(pl.col("type_group") != "All")
    rp_all = reno_prem.filter(pl.col("type_group") == "All")
    latest = (
        latest.join(
            rp_typed.select(
                "sector",
                "type_group",
                "event_type",
                pl.col("log_premium").alias("_lp_typed"),
            ),
            left_on=["_sector", "_type_group", "_event_type"],
            right_on=["sector", "type_group", "event_type"],
            how="left",
        )
        .join(
            rp_all.select(
                "sector", "event_type", pl.col("log_premium").alias("_lp_all")
            ),
            left_on=["_sector", "_event_type"],
            right_on=["sector", "event_type"],
            how="left",
        )
        .with_columns(
            pl.col("_lp_typed")
            .fill_null(pl.col("_lp_all"))
            .fill_null(0.0)
            .alias("_log_premium"),
        )
    )

    per_property = latest.group_by("_row_idx").agg(
        (pl.col("_log_premium") * pl.col("_decay")).sum().alias("_reno_log_premium")
    )

    # Properties without events get a log premium of 0, i.e. a multiplier of 1.
    # The sort restores the original row order regardless of the join strategy.
    df = (
        df.join(per_property, on="_row_idx", how="left")
        .sort("_row_idx")
        .with_columns(pl.col("_reno_log_premium").fill_null(0.0))
        .with_columns(
            (
                pl.col("Estimated current price")
                * pl.col("_reno_log_premium").exp()
            ).alias("Estimated current price"),
        )
    )

    print(f" {per_property.height:,} properties with renovation premium applied")
    nonzero = per_property.filter(pl.col("_reno_log_premium") != 0.0)
    if nonzero.height > 0:
        # Guarded: the mean of an empty selection is None and math.exp would raise.
        avg_multiplier = math.exp(nonzero["_reno_log_premium"].mean())
        print(
            f" Average premium multiplier: {avg_multiplier:.3f} ({avg_multiplier - 1:.1%} uplift)"
        )
    return df


def main():
    """CLI entry point: add estimated current prices to wide.parquet in place.

    Steps: rescale each property's last known price with the repeat-sales
    index, optionally blend with the hedonic model, optionally apply
    renovation premiums, derive the price per square metre, drop every
    temporary column and overwrite the input file.
    """
    parser = argparse.ArgumentParser(
        description="Augment wide.parquet with estimated current prices"
    )
    parser.add_argument(
        "--input",
        type=Path,
        required=True,
        help="Path to wide.parquet (modified in-place)",
    )
    parser.add_argument(
        "--index", type=Path, required=True, help="Path to price_index.parquet"
    )
    parser.add_argument(
        "--renovation-premium",
        type=Path,
        default=None,
        help="Path to renovation_premium.parquet (optional)",
    )
    parser.add_argument(
        "--hedonic-model",
        type=Path,
        default=None,
        help="Path to hedonic_model.json (optional)",
    )
    args = parser.parse_args()

    print("Loading wide.parquet...")
    df = pl.read_parquet(args.input)
    print(f" {len(df):,} rows, {len(df.columns)} columns")

    # Drop existing estimated columns if re-running
    stale = [
        col
        for col in ["Estimated current price", "Est. price per sqm"]
        if col in df.columns
    ]
    if stale:
        df = df.drop(stale)

    # A row can be estimated only if price, postcode and sale date are known.
    has_price = (
        pl.col("Last known price").is_not_null()
        & pl.col("Postcode").is_not_null()
        & pl.col("Date of last transaction").is_not_null()
    )

    # Helper columns; the leading underscore marks them for removal at the end.
    df = df.with_columns(
        sector_expr().alias("_sector"),
        pl.col("Date of last transaction").dt.year().alias("_sale_year"),
        type_group_expr().alias("_type_group"),
    )

    index = pl.read_parquet(args.index)
    if "type_group" in index.columns:
        print(
            f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
            f"{index['type_group'].n_unique()} type groups"
        )
    else:
        print(
            f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)"
        )

    print("\nApplying repeat-sales index...")
    df = _attach_index_columns(df, index)

    # Compute estimate - only for rows with a known price. A missing index
    # value on either side leaves the estimate null.
    df = df.with_columns(
        pl.when(has_price)
        .then(
            pl.col("Last known price").cast(pl.Float64)
            * (pl.col("_log_index_current") - pl.col("_log_index_sale")).exp()
        )
        .otherwise(pl.lit(None))
        .alias("Estimated current price"),
    )

    n_adjusted = df.filter(has_price & pl.col("_log_index_sale").is_not_null()).height
    n_with_price = df.filter(has_price).height
    print(
        f" {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)"
    )

    if args.hedonic_model is not None:
        print("\nApplying hedonic blending...")
        with open(args.hedonic_model) as f:
            model = json.load(f)
        df = _blend_with_hedonic(df, model, has_price)

    if args.renovation_premium is not None:
        print("\nApplying renovation premiums...")
        reno_prem = pl.read_parquet(args.renovation_premium)
        print(f" Loaded {len(reno_prem):,} premium rows")
        df = _apply_renovation_premiums(df, reno_prem)

    # Price per sqm only where the floor area is positive; a zero area would
    # otherwise yield an infinite value that cannot be cast to an integer.
    floor_area = pl.col("Total floor area (sqm)")
    df = df.with_columns(
        pl.when(floor_area > 0)
        .then(pl.col("Estimated current price") / floor_area)
        .otherwise(None)
        .round(0)
        .cast(pl.Int32)
        .alias("Est. price per sqm"),
    )

    # Drop all temporary columns
    temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
    df = df.drop(temp_cols)

    df.write_parquet(args.input)
    size_mb = args.input.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
    print(
        f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')"
    )


if __name__ == "__main__":
    main()

View file

@ -1,523 +0,0 @@
"""Repeat-Sales Price Index (improved)
Builds a hierarchical repeat-sales price index with:
1. Stratification by property type (Detached/Semi-Detached/Terraced/Flats)
2. Robust regression (IRLS with Huber weights) instead of hard outlier cutoff
3. National hedonic time-dummy model as ultimate shrinkage fallback
4. Spatial smoothing for sparse sectors via KD-tree nearest neighbors
Output: price_index.parquet, keyed by sector × type_group × year, with a log_index column
"""
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import lsqr
from scipy.spatial import KDTree
from tqdm import tqdm
from pipeline.transform._price_utils import (
CURRENT_YEAR,
SHRINKAGE_K,
TYPE_GROUPS,
build_hedonic_features,
extract_centroids,
hierarchy_keys,
sector_expr,
type_group_expr,
)
# --- Constants ---
MIN_PAIRS = 5
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
HUBER_K = 1.345
IRLS_ITERATIONS = 5
SPATIAL_NEIGHBORS = 5
SPATIAL_BLEND_K = 30
# --- Pair extraction ---
def extract_pairs(input_path: Path) -> pl.DataFrame:
    """Turn each property's price history into consecutive repeat-sale pairs.

    Every two adjacent entries of ``historical_prices`` form one pair. A pair
    is kept when both prices are positive, the second sale is in a later year
    and ``|log(price2 / price1)|`` does not exceed ``OUTLIER_THRESHOLD``.

    Returns:
        A frame with ``sector``, ``type_group``, ``year1``, ``price1``,
        ``year2``, ``price2``, ``log_ratio``, ``weight`` (one over the square
        root of the years between the sales), ``district`` and ``area``.
    """
    print("Extracting repeat-sale pairs...")

    history = pl.col("historical_prices")
    properties = (
        pl.scan_parquet(input_path)
        .select("Postcode", "historical_prices", "Property type")
        .filter(
            pl.col("Postcode").is_not_null(),
            history.list.len() >= 2,
        )
        .with_columns(sector_expr(), type_group_expr())
        .collect()
    )
    print(f" {len(properties):,} properties with 2+ transactions")

    price_ratio = pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64)
    years_between = (pl.col("year2") - pl.col("year1")).cast(pl.Float64)

    pairs = (
        properties.lazy()
        # Align sale i ("from") with sale i+1 ("to"), then explode both lists
        # in lockstep so each row is one consecutive pair.
        .with_columns(
            history.list.slice(0, history.list.len() - 1).alias("from_txn"),
            history.list.slice(1).alias("to_txn"),
        )
        .explode("from_txn", "to_txn")
        .with_columns(
            pl.col("from_txn").struct.field("year").alias("year1"),
            pl.col("from_txn").struct.field("price").alias("price1"),
            pl.col("to_txn").struct.field("year").alias("year2"),
            pl.col("to_txn").struct.field("price").alias("price2"),
        )
        .select("sector", "type_group", "year1", "price1", "year2", "price2")
        .filter(
            pl.col("price1") > 0,
            pl.col("price2") > 0,
            pl.col("year2") > pl.col("year1"),
        )
        .with_columns(
            price_ratio.log().alias("log_ratio"),
            (1.0 / years_between.sqrt()).alias("weight"),
        )
        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
        .collect()
    )

    # Hierarchy columns: strip the trailing sector digits to get the district,
    # then everything from the first digit onward to get the area.
    pairs = pairs.with_columns(
        pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
    )
    pairs = pairs.with_columns(
        pl.col("district").str.replace(r"\d.*$", "").alias("area"),
    )

    print(f" {len(pairs):,} pairs extracted")
    return pairs
# --- Robust IRLS solver ---
def solve_robust_index(
    years1: np.ndarray,
    years2: np.ndarray,
    log_ratios: np.ndarray,
    base_weights: np.ndarray,
) -> dict[int, float]:
    """IRLS Huber M-estimation for the Case-Shiller repeat-sales model.

    Each pair contributes the equation ``beta[year2] - beta[year1] =
    log_ratio``; the earliest year is the base and is fixed at 0.

    Args:
        years1: First sale year of every pair.
        years2: Second sale year of every pair.
        log_ratios: ``log(price2 / price1)`` of every pair.
        base_weights: Row scale per pair, applied to both sides of the pair's
            equation before the least-squares solve.

    Returns:
        Year -> log index, with the base year mapped to 0.0. Empty when there
        are fewer than ``MIN_PAIRS`` pairs or only a single distinct year.

    Huber weighting is applied to standardised residuals: the residual times
    its base weight, divided by a robust scale (normalised MAD). The tuning
    constant ``HUBER_K`` is defined relative to unit-scale residuals, so
    comparing it with raw log-price residuals would almost never trigger.
    Row scales use the square root of the Huber weight because the solver
    squares the row scale in its objective.
    """
    n = len(years1)
    if n < MIN_PAIRS:
        return {}

    # Sorted distinct years; position 0 is the base year and gets no column.
    all_years = np.union1d(years1, years2)
    n_cols = len(all_years) - 1
    if n_cols == 0:
        return {}

    # Column of each year: its rank among the distinct years minus one, so the
    # base year maps to -1 and is left out of the design matrix.
    col1 = np.searchsorted(all_years, years1) - 1
    col2 = np.searchsorted(all_years, years2) - 1

    # Sparse matrix structure (fixed across iterations): +1 for the second
    # sale's year, -1 for the first sale's year.
    mask2 = col2 >= 0
    mask1 = col1 >= 0
    rows_arr = np.concatenate([np.flatnonzero(mask2), np.flatnonzero(mask1)])
    cols_arr = np.concatenate([col2[mask2], col1[mask1]])
    signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])

    base_weights = np.asarray(base_weights, dtype=np.float64)
    row_scale = base_weights.copy()
    betas = np.zeros(n_cols)
    # At least one solve, even if IRLS_ITERATIONS is configured as 0.
    for _ in range(max(1, IRLS_ITERATIONS)):
        A = csc_matrix(
            (signs_arr * row_scale[rows_arr], (rows_arr, cols_arr)),
            shape=(n, n_cols),
        )
        betas = lsqr(A, log_ratios * row_scale, atol=1e-10, btol=1e-10)[0]

        # Residuals of the unweighted equations
        predicted = np.zeros(n)
        predicted[mask2] += betas[col2[mask2]]
        predicted[mask1] -= betas[col1[mask1]]
        residuals = log_ratios - predicted

        # Standardise: bring residuals onto a common scale with the base
        # weight, then divide by the normalised MAD.
        scaled = residuals * base_weights
        mad = np.median(np.abs(scaled - np.median(scaled)))
        robust_scale = mad / 0.6745
        if robust_scale <= 1e-12:
            # Essentially exact fit; reweighting cannot change anything.
            break
        abs_u = np.abs(scaled) / robust_scale

        # Huber reweighting
        huber_w = np.where(abs_u <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_u, 1e-10))
        row_scale = base_weights * np.sqrt(huber_w)

    index = {int(all_years[0]): 0.0}
    for c, year in enumerate(all_years[1:]):
        index[int(year)] = float(betas[c])
    return index
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
    """Solve one robust index per distinct value of ``group_col``.

    Args:
        pairs: Repeat-sale pairs with ``year1``, ``year2``, ``log_ratio`` and
            ``weight`` columns plus the grouping column.
        group_col: Column that defines the groups.

    Returns:
        ``(indices, n_pairs)``: group key -> year -> log index, and group key
        -> number of pairs. Groups for which the solver returns an empty
        index appear in neither dict.
    """
    value_cols = ("year1", "year2", "log_ratio", "weight")
    grouped = pairs.group_by(group_col).agg([pl.col(name) for name in value_cols])

    indices = {}
    n_pairs = {}
    progress = tqdm(
        grouped.iter_rows(named=True), total=len(grouped), desc=f" {group_col}"
    )
    for record in progress:
        first_years = np.array(record["year1"], dtype=np.int32)
        solved = solve_robust_index(
            first_years,
            np.array(record["year2"], dtype=np.int32),
            np.array(record["log_ratio"], dtype=np.float64),
            np.array(record["weight"], dtype=np.float64),
        )
        if not solved:
            continue
        group_key = record[group_col]
        indices[group_key] = solved
        n_pairs[group_key] = len(first_years)
    return indices, n_pairs
# --- Hedonic model ---
def compute_hedonic_index(
    input_path: Path, min_year: int, max_year: int
) -> dict[int, float]:
    """Two-step hedonic index: regress log(price) on features, average residual by year.

    Step 1 fits ``log(price)`` on the matrix from ``build_hedonic_features``
    plus an intercept. Step 2 averages the residuals per sale year; the result
    is shifted so the earliest year that has sales equals 0.

    Args:
        input_path: Parquet file to scan.
        min_year: First sale year to include.
        max_year: Last sale year to include.

    Returns:
        Year -> log index for every year in range with at least one sale.
        Empty when no row qualifies.
    """
    print("Computing hedonic index...")
    df = (
        pl.scan_parquet(input_path)
        .select(
            "Last known price",
            "Date of last transaction",
            "Property type",
            "Total floor area (sqm)",
        )
        .filter(
            pl.col("Last known price").is_not_null(),
            # log(price) is undefined for zero or negative prices.
            pl.col("Last known price") > 0,
            pl.col("Total floor area (sqm)").is_not_null(),
            pl.col("Total floor area (sqm)") > 0,
        )
        .with_columns(
            pl.col("Date of last transaction").dt.year().alias("sale_year"),
            type_group_expr(),
        )
        .filter(
            pl.col("type_group").is_not_null(),
            pl.col("sale_year").is_not_null(),
            pl.col("sale_year") >= min_year,
            pl.col("sale_year") <= max_year,
        )
        .collect()
    )
    print(f" {len(df):,} complete cases for hedonic model")
    if len(df) == 0:
        # Nothing to fit; an empty index lets callers fall back to their own.
        print(" Hedonic index: no complete cases, returning empty index")
        return {}

    # Target
    log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
    sale_years = df["sale_year"].to_numpy()

    # Feature matrix plus an intercept column, cast to float64 once.
    X = build_hedonic_features(df)
    F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)]).astype(np.float64)
    print(f" Feature matrix: {F.shape[0]:,} × {F.shape[1]}")

    # Step 1: regress log(price) on features -> quality score
    betas = np.linalg.lstsq(F, log_price, rcond=None)[0]
    residuals = log_price - F @ betas

    # Step 2: average residual by year = hedonic index
    hedonic = {}
    for y in range(min_year, max_year + 1):
        in_year = sale_years == y
        if in_year.any():
            hedonic[y] = float(np.mean(residuals[in_year]))

    # Normalise to the earliest year that has data. Using min_year
    # unconditionally would skip the normalisation when it has no sales.
    base = hedonic[min(hedonic)]
    for y in hedonic:
        hedonic[y] -= base

    print(
        f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
    )
    return hedonic
# --- Shrinkage ---
def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) -> dict:
    """Shrink a raw yearly index toward its parent index.

    The raw value gets weight ``n_pairs / (n_pairs + k)`` and the parent gets
    the remainder. A year present on only one side uses that side's value for
    both terms, so it passes through unchanged.

    Returns:
        A new ``{year: value}`` dict covering every year in either input.
    """
    raw_weight = n_pairs / (n_pairs + k)
    shrunk = {}
    for year in set(raw) | set(parent):
        # Each side falls back to the other when it lacks this year.
        raw_val = raw[year] if year in raw else parent.get(year, 0.0)
        parent_val = parent[year] if year in parent else raw.get(year, 0.0)
        shrunk[year] = raw_weight * raw_val + (1 - raw_weight) * parent_val
    return shrunk
def apply_shrinkage(
    sector_idx,
    sector_n,
    district_idx,
    district_n,
    area_idx,
    area_n,
    national_idx,
    national_n,
    hedonic_idx,
    all_sectors,
    sector_to_dist,
    dist_to_area,
):
    """Top-down hierarchical shrinkage: national→hedonic, area→national, etc.

    Args:
        sector_idx, district_idx, area_idx: ``{key: {year: log_index}}`` raw
            indices per hierarchy level.
        sector_n, district_n, area_n: ``{key: n_pairs}`` for the same keys.
        national_idx: ``{year: log_index}`` raw national index.
        national_n: Number of pairs behind ``national_idx``.
        hedonic_idx: ``{year: log_index}`` used as the national level's parent.
        all_sectors: Every sector that must appear in the result.
        sector_to_dist: ``{sector: district}`` lookup.
        dist_to_area: ``{district: area}`` lookup.

    Returns:
        ``{sector: {year: log_index}}`` with an entry for every sector in
        ``all_sectors`` and every key of ``sector_idx``.
    """
    # National → hedonic
    national_shrunk = shrink_index(national_idx, hedonic_idx, national_n)

    # Area → national
    area_shrunk = {}
    for area, idx in area_idx.items():
        area_shrunk[area] = shrink_index(idx, national_shrunk, area_n[area])

    # District → area
    district_shrunk = {}
    for dist, idx in district_idx.items():
        a = dist_to_area.get(dist, "")
        parent = area_shrunk.get(a, national_shrunk)
        district_shrunk[dist] = shrink_index(idx, parent, district_n[dist])

    def _nearest_ancestor(sec):
        """Closest available ancestor index: district, else area, else national."""
        d = sector_to_dist.get(sec, "")
        if d in district_shrunk:
            return district_shrunk[d]
        return area_shrunk.get(dist_to_area.get(d, ""), national_shrunk)

    # Sector → district. When the district has no index, fall back through the
    # area before the national level, matching the fill step below (previously
    # this step skipped the area and went straight to national).
    sector_shrunk = {}
    for sec, idx in sector_idx.items():
        sector_shrunk[sec] = shrink_index(idx, _nearest_ancestor(sec), sector_n[sec])

    # Fill sectors without their own index
    for sec in all_sectors:
        if sec not in sector_shrunk:
            sector_shrunk[sec] = _nearest_ancestor(sec)
    return sector_shrunk
# --- Spatial smoothing ---
def spatial_smooth(
    sector_indices: dict,
    centroids: dict,
    n_pairs_map: dict,
) -> dict:
    """Blend sparse sector indices with their K nearest neighbours.

    A sector keeps weight ``n / (n + SPATIAL_BLEND_K)`` for its own index and
    spreads the remainder over its neighbours in proportion to inverse
    distance. Sectors whose own weight exceeds 0.95, or that lack a centroid,
    are returned untouched. Every blend reads from the input, so the result
    does not depend on iteration order.

    Returns:
        A new ``{sector: {year: value}}`` dict, or ``sector_indices`` itself
        when there are too few located sectors to query neighbours.
    """
    located = [s for s in sector_indices if s in centroids]
    if len(located) <= SPATIAL_NEIGHBORS:
        return sector_indices

    raw_coords = np.array([centroids[s] for s in located])
    # Column 0 is treated as latitude; the second column is shrunk by
    # cos(mean latitude) so Euclidean distance is roughly isotropic.
    lon_scale = np.cos(np.radians(np.mean(raw_coords[:, 0])))
    points = np.column_stack([raw_coords[:, 0], raw_coords[:, 1] * lon_scale])
    tree = KDTree(points)

    smoothed = dict(sector_indices)
    for pos, sec in enumerate(located):
        own_n = n_pairs_map.get(sec, 0)
        self_w = own_n / (own_n + SPATIAL_BLEND_K)
        if self_w > 0.95:
            continue  # enough data, skip smoothing

        dists, idxs = tree.query(points[pos], k=SPATIAL_NEIGHBORS + 1)
        # First hit is the query point itself; zero-distance hits are dropped
        # because their inverse distance is undefined.
        usable = [
            (1.0 / d, sector_indices[located[j]])
            for d, j in zip(dists[1:], idxs[1:])
            if d > 0 and located[j] in sector_indices
        ]
        if not usable:
            continue

        inv_total = sum(inv for inv, _ in usable)
        nbr_w = 1.0 - self_w
        weighted = [(inv / inv_total * nbr_w, nbr) for inv, nbr in usable]

        own = sector_indices[sec]
        years = set(own)
        for _, nbr in weighted:
            years |= set(nbr)

        blended = {}
        for y in years:
            val = self_w * own.get(y, 0.0)
            for w, nbr in weighted:
                val += w * nbr.get(y, 0.0)
            blended[y] = val
        smoothed[sec] = blended
    return smoothed
# --- Forward fill ---
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
    """Return a dense ``{year: value}`` dict for ``min_year..max_year`` inclusive.

    A year missing from ``index`` takes the most recent earlier value within
    the range. Years before the first known value get 0.0.
    """
    filled = {}
    carried = 0.0
    year = min_year
    while year <= max_year:
        # Keep the previous value unless this year has its own.
        carried = index.get(year, carried)
        filled[year] = carried
        year += 1
    return filled
# --- Main ---
def main():
    """Build the repeat-sales price index for every sector and type group.

    Reads repeat-sale pairs and sector centroids from ``--input``, computes
    national/area/district/sector indices per type group (plus "All"), applies
    hierarchical shrinkage, spatial smoothing and forward fill, and writes one
    row per (sector, type_group, year) to ``--output`` as parquet.
    """
    parser = argparse.ArgumentParser(
        description="Build improved repeat-sales price index"
    )
    parser.add_argument("--input", type=Path, required=True)
    parser.add_argument("--output", type=Path, required=True)
    args = parser.parse_args()

    pairs = extract_pairs(args.input)
    centroids = extract_centroids(args.input)
    min_year = int(pairs["year1"].min())
    max_year = max(int(pairs["year2"].max()), CURRENT_YEAR)
    hedonic_idx = compute_hedonic_index(args.input, min_year, max_year)

    # Precompute hierarchy
    all_sectors = pairs["sector"].unique().to_list()
    sector_to_dist = {}
    dist_to_area = {}
    for s in all_sectors:
        d, a = hierarchy_keys(s)
        sector_to_dist[s] = d
        dist_to_area[d] = a

    # Process each type group + "All"
    all_type_groups = ["All"] + TYPE_GROUPS
    final = {}  # {type_group: {sector: {year: log_index}}}
    final_n = {}  # {type_group: {sector: n_pairs}}
    for tg in all_type_groups:
        print(f"\n--- {tg} ---")
        typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
        if len(typed) < MIN_PAIRS:
            print(f" Skipping (only {len(typed)} pairs)")
            # Fall back to the hedonic index, forward-filled like every other
            # group so these sectors also get one row per year in range.
            hedonic_filled = forward_fill(hedonic_idx, min_year, max_year)
            final[tg] = {s: dict(hedonic_filled) for s in all_sectors}
            final_n[tg] = {s: 0 for s in all_sectors}
            continue
        print(f" {len(typed):,} pairs")

        # National
        np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
        national_idx = solve_robust_index(
            np_arrs["year1"].to_numpy(),
            np_arrs["year2"].to_numpy(),
            np_arrs["log_ratio"].to_numpy(),
            np_arrs["weight"].to_numpy(),
        )
        national_n = len(typed)
        print(f" National: {len(national_idx)} years")

        # Area, district, sector
        print(" Computing per-level indices:")
        area_idx, area_n = compute_indices_for_level(typed, "area")
        district_idx, district_n = compute_indices_for_level(typed, "district")
        sector_idx, sector_n = compute_indices_for_level(typed, "sector")
        print(
            f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
        )

        # Shrinkage
        print(" Applying shrinkage...")
        sector_shrunk = apply_shrinkage(
            sector_idx,
            sector_n,
            district_idx,
            district_n,
            area_idx,
            area_n,
            national_idx,
            national_n,
            hedonic_idx,
            all_sectors,
            sector_to_dist,
            dist_to_area,
        )

        # Spatial smoothing
        print(" Spatial smoothing...")
        sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)

        # Forward fill
        for sec in all_sectors:
            sector_smoothed[sec] = forward_fill(
                sector_smoothed.get(sec, hedonic_idx), min_year, max_year
            )
        final[tg] = sector_smoothed
        final_n[tg] = sector_n

    # Assemble output
    print("\nAssembling output...")
    rows = []
    for tg in all_type_groups:
        for sec in all_sectors:
            n = final_n[tg].get(sec, 0)
            for year, log_idx in final[tg][sec].items():
                rows.append((sec, tg, year, log_idx, n))
    result = pl.DataFrame(
        rows,
        schema={
            "sector": pl.String,
            "type_group": pl.String,
            "year": pl.Int32,
            "log_index": pl.Float64,
            "n_pairs": pl.Int64,
        },
        orient="row",
    ).sort("type_group", "sector", "year")
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
    print(
        f" {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows"
    )


if __name__ == "__main__":
    main()

View file

@ -1,572 +0,0 @@
"""Estimate per-area renovation premiums from repeat-sale residuals.
For each repeat-sale pair, computes the residual after removing the price-index
predicted return. Pairs where renovation events occurred between sales should have
systematically higher residuals. A WLS regression estimates the log-premium per
event type, with hierarchical shrinkage and spatial smoothing.
Output: renovation_premium.parquet — one row per sector × type_group × event_type, holding its log_premium
"""
import argparse
import math
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import KDTree
from pipeline.transform._price_utils import (
SHRINKAGE_K,
TYPE_GROUPS,
extract_centroids,
hierarchy_keys,
sector_expr,
type_group_expr,
)
# Half-life, in years, of a renovation event's time-decayed indicator.
HALF_LIFE = 10.0
# Exponential decay rate chosen so the indicator halves every HALF_LIFE years.
DECAY_RATE = math.log(2) / HALF_LIFE
# Pairs whose |log(price2 / price1)| exceeds this value are dropped as outliers.
OUTLIER_THRESHOLD = 3.0
# Minimum number of pairs a group needs before a regression is attempted.
MIN_PAIRS = 10
# Number of nearest neighbouring sectors blended in by spatial_smooth.
SPATIAL_NEIGHBORS = 5
# Blend constant: a sector's own weight in smoothing is n / (n + SPATIAL_BLEND_K).
SPATIAL_BLEND_K = 30
# Renovation event labels; each maps to a has_<label.lower()> indicator column.
EVENT_TYPES = ["Extension", "Renovation", "Remodeling"]
def extract_pairs_with_events(input_path: Path, index_path: Path) -> pl.DataFrame:
    """Extract repeat-sale pairs with renovation events and index residuals.

    Args:
        input_path: Parquet file with "Postcode", "historical_prices",
            "Property type" and "renovation_history" columns.
        index_path: Price-index parquet with "sector", "year" and "log_index"
            columns, and optionally "type_group".

    Returns:
        One row per consecutive sale pair with sector, type_group, year1/price1,
        year2/price2, log_ratio, residual, weight, one ``has_<event>`` decayed
        indicator per entry of ``EVENT_TYPES``, plus district and area.
    """
    print("Extracting repeat-sale pairs with renovation events...")
    df = (
        pl.scan_parquet(input_path)
        .select("Postcode", "historical_prices", "Property type", "renovation_history")
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("historical_prices").list.len() >= 2,
        )
        .with_columns(sector_expr(), type_group_expr())
        .collect()
    )
    print(f" {len(df):,} properties with 2+ transactions")

    # Build consecutive pairs: transaction i is paired with transaction i + 1
    pairs = (
        df.lazy()
        .with_columns(
            pl.col("historical_prices")
            .list.slice(0, pl.col("historical_prices").list.len() - 1)
            .alias("from_txn"),
            pl.col("historical_prices").list.slice(1).alias("to_txn"),
        )
        .explode("from_txn", "to_txn")
        .with_columns(
            pl.col("from_txn").struct.field("year").alias("year1"),
            pl.col("from_txn").struct.field("price").alias("price1"),
            pl.col("to_txn").struct.field("year").alias("year2"),
            pl.col("to_txn").struct.field("price").alias("price2"),
        )
        .select(
            "sector",
            "type_group",
            "year1",
            "price1",
            "year2",
            "price2",
            "renovation_history",
        )
        .filter(
            pl.col("price1") > 0,
            pl.col("price2") > 0,
            pl.col("year2") > pl.col("year1"),
        )
        .with_columns(
            (pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
            .log()
            .alias("log_ratio"),
        )
        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
        .collect()
    )
    print(f" {len(pairs):,} repeat-sale pairs")

    # Join price index to compute residuals
    index = pl.read_parquet(index_path)
    has_type_group = "type_group" in index.columns
    if has_type_group:
        idx_typed = index.filter(pl.col("type_group") != "All")
        idx_all = index.filter(pl.col("type_group") == "All")
        # Join at year1
        pairs = pairs.join(
            idx_typed.select(
                "sector", "type_group", "year", pl.col("log_index").alias("li1_typed")
            ),
            left_on=["sector", "type_group", "year1"],
            right_on=["sector", "type_group", "year"],
            how="left",
        ).join(
            idx_all.select("sector", "year", pl.col("log_index").alias("li1_all")),
            left_on=["sector", "year1"],
            right_on=["sector", "year"],
            how="left",
        )
        # Join at year2
        pairs = pairs.join(
            idx_typed.select(
                "sector", "type_group", "year", pl.col("log_index").alias("li2_typed")
            ),
            left_on=["sector", "type_group", "year2"],
            right_on=["sector", "type_group", "year"],
            how="left",
        ).join(
            idx_all.select("sector", "year", pl.col("log_index").alias("li2_all")),
            left_on=["sector", "year2"],
            right_on=["sector", "year"],
            how="left",
        )
        # Prefer the type-specific index; fall back to the "All" index
        pairs = pairs.with_columns(
            (pl.col("li1_typed").fill_null(pl.col("li1_all"))).alias("_li1"),
            (pl.col("li2_typed").fill_null(pl.col("li2_all"))).alias("_li2"),
        )
    else:
        pairs = pairs.join(
            index.select("sector", "year", pl.col("log_index").alias("_li1")),
            left_on=["sector", "year1"],
            right_on=["sector", "year"],
            how="left",
        ).join(
            index.select("sector", "year", pl.col("log_index").alias("_li2")),
            left_on=["sector", "year2"],
            right_on=["sector", "year"],
            how="left",
        )

    # Compute residual = log_ratio - (index2 - index1); a missing index value
    # counts as 0.0. Weight is 1 / sqrt(holding period in years).
    pairs = pairs.with_columns(
        (
            pl.col("log_ratio")
            - (pl.col("_li2").fill_null(0.0) - pl.col("_li1").fill_null(0.0))
        ).alias("residual"),
        (1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
            "weight"
        ),
    )

    # For each pair, compute time-decayed renovation indicators
    # Use row index for unique identification (composite keys aren't unique per pair)
    pairs = pairs.with_row_index("_pair_idx")
    for et in EVENT_TYPES:
        col_name = f"has_{et.lower()}"
        pairs = pairs.with_columns(pl.lit(0.0).alias(col_name))

    # Process properties that have renovation history
    has_reno = pairs.filter(
        pl.col("renovation_history").is_not_null()
        & (pl.col("renovation_history").list.len() > 0)
    )
    if len(has_reno) > 0:
        reno_exploded = (
            has_reno.select("_pair_idx", "year1", "year2", "renovation_history")
            .explode("renovation_history")
            .with_columns(
                pl.col("renovation_history").struct.field("year").alias("event_year"),
                pl.col("renovation_history").struct.field("event").alias("event_type"),
            )
            # Only events between the two sales
            .filter(
                (pl.col("event_year") > pl.col("year1"))
                & (pl.col("event_year") <= pl.col("year2"))
            )
        )
        if len(reno_exploded) > 0:
            # For each pair + event type, take the most recent event
            latest_events = reno_exploded.group_by(
                "_pair_idx", "event_type", "year2"
            ).agg(pl.col("event_year").max().alias("latest_event_year"))
            # Compute time-decayed indicator: exp(-decay_rate * (year2 - event_year))
            latest_events = latest_events.with_columns(
                (
                    -DECAY_RATE
                    * (pl.col("year2") - pl.col("latest_event_year")).cast(pl.Float64)
                )
                .exp()
                .alias("decayed_indicator"),
            )
            # Pivot to wide format using _pair_idx for unique join
            for et in EVENT_TYPES:
                et_data = latest_events.filter(pl.col("event_type") == et)
                if len(et_data) > 0:
                    col_name = f"has_{et.lower()}"
                    pairs = (
                        pairs.join(
                            et_data.select(
                                "_pair_idx",
                                pl.col("decayed_indicator").alias(f"_{col_name}"),
                            ),
                            on="_pair_idx",
                            how="left",
                        )
                        .with_columns(
                            pl.col(f"_{col_name}").fill_null(0.0).alias(col_name),
                        )
                        .drop(f"_{col_name}")
                    )
    pairs = pairs.drop("_pair_idx")

    # Add hierarchy columns
    pairs = pairs.with_columns(
        pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
    ).with_columns(
        pl.col("district").str.replace(r"\d.*$", "").alias("area"),
    )

    # Count reno pairs
    reno_mask = (
        (pl.col("has_extension") > 0)
        | (pl.col("has_renovation") > 0)
        | (pl.col("has_remodeling") > 0)
    )
    n_reno = pairs.filter(reno_mask).height
    # Guard the percentage: no surviving pairs would otherwise divide by zero.
    reno_pct = n_reno / len(pairs) * 100 if len(pairs) else 0.0
    print(f" {n_reno:,} pairs with renovation events ({reno_pct:.1f}%)")

    # Drop temporary columns from index join + renovation_history (no longer needed)
    temp_cols = [
        c
        for c in pairs.columns
        if c.startswith("_li") or c.startswith("li1_") or c.startswith("li2_")
    ]
    pairs = pairs.drop(temp_cols + ["renovation_history"])
    return pairs
def wls_regression(
    residuals: np.ndarray,
    weights: np.ndarray,
    X: np.ndarray,
) -> np.ndarray:
    """Fit ``residuals ~ X`` by weighted least squares and return the coefficients.

    ``X`` must already contain any intercept column. Rows of ``X`` and the
    target are multiplied by ``sqrt(weights)``, which yields the weighted fit
    without building an N×N weight matrix.

    Returns:
        Coefficient vector of length ``X.shape[1]``; all zeros if the solver
        raises ``LinAlgError``.
    """
    root_w = np.sqrt(weights)
    design = X * root_w[:, np.newaxis]
    target = residuals * root_w
    try:
        solution = np.linalg.lstsq(design, target, rcond=None)
    except np.linalg.LinAlgError:
        return np.zeros(X.shape[1])
    return solution[0]
def compute_premiums_for_group(df: pl.DataFrame) -> dict[str, float]:
    """Estimate log-premiums for one group of pairs via WLS.

    Regresses ``residual`` on an intercept plus the three ``has_*`` indicator
    columns, weighted by ``weight``.

    Returns:
        ``{event_type: log_premium}`` for the three event types, or an empty
        dict when the group has fewer than ``MIN_PAIRS`` rows or its indicator
        values sum to less than 1.0.
    """
    n_rows = len(df)
    if n_rows < MIN_PAIRS:
        return {}

    def _as_float(column: str) -> np.ndarray:
        return df[column].to_numpy().astype(np.float64)

    target = _as_float("residual")
    row_weights = _as_float("weight")
    # Design matrix: intercept followed by the three event indicators
    X = np.column_stack(
        [
            np.ones(n_rows),
            _as_float("has_extension"),
            _as_float("has_renovation"),
            _as_float("has_remodeling"),
        ]
    )
    # No (or negligible) renovation signal in this group: nothing to estimate
    if X[:, 1:].sum() < 1.0:
        return {}

    betas = wls_regression(target, row_weights, X)
    # betas[0] is the intercept; the next three are the premiums, in order
    labels = ("Extension", "Renovation", "Remodeling")
    return {label: float(beta) for label, beta in zip(labels, betas[1:4])}
def compute_premiums_for_level(
    pairs: pl.DataFrame, group_col: str
) -> tuple[dict, dict]:
    """Estimate premiums for every group at one hierarchy level.

    Returns:
        ``(premiums, n_reno_pairs)``, both keyed by the group's value in
        ``group_col``. ``premiums[key]`` is ``{event_type: log_premium}`` and
        ``n_reno_pairs[key]`` counts rows with any positive indicator. Groups
        for which no premiums could be estimated appear in neither dict.
    """
    premiums, n_reno_pairs = {}, {}
    for group_key, members in pairs.group_by(group_col):
        estimate = compute_premiums_for_group(members)
        if not estimate:
            continue
        label = group_key[0]
        premiums[label] = estimate
        # Count pairs where at least one event indicator is positive
        flags = [
            members[col].to_numpy() > 0
            for col in ("has_extension", "has_renovation", "has_remodeling")
        ]
        any_event = flags[0] | flags[1] | flags[2]
        n_reno_pairs[label] = int(any_event.sum())
    return premiums, n_reno_pairs
def shrink_premium(
    raw: dict[str, float], parent: dict[str, float], n: int
) -> dict[str, float]:
    """Blend raw premiums with the parent level's premiums.

    The raw value gets weight ``n / (n + SHRINKAGE_K)``. An event type missing
    on one side borrows the other side's value, and 0.0 if both lack it.

    Returns:
        A new dict with one entry per event type in ``EVENT_TYPES``.
    """
    raw_weight = n / (n + SHRINKAGE_K)

    def _blend(event: str) -> float:
        raw_val = raw[event] if event in raw else parent.get(event, 0.0)
        parent_val = parent[event] if event in parent else raw.get(event, 0.0)
        return raw_weight * raw_val + (1 - raw_weight) * parent_val

    return {event: _blend(event) for event in EVENT_TYPES}
def apply_shrinkage(
    sector_prem,
    sector_n,
    district_prem,
    district_n,
    area_prem,
    area_n,
    national_prem,
    national_n,
    all_sectors,
    sector_to_dist,
    dist_to_area,
):
    """Top-down hierarchical shrinkage for premiums.

    Args:
        sector_prem, district_prem, area_prem: ``{key: {event_type: premium}}``
            raw estimates per hierarchy level.
        sector_n, district_n, area_n: ``{key: n_reno_pairs}``; missing keys
            count as 0.
        national_prem: ``{event_type: premium}`` used unshrunk as the top level.
        national_n: Accepted for signature compatibility; not used.
        all_sectors: Every sector that must appear in the result.
        sector_to_dist: ``{sector: district}`` lookup.
        dist_to_area: ``{district: area}`` lookup.

    Returns:
        ``{sector: {event_type: premium}}`` with an entry for every sector in
        ``all_sectors`` and every key of ``sector_prem``.
    """
    # Area -> national
    area_shrunk = {}
    for area, prem in area_prem.items():
        area_shrunk[area] = shrink_premium(prem, national_prem, area_n.get(area, 0))

    # District -> area
    district_shrunk = {}
    for dist, prem in district_prem.items():
        a = dist_to_area.get(dist, "")
        parent = area_shrunk.get(a, national_prem)
        district_shrunk[dist] = shrink_premium(prem, parent, district_n.get(dist, 0))

    def _nearest_ancestor(sec):
        """Closest available ancestor premiums: district, else area, else national."""
        d = sector_to_dist.get(sec, "")
        if d in district_shrunk:
            return district_shrunk[d]
        return area_shrunk.get(dist_to_area.get(d, ""), national_prem)

    # Sector -> district. When the district has no estimate, fall back through
    # the area before the national level, matching the fill step below
    # (previously this step skipped the area and went straight to national).
    sector_shrunk = {}
    for sec, prem in sector_prem.items():
        sector_shrunk[sec] = shrink_premium(
            prem, _nearest_ancestor(sec), sector_n.get(sec, 0)
        )

    # Fill missing sectors
    for sec in all_sectors:
        if sec not in sector_shrunk:
            sector_shrunk[sec] = _nearest_ancestor(sec)
    return sector_shrunk
def spatial_smooth(
    sector_premiums: dict[str, dict[str, float]],
    centroids: dict[str, tuple[float, float]],
    n_reno_map: dict[str, int],
) -> dict[str, dict[str, float]]:
    """Blend sparse sector premiums with their K nearest neighbours.

    A sector keeps weight ``n / (n + SPATIAL_BLEND_K)`` for its own premiums
    and spreads the remainder over its neighbours in proportion to inverse
    distance. Sectors whose own weight exceeds 0.95, or that lack a centroid,
    are returned untouched. Every blend reads from the input dict.

    Returns:
        A new ``{sector: {event_type: premium}}`` dict, or ``sector_premiums``
        itself when there are too few located sectors to query neighbours.
    """
    located = [s for s in sector_premiums if s in centroids]
    if len(located) <= SPATIAL_NEIGHBORS:
        return sector_premiums

    raw_coords = np.array([centroids[s] for s in located])
    # Column 0 is used as latitude; the second column is shrunk by
    # cos(mean latitude) so Euclidean distance is roughly isotropic.
    lon_scale = np.cos(np.radians(np.mean(raw_coords[:, 0])))
    points = np.column_stack([raw_coords[:, 0], raw_coords[:, 1] * lon_scale])
    tree = KDTree(points)

    smoothed = dict(sector_premiums)
    for pos, sec in enumerate(located):
        own_n = n_reno_map.get(sec, 0)
        self_w = own_n / (own_n + SPATIAL_BLEND_K)
        if self_w > 0.95:
            continue

        dists, idxs = tree.query(points[pos], k=SPATIAL_NEIGHBORS + 1)
        # First hit is the query point itself; zero-distance hits are dropped
        # because their inverse distance is undefined.
        usable = [
            (1.0 / d, sector_premiums[located[j]])
            for d, j in zip(dists[1:], idxs[1:])
            if d > 0 and located[j] in sector_premiums
        ]
        if not usable:
            continue

        inv_total = sum(inv for inv, _ in usable)
        nbr_w = 1.0 - self_w
        weighted = [(inv / inv_total * nbr_w, nbr) for inv, nbr in usable]

        own = sector_premiums[sec]
        blended = {}
        for et in EVENT_TYPES:
            val = self_w * own.get(et, 0.0)
            for w, nbr in weighted:
                val += w * nbr.get(et, 0.0)
            blended[et] = val
        smoothed[sec] = blended
    return smoothed
def main():
    """Estimate renovation premiums per sector, type group and event type.

    Reads repeat-sale pairs (``--input``) and the price index (``--index``),
    estimates premiums at national, area, district and sector level for each
    type group plus "All", applies hierarchical shrinkage and spatial
    smoothing, writes the result to ``--output`` and prints a summary.
    """
    cli = argparse.ArgumentParser(
        description="Estimate renovation premiums from repeat-sale residuals"
    )
    cli.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
    cli.add_argument(
        "--index", type=Path, required=True, help="Path to price_index.parquet"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output renovation_premium.parquet"
    )
    args = cli.parse_args()

    pairs = extract_pairs_with_events(args.input, args.index)
    centroids = extract_centroids(args.input)

    # Sector → district → area lookups
    all_sectors = pairs["sector"].unique().to_list()
    sector_to_dist, dist_to_area = {}, {}
    for sector in all_sectors:
        district, area = hierarchy_keys(sector)
        sector_to_dist[sector] = district
        dist_to_area[district] = area

    # True for pairs with at least one positive event indicator
    any_event = (
        (pl.col("has_extension") > 0)
        | (pl.col("has_renovation") > 0)
        | (pl.col("has_remodeling") > 0)
    )

    all_type_groups = ["All"] + TYPE_GROUPS
    rows = []
    for tg in all_type_groups:
        print(f"\n--- {tg} ---")
        if tg == "All":
            typed = pairs
        else:
            typed = pairs.filter(pl.col("type_group") == tg)
        n_typed = len(typed)
        if n_typed < MIN_PAIRS:
            print(f" Skipping (only {n_typed} pairs)")
            continue
        print(f" {n_typed:,} pairs")

        # National level
        national_prem = compute_premiums_for_group(typed)
        national_reno = typed.filter(any_event).height
        if not national_prem:
            print(" No renovation pairs at national level, skipping")
            continue
        summary = ", ".join(
            f"{et}: {v:.4f} ({math.exp(v) - 1:.1%})" for et, v in national_prem.items()
        )
        print(" National premiums: " + summary)

        # Area, district and sector levels
        print(" Computing per-level premiums:")
        area_prem, area_n = compute_premiums_for_level(typed, "area")
        district_prem, district_n = compute_premiums_for_level(typed, "district")
        sector_prem, sector_n = compute_premiums_for_level(typed, "sector")
        print(
            f" {len(area_prem)} areas, {len(district_prem)} districts, {len(sector_prem)} sectors with data"
        )

        print(" Applying shrinkage...")
        sector_shrunk = apply_shrinkage(
            sector_prem,
            sector_n,
            district_prem,
            district_n,
            area_prem,
            area_n,
            national_prem,
            national_reno,
            all_sectors,
            sector_to_dist,
            dist_to_area,
        )

        print(" Spatial smoothing...")
        sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)

        # One output row per sector and event type
        for sec in all_sectors:
            prem = sector_smoothed.get(sec, national_prem)
            n_reno = sector_n.get(sec, 0)
            rows.extend((sec, tg, et, prem.get(et, 0.0), n_reno) for et in EVENT_TYPES)

    schema = {
        "sector": pl.String,
        "type_group": pl.String,
        "event_type": pl.String,
        "log_premium": pl.Float64,
        "n_reno_pairs": pl.Int64,
    }
    result = pl.DataFrame(rows, schema=schema, orient="row").sort(
        "type_group", "sector", "event_type"
    )
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
    print(
        f" {result['sector'].n_unique():,} sectors x {len(all_type_groups)} types x {len(EVENT_TYPES)} events = {len(result):,} rows"
    )

    # Mean log-premium across sectors for the "All" type group
    print("\nNational premium summary:")
    per_event = (
        result.filter(pl.col("type_group") == "All")
        .group_by("event_type")
        .agg(pl.col("log_premium").mean().alias("mean_log_premium"))
    )
    for rec in per_event.iter_rows(named=True):
        et = rec["event_type"]
        lp = rec["mean_log_premium"]
        print(f" {et}: log_premium={lp:.4f} ({math.exp(lp) - 1:.1%} price uplift)")


if __name__ == "__main__":
    main()

View file

@ -26,6 +26,10 @@ dependencies = [
"pyproj>=3.7.2",
"pyshp>=2.3.0",
"folium>=0.20.0",
"flask",
"httpx",
"polars",
"fake-useragent>=2.2.0",
]
[tool.uv]

3
r5-java/.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
jdk/
lib/
out/

View file

@ -1,20 +0,0 @@
# --- Build stage: compile App.java against the downloaded JARs ---
FROM eclipse-temurin:21-jdk AS build
WORKDIR /app
# Download pre-built R5 fat JAR from GitHub Releases (includes all R5 deps)
ADD https://github.com/conveyal/r5/releases/download/v7.5/r5-v7.5-all.jar /app/lib/r5.jar
# Gson for JSON (HTTP server is built into JDK)
ADD https://repo1.maven.org/maven2/com/google/code/gson/gson/2.11.0/gson-2.11.0.jar /app/lib/gson.jar
COPY src/ src/
RUN javac -cp "lib/*" -d out src/main/java/propertymap/App.java

# --- Runtime stage: JRE plus the compiled classes and JARs ---
FROM eclipse-temurin:21-jre
WORKDIR /app
# Install only what is named: --no-install-recommends keeps the image small,
# so ca-certificates (normally pulled in as a curl recommendation) is listed
# explicitly to keep HTTPS requests working.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl ca-certificates \
    && rm -rf /var/lib/apt/lists/*
COPY --from=build /app/lib/ /app/lib/
COPY --from=build /app/out/ /app/out/
COPY entrypoint.sh /app/entrypoint.sh
RUN chmod +x /app/entrypoint.sh
ENTRYPOINT ["/app/entrypoint.sh"]

View file

@ -1,18 +0,0 @@
#!/bin/bash
set -e

# Fail fast with a clear message when a required location is not configured.
# Unset, these used to expand to the empty string, so the script would look
# for /network.dat and create /build at the filesystem root.
TRANSIT_DIR="${DATA_DIR:?DATA_DIR must be set}"
NETWORK_DIR="${NETWORK_CACHE_DIR:?NETWORK_CACHE_DIR must be set}"
BUILD_DIR="$NETWORK_DIR/build"

# If no cached network yet, copy transit data to a writable location for the build.
# R5 writes temp files (.mapdb) next to the OSM/GTFS files during network construction.
if [ ! -f "$NETWORK_DIR/network.dat" ]; then
  echo "No cached network — copying transit data to writable build dir..."
  mkdir -p "$BUILD_DIR"
  # Best effort: either source may legitimately have no matching files.
  cp "$OSM_DIR"/*.osm.pbf "$BUILD_DIR/" 2>/dev/null || true
  cp "$TRANSIT_DIR"/*.zip "$BUILD_DIR/" 2>/dev/null || true
  export DATA_DIR="$BUILD_DIR"
fi

exec java -Xmx16g -cp "out:lib/*" propertymap.App

129
r5-java/run.sh Executable file
View file

@ -0,0 +1,129 @@
#!/bin/bash
set -euo pipefail
# Batch-compute travel times from all places to all England postcodes
# for all transport modes (car, bicycle, walking, transit).
#
# Uses each place as origin with all postcodes as destinations — R5 does one
# routing computation per place, then reads off travel times to all postcodes.
# For car/bicycle/walking this is symmetric (place->postcode = postcode->place).
#
# Output: property-data/travel-times/{mode}/
#   - {index}.parquet files: (pcds VARCHAR, travel_minutes SMALLINT), one per place
#   - postcodes_ref.parquet: postcode order reference
#   - places_ref.parquet: place order reference
#
# Usage:
#   ./r5-java/run.sh                  # defaults: 28 threads, 40g heap
#   ./r5-java/run.sh --threads 8
#   ./r5-java/run.sh --heap 24g
#   ./r5-java/run.sh --network-dir path/to/network-cache

# --- Defaults ---
THREADS=28
HEAP=40g
NETWORK_DIR=property-data/r5-network
OUTPUT_BASE=property-data/travel-times
R5_DIR=r5-java

# fetch URL DEST — download to a temporary name and rename only on success, so
# an interrupted transfer never leaves a truncated file that passes the
# "already downloaded" checks below.
fetch() {
  local url="$1" dest="$2"
  curl -fL -o "$dest.partial" "$url"
  mv "$dest.partial" "$dest"
}

# --- Parse args ---
while [[ $# -gt 0 ]]; do
  case $1 in
    --threads) THREADS="$2"; shift 2 ;;
    --heap) HEAP="$2"; shift 2 ;;
    --network-dir) NETWORK_DIR="$2"; shift 2 ;;
    *) echo "Unknown: $1"; exit 1 ;;
  esac
done

# --- Verify we're in project root ---
if [ ! -f property-data/places.parquet ] || [ ! -f property-data/arcgis_data.parquet ]; then
  echo "Error: run from the property-map project root"
  exit 1
fi

echo "=== R5 Batch Travel Times ==="
echo "Threads: $THREADS | Heap: $HEAP"
echo ""

# --- Step 1: Download JDK if needed ---
JDK_DIR="$R5_DIR/jdk"
# Test for the java binary rather than the directory, and unpack into a scratch
# directory first: a failed download used to leave an empty or partial jdk/
# behind, which made every later run skip this step and then fail to find java.
if [ ! -x "$JDK_DIR/bin/java" ]; then
  echo "--- Downloading JDK 21 ---"
  ARCH=$(uname -m)
  case "$ARCH" in
    x86_64|amd64) JDK_ARCH="x64" ;;
    aarch64|arm64) JDK_ARCH="aarch64" ;;
    *) echo "Unsupported architecture: $ARCH"; exit 1 ;;
  esac
  JDK_URL="https://api.adoptium.net/v3/binary/latest/21/ga/linux/${JDK_ARCH}/jdk/hotspot/normal/eclipse"
  JDK_TMP="$JDK_DIR.partial"
  rm -rf "$JDK_TMP"
  mkdir -p "$JDK_TMP"
  curl -fL "$JDK_URL" | tar xz --strip-components=1 -C "$JDK_TMP"
  rm -rf "$JDK_DIR"
  mv "$JDK_TMP" "$JDK_DIR"
fi
export JAVA_HOME="$JDK_DIR"
export PATH="$JAVA_HOME/bin:$PATH"

# --- Step 2: Download library JARs ---
LIB_DIR="$R5_DIR/lib"
mkdir -p "$LIB_DIR"
R5_JAR="$LIB_DIR/r5.jar"
DUCKDB_JAR="$LIB_DIR/duckdb.jar"
if [ ! -f "$R5_JAR" ]; then
  echo "--- Downloading R5 v7.5 fat JAR ---"
  fetch https://github.com/conveyal/r5/releases/download/v7.5/r5-v7.5-all.jar "$R5_JAR"
fi
if [ ! -f "$DUCKDB_JAR" ]; then
  echo "--- Downloading DuckDB JDBC ---"
  fetch https://repo1.maven.org/maven2/org/duckdb/duckdb_jdbc/1.0.0/duckdb_jdbc-1.0.0.jar "$DUCKDB_JAR"
fi

# --- Step 3: Compile Java source ---
OUT_DIR="$R5_DIR/out"
SRC_DIR="$R5_DIR/src/main/java/propertymap"
# Recompile when any source file is missing its .class file or is newer than it.
NEEDS_COMPILE=false
for src in "$SRC_DIR"/*.java; do
  class="$OUT_DIR/propertymap/$(basename "${src%.java}").class"
  if [ ! -f "$class" ] || [ "$src" -nt "$class" ]; then
    NEEDS_COMPILE=true
    break
  fi
done
if $NEEDS_COMPILE; then
  echo "--- Compiling Java source ---"
  mkdir -p "$OUT_DIR"
  javac -cp "$LIB_DIR/*" -d "$OUT_DIR" "$SRC_DIR"/*.java
fi

# --- Step 4: Prepare network build directory ---
# R5 writes .mapdb temp files next to OSM/GTFS files during network construction.
# Copy source data to a writable build dir to avoid polluting the originals.
mkdir -p "$NETWORK_DIR"
DATA_DIR="property-data/transit"
if [ ! -f "$NETWORK_DIR/network.dat" ]; then
  BUILD_DIR="$NETWORK_DIR/build"
  echo "--- No cached network — copying transit data to build dir ---"
  mkdir -p "$BUILD_DIR"
  # Best effort: either glob may legitimately match nothing.
  cp property-data/transit/raw/*.osm.pbf "$BUILD_DIR/" 2>/dev/null || true
  cp property-data/transit/*.zip "$BUILD_DIR/" 2>/dev/null || true
  DATA_DIR="$BUILD_DIR"
fi

# --- Step 5: Run batch ---
echo ""
echo "--- Starting batch computation ---"
DATA_DIR="$DATA_DIR" NETWORK_CACHE_DIR="$NETWORK_DIR" \
  java -Xmx"$HEAP" -cp "$OUT_DIR:$LIB_DIR/*" propertymap.App \
  --postcodes property-data/arcgis_data.parquet \
  --places property-data/places.parquet \
  --output-dir "$OUTPUT_BASE" \
  --threads "$THREADS"

echo ""
echo "=== Complete ==="
echo "Output: $OUTPUT_BASE/{car,bicycle,walking,transit}/"
echo "Reference: $OUTPUT_BASE/postcodes_ref.parquet, $OUTPUT_BASE/places_ref.parquet"

View file

@ -1,223 +1,208 @@
package propertymap;
import com.conveyal.r5.OneOriginResult;
import com.conveyal.r5.analyst.FreeFormPointSet;
import com.conveyal.r5.analyst.PointSet;
import com.conveyal.r5.analyst.TravelTimeComputer;
import com.conveyal.r5.analyst.WebMercatorExtents;
import com.conveyal.r5.analyst.cluster.RegionalTask;
import com.conveyal.r5.analyst.cluster.TravelTimeResult;
import com.conveyal.r5.api.util.LegMode;
import com.conveyal.r5.api.util.TransitModes;
import com.conveyal.r5.kryo.KryoNetworkSerializer;
import com.conveyal.r5.transit.TransportNetwork;
import com.google.gson.Gson;
import com.sun.net.httpserver.HttpExchange;
import com.sun.net.httpserver.HttpServer;
import org.locationtech.jts.geom.Coordinate;
import org.duckdb.DuckDBConnection;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.net.InetSocketAddress;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.util.EnumSet;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
* Batch-compute travel times from each origin (place) to all destinations (postcodes)
* for all transport modes (car, bicycle, walking, transit).
*
* Output per mode: one parquet file per origin in {output-dir}/{mode}/{index}.parquet
* with columns (pcds VARCHAR, travel_minutes SMALLINT). -1 = unreachable within 120 min.
*/
public class App {
private static TransportNetwork network;
private static final Gson gson = new Gson();
static class TravelTimeRequest {
double[] origin; // [lat, lon]
double[][] destinations; // [[lat, lon], ...]
String mode; // "transit", "car", "bicycle", "walking"
}
static class TravelTimeResponse {
double[] travel_times; // minutes, -1 = unreachable
}
private static final String[] MODES = {"car", "bicycle", "walking", "transit"};
private static final int MAX_RETRIES = 2;
public static void main(String[] args) throws Exception {
String dataDir = System.getenv("DATA_DIR");
String postcodesPath = requiredArg(args, "--postcodes");
String placesPath = requiredArg(args, "--places");
String outputDirStr = requiredArg(args, "--output-dir");
int threads = Integer.parseInt(optionalArg(args, "--threads", "4"));
if (dataDir == null) {
System.err.println("Error: DATA_DIR environment variable not set");
System.exit(1);
}
Path outDir = Paths.get(outputDirStr);
Files.createDirectories(outDir);
String networkCacheDir = System.getenv("NETWORK_CACHE_DIR");
if (networkCacheDir == null) {
System.err.println("Error: NETWORK_CACHE_DIR environment variable not set");
System.exit(1);
}
LocalDate today = LocalDate.now();
TransportNetwork network = Router.loadNetwork(requiredEnv("DATA_DIR"), requiredEnv("NETWORK_CACHE_DIR"));
System.out.println("Loading transport network from " + dataDir);
System.out.println("Network cache dir: " + networkCacheDir);
System.err.println("Loading postcodes (England only)...");
Parquet.Postcodes postcodes = Parquet.loadEnglandPostcodes(
postcodesPath, outDir.resolve("postcodes_ref.parquet"));
int nDest = postcodes.lats().length;
System.err.printf(" %,d postcodes%n", nDest);
File cacheFile = new File(networkCacheDir, "network.dat");
if (cacheFile.exists()) {
System.out.println("Loading cached network from " + cacheFile);
network = KryoNetworkSerializer.read(cacheFile);
} else {
System.out.println("Building network (first run, this takes a few minutes)...");
network = TransportNetwork.fromDirectory(new File(dataDir));
new File(networkCacheDir).mkdirs();
KryoNetworkSerializer.write(network, cacheFile);
System.out.println("Network cached to " + cacheFile);
}
List<Router.DestinationChunk> chunks = Router.buildDestinationChunks(postcodes.lats(), postcodes.lons());
// Build stop-to-vertex distance tables (needed for egress routing in transit mode).
// Not built by fromDirectory() and too large to fit in the Kryo cache with 4GB heap.
System.out.println("Building stop-to-vertex distance tables...");
network.transitLayer.buildDistanceTables(null);
System.out.println("Distance tables built");
System.err.println("Loading places (deduplicated)...");
double[][] placesLatLon = Parquet.loadPlaces(placesPath, outDir.resolve("places_ref.parquet"));
double[] originLats = placesLatLon[0], originLons = placesLatLon[1];
int nOrigins = originLats.length;
System.err.printf(" %,d places%n", nOrigins);
System.err.printf(" Estimated output: %.1f GB (%,d x %,d x 2B)%n",
(double) nOrigins * nDest * 2 / 1e9, nOrigins, nDest);
System.out.println("Transport network loaded successfully");
HttpServer server = HttpServer.create(new InetSocketAddress(8003), 0);
server.createContext("/health", exchange -> {
sendResponse(exchange, 200, "ok");
// One thread pool shared across all modes
ExecutorService pool = Executors.newFixedThreadPool(threads);
// One DuckDB connection per thread, reused across all writes
ThreadLocal<DuckDBConnection> threadConn = ThreadLocal.withInitial(() -> {
try { return Parquet.connect(); }
catch (Exception e) { throw new RuntimeException(e); }
});
server.createContext("/travel-times", exchange -> {
if (!"POST".equals(exchange.getRequestMethod())) {
sendResponse(exchange, 405, "Method not allowed");
return;
try {
for (String mode : MODES) {
processMode(network, chunks, postcodes.codes(), originLats, originLons,
nDest, outDir, mode, today, pool, threadConn);
}
} finally {
pool.shutdown();
pool.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS);
}
}
/**
 * Compute and write travel times for every origin of one mode that has no output file yet.
 *
 * Origins that already have a non-empty parquet under outDir/mode are skipped, so a run can
 * be resumed. One task per remaining origin is submitted to the shared pool; a failing
 * origin is counted and logged but does not abort the mode.
 *
 * @param pool       shared executor the per-origin tasks are submitted to; not shut down here
 * @param threadConn supplies the DuckDB connection each worker thread hands to processOrigin
 * @throws Exception if the mode directory cannot be created or scanned, or if waiting for
 *                   the submitted tasks is interrupted
 */
private static void processMode(
        TransportNetwork network, List<Router.DestinationChunk> chunks,
        String[] postcodes, double[] originLats, double[] originLons, int nDest,
        Path outDir, String mode, LocalDate date,
        ExecutorService pool, ThreadLocal<DuckDBConnection> threadConn) throws Exception {
    int nOrigins = originLats.length;
    System.err.printf("%n=== %s ===%n", mode.toUpperCase());
    Path modeDir = outDir.resolve(mode);
    Files.createDirectories(modeDir);
    List<Integer> remaining = findRemaining(modeDir, nOrigins);
    int alreadyDone = nOrigins - remaining.size();
    System.err.printf(" %,d done, %,d remaining%n", alreadyDone, remaining.size());
    if (remaining.isEmpty()) {
        System.err.println(" All origins completed for this mode!");
        return;
    }
    long startMs = System.currentTimeMillis();
    int total = remaining.size();
    AtomicInteger completed = new AtomicInteger(0);
    AtomicInteger failed = new AtomicInteger(0);
    // Progress reporter on a timer instead of per-task stderr writes
    ScheduledExecutorService reporter = Executors.newSingleThreadScheduledExecutor(r -> {
        Thread t = new Thread(r, "progress");
        t.setDaemon(true);
        return t;
    });
    try {
        reporter.scheduleAtFixedRate(() -> {
            int c = completed.get();
            if (c == 0) return; // no rate to extrapolate from yet
            double secs = (System.currentTimeMillis() - startMs) / 1000.0;
            double rate = c / secs;
            double etaH = (total - c) / rate / 3600;
            System.err.printf("\r [%,d/%,d] %.1f/s | ETA %.1fh | fail %d",
                    c, total, rate, etaH, failed.get());
        }, 2, 2, TimeUnit.SECONDS);
        // One latch count per submitted task; every task counts down in its finally block,
        // so await() returns once all of them have finished, successfully or not.
        java.util.concurrent.CountDownLatch latch =
                new java.util.concurrent.CountDownLatch(remaining.size());
        for (int idx : remaining) {
            pool.submit(() -> {
                try {
                    processOrigin(network, chunks, postcodes, originLats[idx], originLons[idx],
                            nDest, modeDir, mode, date, idx, threadConn.get());
                    completed.incrementAndGet();
                } catch (Exception e) {
                    failed.incrementAndGet();
                    System.err.printf("%n [FAIL] origin %d: %s%n", idx, e.getMessage());
                } finally {
                    latch.countDown();
                }
            });
        }
        latch.await();
    } finally {
        // Stop the progress timer even when submission is rejected or the wait is
        // interrupted; otherwise it would keep printing stale progress lines.
        reporter.shutdown();
    }
    double elapsedH = (System.currentTimeMillis() - startMs) / 3_600_000.0;
    int n = completed.get();
    System.err.printf("\r [%,d/%,d] %.1f/s | %.1fh | fail %d%n",
            n, total, n / Math.max(elapsedH * 3600, 1), elapsedH, failed.get());
}
/** Compute and write travel times for a single origin, with retry on failure. */
private static void processOrigin(
TransportNetwork network, List<Router.DestinationChunk> chunks,
String[] postcodes, double lat, double lon, int nDest,
Path modeDir, String mode, LocalDate date, int idx,
DuckDBConnection conn) throws Exception {
Path outPath = modeDir.resolve(String.format("%06d.parquet", idx));
Exception lastError = null;
for (int attempt = 0; attempt <= MAX_RETRIES; attempt++) {
try {
handleTravelTimes(exchange);
short[] times = Router.computeTravelTimes(network, chunks, lat, lon, mode, nDest, date);
Parquet.writeTravelTimes(conn, outPath, postcodes, times);
return;
} catch (Exception e) {
System.err.println("Error handling travel-times: " + e.getMessage());
e.printStackTrace();
sendResponse(exchange, 500, "Internal server error: " + e.getMessage());
}
});
server.setExecutor(java.util.concurrent.Executors.newFixedThreadPool(4));
server.start();
System.out.println("R5 service listening on port 8003");
}
/**
 * Write {@code body} to the exchange as UTF-8 with the given HTTP status.
 * The Content-Type header is always set to application/json and the response
 * body stream is closed afterwards.
 */
private static void sendResponse(HttpExchange exchange, int status, String body) throws IOException {
    final byte[] payload = body.getBytes(StandardCharsets.UTF_8);
    final int payloadLength = payload.length;
    exchange.getResponseHeaders().set("Content-Type", "application/json");
    exchange.sendResponseHeaders(status, payloadLength);
    try (OutputStream sink = exchange.getResponseBody()) {
        sink.write(payload);
    }
}
private static void handleTravelTimes(HttpExchange exchange) throws IOException {
long t0 = System.currentTimeMillis();
String body = new String(exchange.getRequestBody().readAllBytes(), StandardCharsets.UTF_8);
TravelTimeRequest req = gson.fromJson(body, TravelTimeRequest.class);
if (req.origin == null || req.origin.length != 2) {
sendResponse(exchange, 400, "{\"error\":\"origin must be [lat, lon]\"}");
return;
}
if (req.destinations == null || req.destinations.length == 0) {
sendResponse(exchange, 400, "{\"error\":\"destinations must be non-empty\"}");
return;
}
String mode = req.mode != null ? req.mode : "transit";
// Build destination point set (Coordinate takes x=lon, y=lat)
Coordinate[] coords = new Coordinate[req.destinations.length];
for (int i = 0; i < req.destinations.length; i++) {
coords[i] = new Coordinate(req.destinations[i][1], req.destinations[i][0]); // lon, lat
}
FreeFormPointSet destinations = new FreeFormPointSet(coords);
// Build the regional task
RegionalTask task = new RegionalTask();
task.fromLat = req.origin[0];
task.fromLon = req.origin[1];
task.date = LocalDate.now();
task.percentiles = new int[]{50};
task.recordTimes = true;
task.destinationPointSets = new PointSet[]{ destinations };
// Set grid extents from destination point set (required by TravelTimeComputer)
WebMercatorExtents extents = destinations.getWebMercatorExtents();
task.zoom = extents.zoom;
task.west = extents.west;
task.north = extents.north;
task.width = extents.width;
task.height = extents.height;
switch (mode) {
case "car":
task.fromTime = 8 * 3600;
task.toTime = 8 * 3600 + 60;
task.maxTripDurationMinutes = 120;
task.accessModes = EnumSet.of(LegMode.CAR);
task.egressModes = EnumSet.of(LegMode.CAR);
task.directModes = EnumSet.of(LegMode.CAR);
task.transitModes = EnumSet.noneOf(TransitModes.class);
break;
case "bicycle":
task.fromTime = 8 * 3600;
task.toTime = 8 * 3600 + 60;
task.maxTripDurationMinutes = 120;
task.accessModes = EnumSet.of(LegMode.BICYCLE);
task.egressModes = EnumSet.of(LegMode.BICYCLE);
task.directModes = EnumSet.of(LegMode.BICYCLE);
task.transitModes = EnumSet.noneOf(TransitModes.class);
break;
case "walking":
task.fromTime = 8 * 3600;
task.toTime = 8 * 3600 + 60;
task.maxTripDurationMinutes = 120;
task.accessModes = EnumSet.of(LegMode.WALK);
task.egressModes = EnumSet.of(LegMode.WALK);
task.directModes = EnumSet.of(LegMode.WALK);
task.transitModes = EnumSet.noneOf(TransitModes.class);
break;
default: // transit
task.fromTime = 8 * 3600;
task.toTime = 8 * 3600 + 60; // single RAPTOR sweep
task.maxTripDurationMinutes = 120;
task.maxRides = 4;
task.accessModes = EnumSet.of(LegMode.WALK);
task.egressModes = EnumSet.of(LegMode.WALK);
task.directModes = EnumSet.of(LegMode.WALK);
task.transitModes = EnumSet.of(TransitModes.TRANSIT);
break;
}
// Compute travel times
TravelTimeComputer computer = new TravelTimeComputer(task, network);
OneOriginResult result = computer.computeTravelTimes();
TravelTimeResponse response = new TravelTimeResponse();
response.travel_times = new double[req.destinations.length];
TravelTimeResult tt = result.travelTimes;
if (tt != null) {
int[][] values = tt.getValues();
// values[percentileIndex][destinationIndex]
for (int i = 0; i < req.destinations.length; i++) {
if (i < values[0].length && values[0][i] != Integer.MAX_VALUE) {
response.travel_times[i] = values[0][i]; // already in minutes
} else {
response.travel_times[i] = -1; // unreachable
lastError = e;
if (attempt < MAX_RETRIES) {
System.err.printf("%n [RETRY %d/%d] origin %d: %s%n",
attempt + 1, MAX_RETRIES, idx, e.getMessage());
}
}
} else {
for (int i = 0; i < req.destinations.length; i++) {
response.travel_times[i] = -1;
}
throw lastError;
}
/**
 * List the origin indices in [0, nOrigins) that still need computing: those whose
 * zero-padded {@code NNNNNN.parquet} file in modeDir is missing or empty.
 */
private static List<Integer> findRemaining(Path modeDir, int nOrigins) throws Exception {
    List<Integer> pending = new ArrayList<>();
    int origin = 0;
    while (origin < nOrigins) {
        Path candidate = modeDir.resolve(String.format("%06d.parquet", origin));
        // Only a file that exists with at least one byte counts as finished.
        boolean finished = Files.exists(candidate) && Files.size(candidate) != 0;
        if (!finished) {
            pending.add(origin);
        }
        origin++;
    }
    return pending;
}
long elapsed = System.currentTimeMillis() - t0;
System.out.println("Travel times (" + mode + ") computed for " + req.destinations.length +
" destinations in " + elapsed + "ms");
/**
 * Return the token that follows the first occurrence of flag {@code name} in args.
 * When the flag is absent (or is the last token), print a usage message to stderr
 * and terminate the JVM with exit status 1.
 */
private static String requiredArg(String[] args, String name) {
    // A flag needs a following value, so the last token is never tested as a flag.
    for (int pos = 0; pos + 1 < args.length; pos++) {
        String flag = args[pos];
        if (flag.equals(name)) {
            return args[pos + 1];
        }
    }
    System.err.println("Missing required argument: " + name);
    System.err.println("Usage: App --postcodes FILE --places FILE --output-dir DIR [--threads N]");
    System.exit(1);
    return null; // unreachable
}
sendResponse(exchange, 200, gson.toJson(response));
/**
 * Return the token that follows the first occurrence of flag {@code name} in args,
 * or {@code defaultValue} when the flag is absent or is the last token.
 */
private static String optionalArg(String[] args, String name, String defaultValue) {
    int pos = 0;
    // Stop one short of the end: a trailing flag has no value to return.
    while (pos + 1 < args.length) {
        if (args[pos].equals(name)) {
            return args[pos + 1];
        }
        pos++;
    }
    return defaultValue;
}
/**
 * Return the value of environment variable {@code name}; if it is not set, report
 * that on stderr and terminate the JVM with exit status 1.
 */
private static String requiredEnv(String name) {
    String value = System.getenv(name);
    if (value != null) {
        return value;
    }
    System.err.println("Missing required environment variable: " + name);
    System.exit(1);
    return null; // unreachable
}
}

View file

@ -0,0 +1,112 @@
package propertymap;
import org.duckdb.DuckDBAppender;
import org.duckdb.DuckDBConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
/** DuckDB-based parquet I/O. */
public class Parquet {
/**
 * Postcodes held as three parallel arrays: position i of codes, lats and lons
 * describes the same postcode.
 */
record Postcodes(String[] codes, double[] lats, double[] lons) {}
// Load the DuckDB JDBC driver class when this class is initialised; a missing
// driver surfaces as an unchecked RuntimeException.
static {
try { Class.forName("org.duckdb.DuckDBDriver"); }
catch (ClassNotFoundException e) { throw new RuntimeException(e); }
}
/**
 * Load England postcodes, write reference parquet, return codes + flat lat/lon arrays.
 *
 * Rows with ctry = 'E92000001' are read from parquetPath into an in-memory table, copied
 * to refOut, and returned as parallel arrays (same index = same postcode).
 *
 * @param parquetPath source parquet file containing pcds, lat, long and ctry columns
 * @param refOut      where the filtered reference parquet is written
 * @throws Exception on any DuckDB or I/O failure
 */
static Postcodes loadEnglandPostcodes(String parquetPath, Path refOut) throws Exception {
    // Double single quotes so a path containing ' stays inside the SQL string literal.
    String source = parquetPath.replace("'", "''");
    try (DuckDBConnection conn = connect(); Statement stmt = conn.createStatement()) {
        stmt.execute("CREATE TABLE postcodes AS SELECT pcds, lat, \"long\" FROM read_parquet('"
                + source + "') WHERE ctry = 'E92000001'");
        copyToParquet(stmt, "SELECT * FROM postcodes", refOut);
        // Read and close the count result before reusing the statement: a JDBC Statement
        // supports only one open ResultSet at a time.
        int n;
        try (ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM postcodes")) {
            rs.next();
            n = rs.getInt(1);
        }
        String[] codes = new String[n];
        double[] lats = new double[n];
        double[] lons = new double[n];
        try (ResultSet data = stmt.executeQuery("SELECT pcds, lat, \"long\" FROM postcodes")) {
            int i = 0;
            while (data.next()) {
                codes[i] = data.getString(1);
                lats[i] = data.getDouble(2);
                lons[i] = data.getDouble(3);
                i++;
            }
        }
        return new Postcodes(codes, lats, lons);
    }
}
/**
 * Load places deduplicated by lat/lon, write reference parquet, return flat lat/lon arrays.
 *
 * The deduplicated rows are stored sorted by (lat, lon). A row's position is what callers
 * use as the origin index, so it has to be the same on every run over the same input;
 * DuckDB does not document a row order for the output of a partitioned window.
 *
 * @param parquetPath source parquet file with at least lat and lon columns
 * @param refOut      where the deduplicated reference parquet is written
 * @return {@code [lats, lons]}, two arrays of equal length
 * @throws Exception on any DuckDB or I/O failure
 */
static double[][] loadPlaces(String parquetPath, Path refOut) throws Exception {
    // Double single quotes so a path containing ' stays inside the SQL string literal.
    String source = parquetPath.replace("'", "''");
    try (DuckDBConnection conn = connect(); Statement stmt = conn.createStatement()) {
        // rn = 1 keeps one row per distinct (lat, lon); ORDER BY pins the stored row order.
        stmt.execute("CREATE TABLE places AS SELECT * EXCLUDE (rn) FROM ("
                + "SELECT *, ROW_NUMBER() OVER (PARTITION BY lat, lon) AS rn "
                + "FROM read_parquet('" + source + "')) WHERE rn = 1 ORDER BY lat, lon");
        copyToParquet(stmt, "SELECT * FROM places", refOut);
        // Read and close the count result before reusing the statement: a JDBC Statement
        // supports only one open ResultSet at a time.
        int n;
        try (ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM places")) {
            rs.next();
            n = rs.getInt(1);
        }
        // Return as [lats, lons] flat arrays
        double[] lats = new double[n];
        double[] lons = new double[n];
        try (ResultSet data = stmt.executeQuery("SELECT lat, lon FROM places")) {
            int i = 0;
            while (data.next()) {
                lats[i] = data.getDouble(1);
                lons[i] = data.getDouble(2);
                i++;
            }
        }
        return new double[][]{lats, lons};
    }
}
/**
 * Write postcode travel times as a ZSTD-compressed parquet (atomic via tmp + rename).
 *
 * The rows are staged in table {@code t} of the given connection (dropped and recreated
 * on every call), exported to {@code <outPath>.tmp} and then moved onto outPath. If any
 * step fails, the temporary file is removed before the exception is rethrown.
 *
 * @param conn      connection whose table {@code t} is used as scratch space
 * @param outPath   final parquet location
 * @param postcodes postcode of each row
 * @param times     travel minutes of each row; must have at least postcodes.length entries
 * @throws IllegalArgumentException if times is shorter than postcodes
 * @throws Exception                on any DuckDB or I/O failure
 */
static void writeTravelTimes(DuckDBConnection conn, Path outPath, String[] postcodes, short[] times)
        throws Exception {
    if (times.length < postcodes.length) {
        // Fail before touching the database instead of part-way through the append loop.
        throw new IllegalArgumentException("times has " + times.length
                + " entries but there are " + postcodes.length + " postcodes");
    }
    Path tmp = outPath.resolveSibling(outPath.getFileName() + ".tmp");
    try {
        try (Statement stmt = conn.createStatement()) {
            stmt.execute("DROP TABLE IF EXISTS t");
            stmt.execute("CREATE TABLE t (pcds VARCHAR, travel_minutes SMALLINT)");
        }
        try (DuckDBAppender appender = conn.createAppender("main", "t")) {
            for (int i = 0; i < postcodes.length; i++) {
                appender.beginRow();
                appender.append(postcodes[i]);
                appender.append(times[i]);
                appender.endRow();
            }
        }
        try (Statement stmt = conn.createStatement()) {
            // Double single quotes so the path stays inside the SQL string literal.
            String target = tmp.toAbsolutePath().toString().replace("'", "''");
            stmt.execute("COPY t TO '" + target + "' (FORMAT PARQUET, COMPRESSION ZSTD)");
        }
        Files.move(tmp, outPath, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
    } catch (Exception e) {
        // Do not leave a partial .tmp file behind; keep the original failure as the cause.
        try {
            Files.deleteIfExists(tmp);
        } catch (Exception cleanup) {
            e.addSuppressed(cleanup);
        }
        throw e;
    }
}
/**
 * Open a fresh DuckDB connection on the bare {@code jdbc:duckdb:} URL (no database file),
 * intended to be held and reused by a single thread.
 */
static DuckDBConnection connect() throws Exception {
    final String url = "jdbc:duckdb:";
    java.sql.Connection raw = DriverManager.getConnection(url);
    return (DuckDBConnection) raw;
}
/**
 * Run {@code query} and write its result to {@code outPath} as a ZSTD-compressed parquet file.
 *
 * @param stmt    statement used to execute the COPY
 * @param query   SELECT text, embedded verbatim inside {@code COPY (...)}
 * @param outPath destination file; resolved to an absolute path
 * @throws Exception if the COPY statement fails
 */
private static void copyToParquet(Statement stmt, String query, Path outPath) throws Exception {
    // Double single quotes so a path such as /data/o'brien stays inside the SQL string literal.
    String target = outPath.toAbsolutePath().toString().replace("'", "''");
    stmt.execute("COPY (" + query + ") TO '" + target + "' (FORMAT PARQUET, COMPRESSION ZSTD)");
}
}

View file

@ -0,0 +1,211 @@
package propertymap;
import com.conveyal.r5.OneOriginResult;
import com.conveyal.r5.analyst.FreeFormPointSet;
import com.conveyal.r5.analyst.PointSet;
import com.conveyal.r5.analyst.TravelTimeComputer;
import com.conveyal.r5.analyst.WebMercatorExtents;
import com.conveyal.r5.analyst.cluster.RegionalTask;
import com.conveyal.r5.analyst.cluster.TravelTimeResult;
import com.conveyal.r5.api.util.LegMode;
import com.conveyal.r5.api.util.TransitModes;
import com.conveyal.r5.kryo.KryoNetworkSerializer;
import com.conveyal.r5.transit.TransportNetwork;
import org.locationtech.jts.geom.Coordinate;
import java.io.File;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
/** R5 routing: network loading, point set construction, travel time computation. */
public class Router {
// Web Mercator zoom level used for every destination grid; the world is
// 256 << ZOOM pixels across at this zoom.
private static final int ZOOM = 9;
private static final int MAX_GRID_CELLS = 4_900_000; // under R5's 5M limit
/**
 * A chunk of destinations that fits within R5's grid cell limit at zoom 9.
 * originalIndices maps each position in this chunk back to the full destinations array.
 * pointSet holds the chunk's destination coordinates; extents is the pixel bounding box
 * (west, north, width, height, zoom) computed from those coordinates.
 */
record DestinationChunk(FreeFormPointSet pointSet, WebMercatorExtents extents, int[] originalIndices) {}
/**
 * Return a transport network with distance tables built.
 *
 * If {@code cacheDir/network.dat} exists it is deserialised with Kryo; otherwise the
 * network is built from dataDir and written to that cache file first. Distance tables
 * are built after either path.
 */
static TransportNetwork loadNetwork(String dataDir, String cacheDir) throws Exception {
    System.err.println("Loading transport network...");
    final File cached = new File(cacheDir, "network.dat");
    final TransportNetwork net;
    if (!cached.exists()) {
        System.err.println(" Building network (first run, takes a few minutes)...");
        net = TransportNetwork.fromDirectory(new File(dataDir));
        File cacheRoot = new File(cacheDir);
        cacheRoot.mkdirs();
        KryoNetworkSerializer.write(net, cached);
        System.err.println(" Cached to " + cached);
    } else {
        System.err.println(" Loading cached network from " + cached);
        net = KryoNetworkSerializer.read(cached);
    }
    System.err.println(" Building distance tables...");
    net.transitLayer.buildDistanceTables(null);
    System.err.println(" Network ready");
    return net;
}
/**
 * Partition the destinations into latitude bands small enough for R5's grid cell limit.
 *
 * Destinations are ordered by ascending latitude. Each band starts at the next unassigned
 * destination and grows while its pixel height at zoom 9, multiplied by the pixel width of
 * the whole destination set, stays within MAX_GRID_CELLS.
 */
static List<DestinationChunk> buildDestinationChunks(double[] lats, double[] lons) {
    final int count = lats.length;
    final int worldPixels = 256 << ZOOM;

    // Permutation of destination indices in ascending-latitude order.
    Integer[] byLat = new Integer[count];
    for (int k = 0; k < count; k++) {
        byLat[k] = k;
    }
    Arrays.sort(byLat, (p, q) -> Double.compare(lats[p], lats[q]));

    // Every band is budgeted against the full longitude span of the data set.
    double west = Double.MAX_VALUE;
    double east = -Double.MAX_VALUE;
    for (int k = 0; k < lons.length; k++) {
        west = Math.min(west, lons[k]);
        east = Math.max(east, lons[k]);
    }
    int gridWidth = lonToPixel(east, worldPixels) - lonToPixel(west, worldPixels) + 1;
    int maxHeight = MAX_GRID_CELLS / gridWidth;

    List<DestinationChunk> bands = new ArrayList<>();
    int from = 0;
    while (from < count) {
        int anchorPixel = latToPixel(lats[byLat[from]], worldPixels);
        int to = from + 1;
        // Grow the band while the next destination still fits within maxHeight rows.
        while (to < count
                && Math.abs(latToPixel(lats[byLat[to]], worldPixels) - anchorPixel) + 1 <= maxHeight) {
            to++;
        }
        bands.add(buildChunk(lats, lons, byLat, from, to));
        from = to;
    }
    System.err.printf(" Split into %d chunks at zoom %d (grid width %d, max height %d)%n",
            bands.size(), ZOOM, gridWidth, maxHeight);
    return bands;
}
/**
 * Route from one origin to every destination, one R5 computation per chunk.
 *
 * @return an array of length nDest in the original destination order; entries stay -1
 *         where R5 reported no travel time (Integer.MAX_VALUE) or returned no result
 */
static short[] computeTravelTimes(
        TransportNetwork network, List<DestinationChunk> chunks,
        double originLat, double originLon, String mode, int nDest, LocalDate date) {
    short[] result = new short[nDest];
    Arrays.fill(result, (short) -1);
    for (DestinationChunk chunk : chunks) {
        RegionalTask task = buildTask(chunk, originLat, originLon, mode, date);
        OneOriginResult oneOrigin = new TravelTimeComputer(task, network).computeTravelTimes();
        TravelTimeResult tt = oneOrigin.travelTimes;
        if (tt == null) {
            continue;
        }
        int[][] values = tt.getValues();
        int[] mapping = chunk.originalIndices();
        // Row 0 is the only requested percentile; scatter it back to the global positions.
        for (int k = 0; k < mapping.length && k < values[0].length; k++) {
            int minutes = values[0][k];
            if (minutes == Integer.MAX_VALUE) {
                continue;
            }
            result[mapping[k]] = (short) minutes;
        }
    }
    return result;
}
/**
 * Build one chunk from the destinations at positions [start, end) of the sorted index
 * array: a point set of their coordinates, the zoom-level pixel bounding box around them,
 * and the mapping from chunk position back to the original destination index.
 */
private static DestinationChunk buildChunk(
        double[] lats, double[] lons, Integer[] sorted, int start, int end) {
    final int count = end - start;
    final int[] sourceIndex = new int[count];
    final Coordinate[] points = new Coordinate[count];
    double south = Double.MAX_VALUE;
    double north = -Double.MAX_VALUE;
    double west = Double.MAX_VALUE;
    double east = -Double.MAX_VALUE;
    for (int k = 0; k < count; k++) {
        int src = sorted[start + k];
        double y = lats[src];
        double x = lons[src];
        sourceIndex[k] = src;
        points[k] = new Coordinate(x, y); // x=lon, y=lat
        south = Math.min(south, y);
        north = Math.max(north, y);
        west = Math.min(west, x);
        east = Math.max(east, x);
    }
    final int worldPixels = 256 << ZOOM;
    // Pixel rows grow southwards, so the northern edge is the smaller row number.
    int westPx = lonToPixel(west, worldPixels);
    int northPx = latToPixel(north, worldPixels);
    int widthPx = lonToPixel(east, worldPixels) - westPx + 1;
    int heightPx = latToPixel(south, worldPixels) - northPx + 1;
    return new DestinationChunk(
            new FreeFormPointSet(points),
            new WebMercatorExtents(westPx, northPx, widthPx, heightPx, ZOOM),
            sourceIndex);
}
/**
 * Assemble the R5 task for one origin and one destination chunk: 50th percentile only,
 * departure window 08:00:00-08:01:00, trips of at most 120 minutes, grid extents taken
 * from the chunk, and leg/transit modes chosen by {@code mode}.
 */
private static RegionalTask buildTask(
        DestinationChunk chunk, double originLat, double originLon, String mode, LocalDate date) {
    final int departure = 8 * 3600;
    final WebMercatorExtents grid = chunk.extents();
    RegionalTask task = new RegionalTask();

    // Origin and departure window
    task.fromLat = originLat;
    task.fromLon = originLon;
    task.date = date;
    task.fromTime = departure;
    task.toTime = departure + 60;
    task.maxTripDurationMinutes = 120;

    // What to record and for which destinations
    task.percentiles = new int[]{50};
    task.recordTimes = true;
    task.destinationPointSets = new PointSet[]{chunk.pointSet()};

    // Grid extents of this chunk
    task.zoom = grid.zoom;
    task.west = grid.west;
    task.north = grid.north;
    task.width = grid.width;
    task.height = grid.height;

    configureMode(task, mode);
    return task;
}
/**
 * Set the task's access, egress, direct and transit modes for the named travel mode.
 * "car", "bicycle" and "walking" route without transit; "transit" walks to and from
 * transit with at most 4 rides.
 *
 * @throws IllegalArgumentException if mode is none of the four known names
 */
private static void configureMode(RegionalTask task, String mode) {
    switch (mode) {
        case "car":
            setDirectMode(task, LegMode.CAR);
            break;
        case "bicycle":
            setDirectMode(task, LegMode.BICYCLE);
            break;
        case "walking":
            setDirectMode(task, LegMode.WALK);
            break;
        case "transit":
            task.maxRides = 4;
            task.accessModes = EnumSet.of(LegMode.WALK);
            task.egressModes = EnumSet.of(LegMode.WALK);
            task.directModes = EnumSet.of(LegMode.WALK);
            task.transitModes = EnumSet.of(TransitModes.TRANSIT);
            break;
        default:
            throw new IllegalArgumentException("Unknown mode: " + mode);
    }
}
/**
 * Configure a single-mode, non-transit search: the same leg mode is used for access,
 * egress and direct travel, and the set of transit modes is left empty.
 */
private static void setDirectMode(RegionalTask task, LegMode legMode) {
    // Each field gets its own EnumSet instance, as no set is shared between fields.
    task.transitModes = EnumSet.noneOf(TransitModes.class);
    task.directModes = EnumSet.of(legMode);
    task.accessModes = EnumSet.of(legMode);
    task.egressModes = EnumSet.of(legMode);
}
/**
 * Map a longitude in degrees to a pixel column, scaling [-180, 180] linearly
 * onto [0, totalPixels] and rounding down.
 */
private static int lonToPixel(double lon, int totalPixels) {
    double shifted = lon + 180.0;
    double column = totalPixels * shifted / 360.0;
    return (int) Math.floor(column);
}
/**
 * Map a latitude in degrees to a Web Mercator pixel row, rounding down.
 * Latitude 0 maps to totalPixels / 2 and rows increase towards the south.
 */
private static int latToPixel(double lat, int totalPixels) {
    double latRad = Math.toRadians(lat);
    // Mercator projection of the latitude: ln(tan(lat) + sec(lat)).
    double mercator = Math.log(Math.tan(latRad) + 1.0 / Math.cos(latRad));
    double row = totalPixels * (1.0 - mercator / Math.PI) / 2.0;
    return (int) Math.floor(row);
}
}

62
uv.lock generated
View file

@ -140,6 +140,15 @@ css = [
{ name = "tinycss2", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
]
[[package]]
name = "blinker"
version = "1.9.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
]
[[package]]
name = "branca"
version = "0.8.2"
@ -379,6 +388,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" },
]
[[package]]
name = "fake-useragent"
version = "2.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/41/43/948d10bf42735709edb5ae51e23297d034086f17fc7279fef385a7acb473/fake_useragent-2.2.0.tar.gz", hash = "sha256:4e6ab6571e40cc086d788523cf9e018f618d07f9050f822ff409a4dfe17c16b2", size = 158898, upload-time = "2025-04-14T15:32:19.238Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/51/37/b3ea9cd5558ff4cb51957caca2193981c6b0ff30bd0d2630ac62505d99d0/fake_useragent-2.2.0-py3-none-any.whl", hash = "sha256:67f35ca4d847b0d298187443aaf020413746e56acd985a611908c73dba2daa24", size = 161695, upload-time = "2025-04-14T15:32:17.732Z" },
]
[[package]]
name = "fastexcel"
version = "0.19.0"
@ -400,6 +418,23 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cb/a8/20d0723294217e47de6d9e2e40fd4a9d2f7c4b6ef974babd482a59743694/fastjsonschema-2.21.2-py3-none-any.whl", hash = "sha256:1c797122d0a86c5cace2e54bf4e819c36223b552017172f32c5c024a6b77e463", size = 24024, upload-time = "2025-08-14T18:49:34.776Z" },
]
[[package]]
name = "flask"
version = "3.1.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "blinker", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
{ name = "click", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
{ name = "itsdangerous", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
{ name = "jinja2", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
{ name = "markupsafe", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
{ name = "werkzeug", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/dc/6d/cfe3c0fcc5e477df242b98bfe186a4c34357b4847e87ecaef04507332dab/flask-3.1.2.tar.gz", hash = "sha256:bf656c15c80190ed628ad08cdfd3aaa35beb087855e2f494910aa3774cc4fd87", size = 720160, upload-time = "2025-08-19T21:03:21.205Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/f9/7f9263c5695f4bd0023734af91bedb2ff8209e8de6ead162f35d8dc762fd/flask-3.1.2-py3-none-any.whl", hash = "sha256:ca1d8112ec8a6158cc29ea4858963350011b5c846a414cdb7a954aa9e967d03c", size = 103308, upload-time = "2025-08-19T21:03:19.499Z" },
]
[[package]]
name = "folium"
version = "0.20.0"
@ -593,6 +628,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7b/55/e5326141505c5d5e34c5e0935d2908a74e4561eca44108fbfb9c13d2911a/isoduration-20.11.0-py3-none-any.whl", hash = "sha256:b2904c2a4228c3d44f409c8ae8e2370eb21a26f7ac2ec5446df141dde3452042", size = 11321, upload-time = "2020-11-01T10:59:58.02Z" },
]
[[package]]
name = "itsdangerous"
version = "2.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" },
]
[[package]]
name = "jedi"
version = "0.19.2"
@ -1367,7 +1411,9 @@ name = "property-map"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "fake-useragent", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
{ name = "fastexcel", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
{ name = "flask", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
{ name = "folium", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
{ name = "httpx", extra = ["socks"], marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
{ name = "ipywidgets", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
@ -1397,8 +1443,11 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "fake-useragent", specifier = ">=2.2.0" },
{ name = "fastexcel", specifier = ">=0.19.0" },
{ name = "flask" },
{ name = "folium", specifier = ">=0.20.0" },
{ name = "httpx" },
{ name = "httpx", extras = ["socks"], specifier = ">=0.28.1" },
{ name = "ipywidgets", specifier = ">=8.0.0" },
{ name = "jupyter", specifier = ">=1.0.0" },
@ -1407,6 +1456,7 @@ requires-dist = [
{ name = "osmium", specifier = ">=4.0.0" },
{ name = "pandas", specifier = ">=2.0.0" },
{ name = "plotly", specifier = ">=6.5.2" },
{ name = "polars" },
{ name = "polars", specifier = ">=1.37.1" },
{ name = "pyarrow", specifier = ">=15.0.0" },
{ name = "pyproj", specifier = ">=3.7.2" },
@ -2127,6 +2177,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/34/db/b10e48aa8fff7407e67470363eac595018441cf32d5e1001567a7aeba5d2/websocket_client-1.9.0-py3-none-any.whl", hash = "sha256:af248a825037ef591efbf6ed20cc5faa03d3b47b9e5a2230a529eeee1c1fc3ef", size = 82616, upload-time = "2025-10-07T21:16:34.951Z" },
]
[[package]]
name = "werkzeug"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "markupsafe", marker = "python_full_version < '3.14' and sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/5a/70/1469ef1d3542ae7c2c7b72bd5e3a4e6ee69d7978fa8a3af05a38eca5becf/werkzeug-3.1.5.tar.gz", hash = "sha256:6a548b0e88955dd07ccb25539d7d0cc97417ee9e179677d22c7041c8f078ce67", size = 864754, upload-time = "2026-01-08T17:49:23.247Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ad/e4/8d97cca767bcc1be76d16fb76951608305561c6e056811587f36cb1316a8/werkzeug-3.1.5-py3-none-any.whl", hash = "sha256:5111e36e91086ece91f93268bb39b4a35c1e6f1feac762c9c822ded0a4e322dc", size = 225025, upload-time = "2026-01-08T17:49:21.859Z" },
]
[[package]]
name = "widgetsnbextension"
version = "4.0.15"