This commit is contained in:
Ruby 2026-02-01 20:04:27 +00:00
commit 897dae77ac
104 changed files with 16454 additions and 4622 deletions

13
.dockerignore Normal file
View file

@ -0,0 +1,13 @@
data/
data_sources/
.venv
**/node_modules
**/dist
server-rs/target
.git
.task
.claude
__pycache__
*.parquet
analyses/
*.log

49
.github/workflows/docker.yml vendored Normal file
View file

@ -0,0 +1,49 @@
# Builds the repository's Docker image and pushes it to GitHub Container
# Registry (ghcr.io) on every push to the main branch.
name: Docker
on:
push:
branches: [main]
env:
# Registry host and image name shared by the login, metadata and push steps.
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
jobs:
build-and-push:
runs-on: ubuntu-latest
# Scope of the GITHUB_TOKEN used below: read the repo, write packages.
permissions:
contents: read
packages: write
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
# Computes the tag and label lists consumed by the build step via steps.meta.outputs.
- name: Extract metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
# Two tags per build: a moving `latest` and a short-commit-SHA tag prefixed `sha-`.
tags: |
type=raw,value=latest
type=sha,prefix=sha-,format=short
- name: Build and push
uses: docker/build-push-action@v6
with:
# Build context is the repository root (the Dockerfile copies frontend/ and server-rs/).
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
# Layer cache is stored in and restored from the GitHub Actions cache backend.
cache-from: type=gha
cache-to: type=gha,mode=max

3
.gitignore vendored
View file

@ -5,3 +5,6 @@ tfl_journey_client
**/node_modules
**/__pycache__
**/dist
server-rs/target
.task
data

31
.vscode/extensions.json vendored Normal file
View file

@ -0,0 +1,31 @@
{
  "recommendations": [
    "esbenp.prettier-vscode",
    "dbaeumer.vscode-eslint",
    "ms-toolsai.jupyter",
    "ms-python.python",
    "GitHub.copilot",
    "ms-azuretools.vscode-docker",
    "redhat.vscode-yaml",
    "1yib.rust-bundle",
    "alexcvzz.vscode-sqlite",
    "tomoki1207.pdf",
    "qwtel.sqlite-viewer",
    "rust-lang.rust-analyzer",
    "pkief.material-icon-theme",
    "detachhead.basedpyright",
    "editorconfig.editorconfig",
    "davidanson.vscode-markdownlint",
    "charliermarsh.ruff",
    "timonwong.shellcheck",
    "tonybaloney.vscode-pets",
    "vadimcn.vscode-lldb"
  ]
}

View file

@ -2,6 +2,9 @@
"files.exclude": {
"*.venv": true,
"**/__pycache__": true,
"**/node_modules": true
"**/node_modules": true,
"**/.ruff_cache":true,
"**/.pytest_cache":true,
"**/target":true
}
}

244
CLAUDE.md
View file

@ -2,68 +2,228 @@
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
NEVER EVER RUN GIT COMMANDS!!
## Project Overview
Property Map is a full-stack geospatial web application that visualizes UK property price data aggregated by H3 hexagonal spatial indices. It combines Land Registry price data with postcode geolocation to create an interactive map for exploring property markets.
Property Map is a full-stack geospatial application for visualizing UK property data on an interactive map. It combines Land Registry price-paid data, EPC energy certificates, postcode geolocation, TFL journey times, Index of Deprivation scores, crime statistics, ethnicity data, broadband speeds, school ratings, road noise, and OpenStreetMap POIs into a single wide parquet file, then serves aggregated H3 hexagon statistics and POI data via a Rust backend.
## Commands
All commands use [Task](https://taskfile.dev) runner. Install with: `curl -1sLf 'https://dl.cloudsmith.io/public/task/task/setup.deb.sh' | sudo -E bash`
All commands use [Task](https://taskfile.dev) runner. Python uses `uv run`. Frontend uses `npm run` from `frontend/`.
```bash
# Initial setup (downloads ~GB of data, runs pipeline)
task prepare
# Development servers
task dev:server # Rust backend on :8001 (cargo run, debug build; use `task dev:server:release` for --release)
task dev:frontend # Webpack dev server on :3030 (proxies /api to :8001)
# Development (run in separate terminals)
task server # FastAPI backend on :8001
task frontend # Webpack dev server on :3030 (proxies /api to :8001)
# Data pipeline
task prepare # Build wide.parquet from all pre-downloaded sources
# Code quality
task lint # Lint Python (ruff) + TypeScript (ESLint + Prettier)
task format # Auto-fix formatting
task typecheck # TypeScript type checking
task check # All checks (lint + typecheck + build)
# Quality
task lint # Lint all: Python (ruff) + TypeScript (ESLint+Prettier) + Rust (clippy+fmt)
task format # Auto-fix formatting for all languages
task test # Python tests (fuzzy join, haversine, POI counts)
task check # Full validation: lint + build + test
# Production
task build # Build frontend
task prod # Serve built frontend via FastAPI
# Building
task build:frontend # TypeScript typecheck + webpack production build
task build:server # cargo build --release (runs in server-rs/)
# Granular lint/format
task lint:python # uv run ruff check .
task lint:frontend # eslint + prettier --check
task lint:rust # cargo clippy -- -D warnings && cargo fmt --check
task format:python # ruff check --fix && ruff format
task format:frontend # eslint --fix + prettier --write
task format:rust # cargo fmt --all
```
Running individual tests:
```bash
uv run pytest pipeline/utils/test_haversine.py # Single test file
uv run pytest pipeline/utils/test_haversine.py -k "test_name" # Single test
```
## Architecture
### Data Flow
```
frontend/ React + TypeScript SPA (deck.gl/MapLibre for visualization)
src/App.tsx Main component with filters and map state
src/components/ Map.tsx (deck.gl H3HexagonLayer), Filters UI
server/ FastAPI backend
main.py App setup, CORS, static file mounting
routes/hexagons.py GET /api/hexagons - returns aggregated price data
pipeline/ Data processing (Polars + H3)
config.py Central config (H3 resolutions 6-11, year/price ranges)
sources/ Postcode loading, property price joins
processors/ H3 aggregation (count, avg/median/min/max by cell+year)
tfl_journey_client/ Generated TFL API client (local package)
Raw sources → [Download scripts] → data/*.parquet
→ [Fuzzy join EPC ↔ Price-Paid] → epc_pp.parquet
→ [Merge all datasets] → wide.parquet
→ [Rust server loads into memory + precomputes H3 + spatial grid]
→ [Frontend renders deck.gl H3HexagonLayer over MapLibre GL]
```
## Data Flow
### Data Pipeline (`pipeline/`)
1. **Download**: Land Registry prices + ArcGIS postcode→lat/lon mappings → `data_sources/`
2. **Pipeline**: Join data, compute H3 indices, aggregate stats → `data_sources/processed/aggregates/*.parquet`
3. **Serve**: Load parquet files into memory, filter by bounds/year/price, return as GeoJSON-like response
4. **Visualize**: Frontend fetches on viewport change, renders hexagons colored by average price
Python + Polars. Two phases:
## Tech Stack
1. **Download** (`pipeline/download/`) — Each script fetches one raw dataset into `data/`
2. **Transform** (`pipeline/transform/`) — Joins and derives features:
- `join_epc_pp.py` — Fuzzy-joins EPC ↔ price-paid by address within postcode buckets
- `merge.py` — **Main pipeline**: joins all datasets → `wide.parquet` with human-readable column names
- `transform_poi.py` — Filters POIs, maps to friendly names + emoji (exhaustive category validation)
- `poi_proximity.py` — Counts POIs within 2km per postcode using 0.05° spatial grid
- `crime.py` — Aggregates crime CSVs into yearly averages by LSOA
- **Frontend**: React 18, TypeScript, Webpack, TailwindCSS, deck.gl, MapLibre GL
- **Backend**: Python 3.12, FastAPI, Polars, H3
- **Package managers**: `uv` (Python), `npm` (frontend)
**Critical: column renaming in `merge.py`** — The pipeline renames columns from snake_case to human-readable names before writing `wide.parquet`. The Rust server auto-discovers features from whatever column names exist in the parquet. Key renames:
- `pp_address` → `Address per Property Register`
- `postcode` → `Postcode`
- `latest_price` → `Last known price`
- `duration` → `Leashold/Freehold`
- `total_floor_area` → `Total floor area (sqm)`
- `current_energy_rating` → `Current energy rating`
The server and frontend must handle these human-readable names. See the full rename map in `merge.py`.
### Backend (`server-rs/`)
Rust + Axum. Loads parquet into memory at startup.
**Structure:**
- `data/property.rs` — Loads `wide.parquet`, auto-discovers numeric + enum features, computes histograms, sorts rows by spatial locality, precomputes H3 cells (resolutions 4–12)
- `data/poi.rs` — Loads `filtered_uk_pois.parquet`
- `index.rs` — `GridIndex`: 0.01° spatial grid for O(1) cell lookup
- `filter.rs` — Parses filter strings and checks rows. Format: `name:min:max` (numeric), `name:val1|val2` (enum)
- `routes/` — One file per endpoint
- `consts.rs` — Key constants (histogram bins, H3 range, max enum cardinality, excluded columns)
**API endpoints:**
- `GET /api/features` — Feature metadata with histograms and 2nd/98th percentiles
- `GET /api/hexagons?resolution=&bounds=&filters=` — H3 aggregates (min/max per feature per hex)
- `GET /api/hexagon-properties?h3=&resolution=&filters=&limit=&offset=` — Paginated properties within a hexagon
- `GET /api/pois?bounds=&categories=` — POIs by bounds (max 5000)
- `GET /api/poi-categories` — Available POI category names
Serves `frontend/dist/` as static fallback in production.
**Data representation:**
- Numeric features: row-major flat `Vec<f64>`, NaN = null
- Enum features: `Vec<u8>` indices into value list, 255 = null
- String fields (address, postcode): `Vec<String>`, empty = null
- The server accepts the parquet path as a CLI argument (defaults to `data_sources/processed/wide.parquet`)
### Frontend (`frontend/`)
React 18 + TypeScript. deck.gl `H3HexagonLayer` over MapLibre GL. TailwindCSS. No state management library — pure React hooks.
**Key patterns:**
- `App.tsx` manages all state, API fetching (150ms debounce), and URL state sync (300ms debounce)
- URL encodes view/filters/POI categories/active tab as query params for shareable links
- AbortControllers cancel in-flight requests on new queries
- Zoom → H3 resolution: `<7→7, <9.5→8, <11→9, <13→10, ≥13→11`
- Bounds quantized to 0.01° to match backend caching
- Properties pane uses feature names from API response (human-readable), not hardcoded field names
- Proxy: dev server on :3030 proxies `/api` to :8001; also handles VS Code `/proxy/PORT` patterns
## Frontend Design Guide (STRICT — must be followed for all UI changes)
The frontend uses Tailwind's `darkMode: 'class'` strategy. The `dark` class is toggled on `<html>`. Every visible element must have both light and dark styles. **Never add a light-only color class without its `dark:` counterpart.** Run `task build:frontend` after any UI change to verify.
### Theme System
- **State**: `App.tsx` owns a `theme` state (`'light' | 'dark' | 'system'`), persisted in `localStorage` under the key `theme`, default `'system'`.
- **Effective theme**: When `'system'`, resolved via `window.matchMedia('(prefers-color-scheme: dark)')`. A `change` listener re-renders on OS preference flip.
- **Toggle cycle**: light → dark → system → light. Three-way, not binary.
- **Flash prevention**: `index.html` contains an inline `<script>` that applies the `dark` class before first paint. If the localStorage/matchMedia logic in that script changes, update it to match `App.tsx`.
- **Prop plumbing**: `effectiveTheme` (`'light' | 'dark'`) is passed as a prop to `<Map>` and `<HomePage>`. Components that need the resolved theme must receive it as a prop — do not read localStorage or matchMedia inside child components.
### Color Token Reference
Every UI element must use the correct token from this table. Do not invent new pairings.
| Role | Light class | Dark class | Hex (dark) |
|------|------------|------------|------------|
| **Page / pane background** | `bg-warm-50` or `bg-white` | `dark:bg-warm-900` | #1c1917 |
| **Card / elevated surface** | `bg-white` | `dark:bg-warm-800` | #292524 |
| **Inset / recessed surface** | `bg-warm-100` or `bg-warm-50` | `dark:bg-warm-800` | #292524 |
| **Input / select background** | `bg-white` | `dark:bg-warm-800` or `dark:bg-warm-900` | |
| **Primary border** | `border-warm-200` | `dark:border-warm-700` | #44403c |
| **Subtle border (dividers)** | `border-warm-100` | `dark:border-warm-800` | #292524 |
| **Primary text (headings)** | `text-navy-950` or implicit dark | `dark:text-warm-100` | #f5f5f4 |
| **Body text** | `text-warm-700` | `dark:text-warm-300` | #d6d3d1 |
| **Secondary text (labels, hints)** | `text-warm-500` or `text-warm-600` | `dark:text-warm-400` | #a8a29e |
| **Disabled / placeholder text** | `text-warm-400` / `placeholder-warm-400` | `dark:text-warm-500` / `dark:placeholder-warm-500` | #78716c |
| **Accent text (links, actions)** | `text-teal-600` | `dark:text-teal-400` | #1de4c3 |
| **Accent hover text** | `hover:text-teal-800` | `dark:hover:text-teal-300` | #51f7d9 |
| **Accent background (highlights)** | `bg-teal-50` | `dark:bg-teal-900/30` | |
| **Active ring / focus ring** | `ring-teal-400` | same — works in both | |
| **Price / key metric text** | `text-teal-700` | `dark:text-teal-400` | |
| **Remove / close button** | `text-warm-400 hover:text-warm-700` | `dark:hover:text-warm-300` | |
| **Checkbox accent** | `accent-teal-600` | same — works in both | |
| **Header (unchanged both modes)** | `bg-navy-900 text-white` | same | |
### Mapping Rules for Specific Contexts
**Sidebars (Filters, POIPane, PropertiesPane, right-pane tabs):**
- Container: `bg-white dark:bg-warm-900`
- Inner cards / dropdown menus: `bg-white dark:bg-warm-800`
- Borders: `border-warm-200 dark:border-warm-700`
- Tab text (active): add `dark:text-warm-100`
- Tab text (inactive): `text-warm-600 dark:text-warm-400`
**Map overlays (PostcodeSearch, MapLegend, POI popup, loading indicator):**
- Background: `bg-white dark:bg-warm-800`
- Text: `dark:text-warm-200`
- Semi-transparent variants: use `/90` opacity suffix (e.g. `dark:bg-warm-800/90`)
- Deck.gl tooltip (inline styles, not Tailwind): use `#292524` bg / `#e7e5e4` text / `rgba(0,0,0,0.5)` shadow in dark.
- Deck.gl postcode labels (RGB arrays): `[220,220,220,220]` text / `[30,30,30,200]` outline in dark; inverse in light.
**Map basemaps:**
- Light: `https://basemaps.cartocdn.com/gl/voyager-gl-style/style.json`
- Dark: `https://basemaps.cartocdn.com/gl/dark-matter-gl-style/style.json`
- `handleMapLoad` must only apply label/water tweaks in light mode. Dark Matter has good defaults.
**HomePage (landing page):**
- Page bg: `bg-warm-50 dark:bg-warm-900`
- Cards: `bg-white dark:bg-warm-800` with `border-warm-200 dark:border-warm-700`
- Backdrop-blur panels: use `/60` or `/40` opacity on both `bg-warm-50` and `dark:bg-warm-900`
- HexCanvas: reads `isDark` ref; uses dimmer fill (`#058172`) and stroke (`#0a665b`) at 60% opacity multiplier.
- All headings: `dark:text-warm-100`. All body: `dark:text-warm-300` or `dark:text-warm-400`.
**DataSourcesPage:**
- Same card pattern as above. Footer is already dark (`bg-navy-900`) — no changes needed.
- License badges: `bg-warm-100 dark:bg-warm-700 text-warm-600 dark:text-warm-300`
- Links: `text-teal-600 dark:text-teal-400`
**DataSources floating button (on map):**
- `bg-white/90 dark:bg-warm-800/90` with `text-teal-600 dark:text-teal-400`
### Rules for New Components
1. **Every `bg-white` needs `dark:bg-warm-800` or `dark:bg-warm-900`.** Pane-level = warm-900, card-level = warm-800.
2. **Every `border-warm-200` needs `dark:border-warm-700`.**
3. **Every `text-warm-*` needs a `dark:text-warm-*` counterpart.** Follow the token table — don't guess.
4. **Every `text-teal-600` needs `dark:text-teal-400`.** Every `hover:text-teal-800` needs `dark:hover:text-teal-300`.
5. **Every `bg-teal-50` needs `dark:bg-teal-900/30`.**
6. **Every `hover:bg-warm-50` needs `dark:hover:bg-warm-700` or `dark:hover:bg-warm-800`.**
7. **Inputs and selects**: always add `dark:bg-warm-800 dark:text-warm-200 dark:border-warm-700`. Placeholders get `dark:placeholder-warm-500`.
8. **Checkboxes**: always include `accent-teal-600 rounded`.
9. **Do not use Tailwind `dark:` classes inside deck.gl layers or canvas code.** Use the `theme` prop / ref and conditional JS values.
10. **Do not add `transition-*` classes for theme switching.** The global CSS rule in `index.css` handles transitions for `background-color`, `border-color`, and `color` on all standard HTML elements. Adding per-element transition classes will conflict.
11. **Never hardcode hex colors in JSX `style=` props for themed elements** (except deck.gl tooltip and canvas, which can't use Tailwind). Use the Tailwind classes from the token table instead.
12. **The header (`bg-navy-900`) is identical in both themes.** Do not add dark variants to it.
### Verification Checklist (for any UI PR)
- [ ] `task build:frontend` passes with no errors
- [ ] Every new `bg-*`, `text-*`, `border-*` class has a `dark:` counterpart (search your diff)
- [ ] Toggle through all three modes (light → dark → system) with no flash
- [ ] Map basemap switches when theme changes
- [ ] Sidebars, dropdowns, and popups are readable in both modes
- [ ] HomePage and DataSourcesPage adapt correctly
## Key Implementation Details
- Backend caches dataframes in memory and uses LRU cache on queries
- Bounds rounded to 0.01° precision to improve cache hits
- Results capped at 50,000 hexagons per request (truncated flag in response)
- Frontend debounces API calls on map movement
- **Spatial sort**: Rows sorted by 0.01° grid cell at load time for cache-friendly sequential access
- **Row-major layout**: `feature_data[row * num_features + feat_idx]` — all features for one property are contiguous
- **H3 precomputation**: Resolutions 4–12 computed in parallel (rayon) at startup
- **Histogram percentiles without sorting**: O(n) two-pass algorithm — build histogram, interpolate percentiles
- **Direct JSON writing**: Hexagon endpoint writes JSON via string buffer, avoids serde_json::Value allocations
- **POI transform validation**: Fails if any OSM category is unmapped — guarantees exhaustive coverage
- **Fuzzy join**: Groups by postcode, uses `thefuzz.token_sort_ratio` with numeric token compatibility, greedy assignment from highest score
- **Filter bounds format**: `south,west,north,east` (not standard bbox order)
- **POI proximity**: Uses 0.05° grid (~5km cells) to reduce candidates before haversine distance check

25
Dockerfile Normal file
View file

@ -0,0 +1,25 @@
# Multi-stage build: the frontend bundle and the Rust server binary are built
# in separate stages and then copied into a slim Debian runtime image.

# Stage 1: Build frontend
FROM node:20-slim AS frontend
WORKDIR /app/frontend
# Manifests are copied and installed before the sources so the `npm ci` layer
# can be reused when only source files change.
COPY frontend/package.json frontend/package-lock.json ./
RUN npm ci
COPY frontend/ ./
RUN npm run build
# Stage 2: Build Rust server
FROM rust:1.83-bookworm AS server
WORKDIR /app
COPY server-rs/ server-rs/
WORKDIR /app/server-rs
RUN cargo build --release
# Stage 3: Runtime
FROM debian:bookworm-slim
# ca-certificates is the only extra package installed; the apt lists are removed to keep the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Release binary from stage 2 and built frontend from stage 1, side by side in /app.
COPY --from=server /app/server-rs/target/release/property-map-server ./
COPY --from=frontend /app/frontend/dist ./dist/
EXPOSE 8001
ENTRYPOINT ["./property-map-server"]
# Default arguments point at /data; nothing in this Dockerfile copies parquet
# files into the image, so /data presumably has to be mounted at run time — confirm.
CMD ["--data", "/data/wide.parquet", "--pois", "/data/filtered_uk_pois.parquet"]

File diff suppressed because it is too large Load diff

View file

@ -69,3 +69,11 @@ Nice to haves?
- [Local Authority (Upper Tier)](https://communitiesopendata-communities.hub.arcgis.com/datasets/6e8edb2974da4834bbafa09644a5b02d_0/explore?location=52.684195%2C-2.489482%2C7.17)
- [Open Geography](https://geoportal.statistics.gov.uk/)
- [CommunitiesOpenData](https://communitiesopendata-communities.hub.arcgis.com/)
- [PlanetOSM](https://planet.openstreetmap.org/) for open street map POI
- [TFL api](https://api-portal.tfl.gov.uk/signin)
- [EPC](https://epc.opendatacommunities.org/login) - <https://epc.opendatacommunities.org/downloads/domestic>
rightmove:
curl '<https://www.rightmove.co.uk/api/property-search/listing/search?searchLocation=E14&useLocationIdentifier=true&locationIdentifier=OUTCODE%5E749&buy=For+sale&radius=20.0&_includeSSTC=on&index=0&sortType=2&channel=BUY&transactionType=BUY>'
curl '<https://www.onthemarket.com/async/search/properties-v2/?search-type=for-sale&location-id=e13&view=map-list>'

191
Taskfile.data.yml Normal file
View file

@ -0,0 +1,191 @@
# Data-pipeline tasks: downloads, transforms, and the final merge into wide.parquet.
version: '3'
vars:
# Root directory for downloaded and derived datasets; every path below except
# JOURNEY_TIMES is built from it.
DATA_DIR: /bulk/property-data
ARCGIS_OUTPUT: "{{.DATA_DIR}}/arcgis_data.parquet"
PRICE_PAID_OUTPUT: "{{.DATA_DIR}}/price-paid-complete.parquet"
IOD_OUTPUT: "{{.DATA_DIR}}/IoD2025_Scores.parquet"
POIS_RAW_OUTPUT: "{{.DATA_DIR}}/uk_pois.parquet"
POIS_FILTERED_OUTPUT: "{{.DATA_DIR}}/filtered_uk_pois.parquet"
POI_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/poi_proximity.parquet"
EPC_PP_OUTPUT: "{{.DATA_DIR}}/epc_pp.parquet"
WIDE_OUTPUT: "{{.DATA_DIR}}/wide.parquet"
# Input file checked by prompt:epc; no task in this file downloads it.
EPC: "{{.DATA_DIR}}/certificates.csv"
# Relative to the repository rather than DATA_DIR.
JOURNEY_TIMES: "./data_sources/processed/journey_times_bank_checkpoint.parquet"
ETHNICITY_OUTPUT: "{{.DATA_DIR}}/ethnicity_by_la.parquet"
# Input directory for transform:crime.
CRIME_DIR: "{{.DATA_DIR}}/crime"
CRIME_OUTPUT: "{{.DATA_DIR}}/crime_by_lsoa.parquet"
NOISE_OUTPUT: "{{.DATA_DIR}}/road_noise.parquet"
OFSTED_OUTPUT: "{{.DATA_DIR}}/ofsted.parquet"
NAPTAN_OUTPUT: "{{.DATA_DIR}}/naptan.parquet"
BROADBAND_OUTPUT: "{{.DATA_DIR}}/broadband.parquet"
SCHOOL_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/school_proximity.parquet"
# Every task's `status` is a `test -f` on its output (or required input) file;
# Task treats a task as up to date and skips its cmds when all status commands succeed.
tasks:
# Prints instructions and exits 1 when the EPC certificates file is absent.
prompt:epc:
desc: Prompt user to download EPC dataset (requires registration)
status:
- test -f {{.EPC}}
cmds:
- |
echo ""
echo "=== EPC dataset not found ==="
echo "The EPC certificates file is required: {{.EPC}}"
echo ""
echo "To obtain it, register at https://epc.opendatacommunities.org/login"
echo ""
exit 1
# NOTE(review): despite the desc, the cmds only print instructions and exit 1;
# `download:journey-times` further down is the task that runs the fetch.
prompt:journey-times:
desc: Download TFL journey times if missing (requires API key registration)
status:
- test -f {{.JOURNEY_TIMES}}
deps:
- download:arcgis
cmds:
- |
echo ""
echo "=== TFL journey times not found ==="
echo "Register for a TFL API key at https://api-portal.tfl.gov.uk/signin"
echo "Then set the TFL_API_KEY environment variable and re-run this task."
echo ""
exit 1
# Download tasks: each runs one pipeline.download module.
download:arcgis:
desc: Download and convert ArcGIS postcode data
status:
- test -f {{.ARCGIS_OUTPUT}}
cmds:
- uv run python -m pipeline.download.arcgis --output {{.ARCGIS_OUTPUT}}
download:price-paid:
desc: Download and convert Land Registry price-paid data
status:
- test -f {{.PRICE_PAID_OUTPUT}}
cmds:
- uv run python -m pipeline.download.price_paid --output {{.PRICE_PAID_OUTPUT}}
download:deprivation:
desc: Download and convert Index of Deprivation data
status:
- test -f {{.IOD_OUTPUT}}
cmds:
- uv run python -m pipeline.download.deprivation_data --output {{.IOD_OUTPUT}}
download:ethnicity:
desc: Download ethnicity by local authority data
status:
- test -f {{.ETHNICITY_OUTPUT}}
cmds:
- uv run python -m pipeline.download.ethnicity --output {{.ETHNICITY_OUTPUT}}
download:naptan:
desc: Download NaPTAN station data
status:
- test -f {{.NAPTAN_OUTPUT}}
cmds:
- uv run python -m pipeline.download.naptan --output {{.NAPTAN_OUTPUT}}
download:pois:
desc: Download and extract POIs from OpenStreetMap
status:
- test -f {{.POIS_RAW_OUTPUT}}
cmds:
- uv run python -m pipeline.download.pois --output {{.POIS_RAW_OUTPUT}}
download:ofsted:
desc: Download Ofsted school inspection outcomes
status:
- test -f {{.OFSTED_OUTPUT}}
cmds:
- uv run python -m pipeline.download.ofsted --output {{.OFSTED_OUTPUT}}
download:broadband:
desc: Download Ofcom broadband performance data
status:
- test -f {{.BROADBAND_OUTPUT}}
cmds:
- uv run python -m pipeline.download.broadband --output {{.BROADBAND_OUTPUT}}
# Takes the ArcGIS postcode file as input (--arcgis), hence the dependency.
download:noise:
desc: Download Defra noise data (road, rail, airport) sampled at postcode centroids
deps:
- download:arcgis
status:
- test -f {{.NOISE_OUTPUT}}
cmds:
- uv run python -m pipeline.download.noise --arcgis {{.ARCGIS_OUTPUT}} --output {{.NOISE_OUTPUT}}
# Transform tasks: each runs one pipeline.transform module on previously produced files.
transform:pois:
desc: Transform raw POIs to filtered version with friendly names
deps:
- download:pois
- download:naptan
status:
- test -f {{.POIS_FILTERED_OUTPUT}}
cmds:
- uv run python -m pipeline.transform.transform_poi --input {{.POIS_RAW_OUTPUT}} --naptan {{.NAPTAN_OUTPUT}} --output {{.POIS_FILTERED_OUTPUT}}
transform:epc-pp:
desc: Fuzzy join EPC and Price Paid data
deps:
- download:price-paid
- prompt:epc
status:
- test -f {{.EPC_PP_OUTPUT}}
cmds:
- uv run python -m pipeline.transform.join_epc_pp --epc {{.EPC}} --price-paid {{.PRICE_PAID_OUTPUT}} --output {{.EPC_PP_OUTPUT}}
# NOTE(review): no deps — no task in this file populates CRIME_DIR, so the
# crime CSVs presumably have to be placed there beforehand; confirm.
transform:crime:
desc: Transform crime CSVs into yearly averages by LSOA
status:
- test -f {{.CRIME_OUTPUT}}
cmds:
- uv run python -m pipeline.transform.crime --input {{.CRIME_DIR}} --output {{.CRIME_OUTPUT}}
transform:poi-proximity:
desc: Compute POI proximity counts per postcode
deps:
- download:arcgis
- transform:pois
status:
- test -f {{.POI_PROXIMITY_OUTPUT}}
cmds:
- uv run python -m pipeline.transform.poi_proximity --arcgis {{.ARCGIS_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --output {{.POI_PROXIMITY_OUTPUT}}
transform:school-proximity:
desc: Compute good+ school proximity counts per postcode
deps:
- download:ofsted
- download:arcgis
status:
- test -f {{.SCHOOL_PROXIMITY_OUTPUT}}
cmds:
- uv run python -m pipeline.transform.school_proximity --ofsted {{.OFSTED_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --output {{.SCHOOL_PROXIMITY_OUTPUT}}
# Uses the same status file as prompt:journey-times. `prepare` depends on the
# prompt task, not on this one.
download:journey-times:
desc: Fetch TfL journey times for all postcodes
deps:
- download:arcgis
status:
- test -f {{.JOURNEY_TIMES}}
cmds:
- uv run python -m pipeline.journey_times
# Final merge: hands every intermediate output to pipeline.transform.merge,
# which writes WIDE_OUTPUT.
prepare:
desc: Build wide property dataframe with all joins
deps:
- transform:epc-pp
- download:arcgis
- download:deprivation
- download:ethnicity
- download:broadband
- download:noise
- transform:crime
- transform:poi-proximity
- transform:school-proximity
- prompt:journey-times
status:
- test -f {{.WIDE_OUTPUT}}
cmds:
- uv run python -m pipeline.transform.merge --epc-pp {{.EPC_PP_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --iod {{.IOD_OUTPUT}} --poi-proximity {{.POI_PROXIMITY_OUTPUT}} --journey-times {{.JOURNEY_TIMES}} --ethnicity {{.ETHNICITY_OUTPUT}} --crime {{.CRIME_OUTPUT}} --noise {{.NOISE_OUTPUT}} --school-proximity {{.SCHOOL_PROXIMITY_OUTPUT}} --broadband {{.BROADBAND_OUTPUT}} --output {{.WIDE_OUTPUT}}

View file

@ -1,66 +1,79 @@
version: '3'
includes:
data:
taskfile: ./Taskfile.data.yml
flatten: true
vars:
DATA_DIR: /bulk/property-data
WIDE_OUTPUT: "{{.DATA_DIR}}/wide.parquet"
POIS_FILTERED_OUTPUT: "{{.DATA_DIR}}/filtered_uk_pois.parquet"
tasks:
install:
desc: Install dependencies, generate client, and download data
desc: Install dependencies
cmds:
- uv run generate_tfl_client.py
- uv sync
- cd frontend && npm install
download:
desc: Download data
test:
cmds:
- uv run -m pipeline.utils.test_fuzzy_join
- uv run pytest pipeline/utils/test_haversine.py
- uv run pytest pipeline/utils/test_poi_counts.py
test:server:
desc: Run Rust backend tests
dir: server-rs
cmds:
- cargo test
dev:server:
desc: Run Rust backend on port 8001 (debug build, fast compile)
dir: server-rs
cmds:
- cargo run -- --data {{.WIDE_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}}
dev:server:release:
desc: Run Rust backend on port 8001 (release build)
dir: server-rs
cmds:
- cargo run --release -- --data {{.WIDE_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}}
dev:frontend:
desc: Run frontend dev server on port 3030 (proxies /api to :8001)
dir: frontend
deps:
- install
cmds:
- uv run python download_land_registry.py
- uv run python download_arcgis_data.py
- uv run python download_pois.py
pipeline:
desc: Run data processing pipeline
deps:
- download
cmds:
- uv run python -m pipeline.run
prepare:
desc: Prepare the application (install, download data, run pipeline)
deps:
- pipeline
server:
desc: Run FastAPI backend on port 8001
cmds:
- uv run fastapi dev server/main.py --port 8001
frontend:
desc: Run frontend dev server on port 3030 (proxies /api to :8001)
dir: frontend
cmds:
- npm run dev
build:
build:server:
desc: Build server for production
dir: server-rs
cmds:
- cargo build --release
build:frontend:
desc: Build frontend for production
dir: frontend
cmds:
- npm run typecheck
- npm run build
prod:
desc: Run production server (serves built frontend)
cmds:
- uv run fastapi run server/main.py --port 8001
lint:
desc: Lint all code (Python and TypeScript)
desc: Lint all code (Python, TypeScript, and Rust)
cmds:
- task: lint:python
- task: lint:frontend
- task: lint:rust
lint:python:
desc: Lint Python code with ruff
desc: Lint Python code with ruff and check for unused dependencies
cmds:
- uv run ruff check .
- uv run deptry .
lint:frontend:
desc: Lint frontend TypeScript code
@ -69,11 +82,20 @@ tasks:
- npm run lint
- npm run format:check
lint:rust:
desc: Lint Rust code with clippy, check formatting, and detect unused dependencies
dir: server-rs
cmds:
- cargo clippy -- -D warnings
- cargo fmt --check
- cargo machete
format:
desc: Format all code (Python and TypeScript)
desc: Format all code (Python, TypeScript, and Rust)
cmds:
- task: format:python
- task: format:frontend
- task: format:rust
format:python:
desc: Format Python code with ruff
@ -88,15 +110,17 @@ tasks:
- npm run lint:fix
- npm run format
format:rust:
desc: Format Rust code with cargo fmt
dir: server-rs
cmds:
- cargo fmt --all
check:
desc: Run all checks (lint, typecheck, build)
cmds:
- task: lint
- task: typecheck
- task: build
typecheck:
desc: Type check frontend TypeScript code
dir: frontend
cmds:
- npm run typecheck
- task: build:server
- task: build:frontend
- task: test
- task: test:server

723
analyses/epc_analysis.ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

113
analyses/wide.ipynb Normal file

File diff suppressed because one or more lines are too long

View file

@ -1,129 +0,0 @@
#!/usr/bin/env python3
"""Download ArcGIS data and convert to Parquet."""
# Run it with:
# uv run download_arcgis_data.py
import time
import zipfile
import httpx
import polars as pl
from pathlib import Path
from tqdm import tqdm
# ArcGIS item download endpoint fetched by main().
URL = "https://www.arcgis.com/sharing/rest/content/items/077631e063eb4e1ab43575d01381ec33/data"
# All artefacts live under ./data_sources (relative to the working directory);
# the directory is created as a side effect of importing this module.
BASE_DATA_PATH = Path("./data_sources")
BASE_DATA_PATH.mkdir(exist_ok=True)
# Raw download; main() skips the download when this file already exists.
DOWNLOAD_PATH = BASE_DATA_PATH / "arcgis_data.zip"
# Directory the archive is unpacked into.
EXTRACT_PATH = BASE_DATA_PATH / "arcgis_extracted"
# Final output; main() exits early when this file already exists.
PARQUET_PATH = BASE_DATA_PATH / "arcgis_data.parquet"
# Total number of download attempts before the error is re-raised.
MAX_RETRIES = 3
def download_with_progress(url: str, output_path: Path) -> None:
    """Download ``url`` to ``output_path`` with a progress bar and retry logic.

    The body is streamed into a sibling ``<name>.part`` file and only renamed
    to ``output_path`` once the whole response has been written. ``main()``
    treats an existing ``output_path`` as a completed download, so a transfer
    that dies half-way must never leave a truncated file under that name.

    Args:
        url: Address to fetch; redirects are followed.
        output_path: Final location of the downloaded file.

    Raises:
        httpx.ConnectError, httpx.ReadTimeout: After ``MAX_RETRIES`` failed
            attempts.
        httpx.HTTPStatusError: Immediately, on a non-success status code
            (not retried).
    """
    partial_path = output_path.with_name(output_path.name + ".part")
    try:
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                with httpx.stream(
                    "GET",
                    url,
                    follow_redirects=True,
                    timeout=httpx.Timeout(30.0, read=None),
                ) as response:
                    response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
                    total = int(response.headers.get("content-length", 0))
                    with (
                        # "wb" truncates, so a retry starts from a clean file.
                        open(partial_path, "wb") as f,
                        tqdm(
                            total=total,
                            unit="B",
                            unit_scale=True,
                            unit_divisor=1024,
                            desc="Downloading",
                        ) as pbar,
                    ):
                        for chunk in response.iter_bytes(chunk_size=8192):
                            f.write(chunk)
                            pbar.update(len(chunk))
                # Publish the file only after the stream finished cleanly.
                partial_path.replace(output_path)
                return  # Success
            except (httpx.ConnectError, httpx.ReadTimeout) as e:
                if attempt < MAX_RETRIES:
                    # Exponential back-off: 2s, 4s, ...
                    wait = 2**attempt
                    print(f"Attempt {attempt} failed: {e}. Retrying in {wait}s...")
                    time.sleep(wait)
                else:
                    raise
    finally:
        # No-op after a successful rename; removes the leftover on any failure.
        partial_path.unlink(missing_ok=True)
def extract_zip(zip_path: Path, extract_path: Path) -> list[Path]:
    """Extract a ZIP archive and return the paths of its members.

    Args:
        zip_path: Archive to unpack.
        extract_path: Destination directory; created (including any missing
            parent directories) if it does not exist yet.

    Returns:
        One path under ``extract_path`` per name reported by the archive, in
        archive order.
    """
    print("Extracting ZIP file...")
    # parents=True: do not rely on the caller having created the parent first.
    extract_path.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zf:
        zf.extractall(extract_path)
        return [extract_path / name for name in zf.namelist()]
def find_data_file(extract_path: Path) -> Path:
    """Find the main data file (CSV, Excel, JSON or GeoJSON) under a directory.

    Extensions are tried in priority order (.csv, .xlsx, .xls, .json,
    .geojson). For the first extension with at least one match, the largest
    file is returned. Matching is case-insensitive so that it agrees with
    ``convert_to_parquet``, which lower-cases the suffix before choosing a
    reader. Directories are ignored even if their name ends in a data
    extension.

    Args:
        extract_path: Directory to search recursively.

    Returns:
        Path of the selected data file.

    Raises:
        FileNotFoundError: If no file with a supported extension exists.
    """
    priority = (".csv", ".xlsx", ".xls", ".json", ".geojson")
    # Single directory walk; bucket regular files by lower-cased suffix.
    by_suffix: dict[str, list[Path]] = {}
    for candidate in extract_path.rglob("*"):
        suffix = candidate.suffix.lower()
        if suffix in priority and candidate.is_file():
            by_suffix.setdefault(suffix, []).append(candidate)
    for suffix in priority:
        files = by_suffix.get(suffix)
        if files:
            # Return the largest file if multiple found
            return max(files, key=lambda f: f.stat().st_size)
    raise FileNotFoundError(f"No data file found in {extract_path}")
def convert_to_parquet(data_path: Path, parquet_path: Path) -> None:
"""Convert data file to Parquet using Polars."""
print(f"Converting {data_path.name} to Parquet...")
suffix = data_path.suffix.lower()
if suffix == ".csv":
df = pl.read_csv(data_path, try_parse_dates=True)
elif suffix in [".xlsx", ".xls"]:
df = pl.read_excel(data_path)
elif suffix in [".json", ".geojson"]:
df = pl.read_json(data_path)
else:
raise ValueError(f"Unsupported file format: {suffix}")
df.write_parquet(parquet_path, compression="zstd")
print(f"Saved to {parquet_path}")
print(f"Rows: {df.height:,}")
print(f"Columns: {df.columns}")
print(f"Original size: {data_path.stat().st_size / 1024**2:.1f} MB")
print(f"Parquet size: {parquet_path.stat().st_size / 1024**2:.1f} MB")
def main() -> None:
    """Download the ArcGIS dataset (if needed) and convert it to Parquet.

    Nothing is done when the Parquet output already exists, and an existing
    download is reused instead of being fetched again.
    """
    if PARQUET_PATH.exists():
        print(f"Parquet already exists at {PARQUET_PATH}, skipping")
        return

    if DOWNLOAD_PATH.exists():
        print(f"File already exists at {DOWNLOAD_PATH}, skipping download")
    else:
        download_with_progress(URL, DOWNLOAD_PATH)

    # A download that is not a ZIP archive is handed to the converter as-is;
    # otherwise the archive is unpacked and its main data file is used.
    data_file = DOWNLOAD_PATH
    if zipfile.is_zipfile(DOWNLOAD_PATH):
        extracted_files = extract_zip(DOWNLOAD_PATH, EXTRACT_PATH)
        print(f"Extracted {len(extracted_files)} files")
        data_file = find_data_file(EXTRACT_PATH)

    convert_to_parquet(data_file, PARQUET_PATH)


if __name__ == "__main__":
    main()

View file

@ -1,61 +0,0 @@
#!/usr/bin/env python3
"""Download IoD2025 Deprivation Scores and convert to Parquet."""
import httpx
import polars as pl
from pathlib import Path
URL = "https://assets.publishing.service.gov.uk/media/691ded34513046b952c500bd/File_5_IoD2025_Scores_for_the_Indices_of_Deprivation.xlsx"
BASE_DATA_PATH = Path("./data_sources")
BASE_DATA_PATH.mkdir(exist_ok=True)
XLSX_PATH = BASE_DATA_PATH / "IoD2025_Scores.xlsx"
PARQUET_PATH = BASE_DATA_PATH / "IoD2025_Scores.parquet"
def download_file(url: str, output_path: Path) -> None:
    """Download ``url`` to ``output_path``, printing progress as it goes.

    The body is streamed into a sibling ``<name>.part`` file and renamed to
    ``output_path`` only after the whole response has been written, so a
    failed transfer never leaves a truncated file that a later run would
    treat as already downloaded.

    Args:
        url: Address to fetch; redirects are followed.
        output_path: Final location of the downloaded file.

    Raises:
        httpx.HTTPStatusError: The server answered with a 4xx/5xx status.
    """
    print(f"Downloading from {url}...")
    partial_path = output_path.with_name(output_path.name + ".part")
    with httpx.stream("GET", url, follow_redirects=True, timeout=60) as response:
        response.raise_for_status()
        total = int(response.headers.get("content-length", 0))
        downloaded = 0
        with open(partial_path, "wb") as f:
            for chunk in response.iter_bytes(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)
                if total:
                    # "\r" redraws the same line; flush because no newline is
                    # written and stdout would otherwise hold the text back.
                    print(
                        f"\rDownloaded {downloaded / 1024 / 1024:.1f} MB / {total / 1024 / 1024:.1f} MB",
                        end="",
                        flush=True,
                    )
    # Publish the file only after the stream finished without error.
    partial_path.replace(output_path)
    print(f"\nSaved to {output_path}")
def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
    """Write the second worksheet of the workbook to ZSTD-compressed Parquet.

    Prints the frame's shape and columns, then the size of both files.
    """
    print("Reading Excel file (sheet 2)...")

    # sheet_id counts from 1, so 2 selects the second worksheet.
    second_sheet = 2
    df = pl.read_excel(xlsx_path, sheet_id=second_sheet)

    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns}")

    df.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")

    print(f"Excel size: {xlsx_path.stat().st_size / 1024 / 1024:.1f} MB")
    print(f"Parquet size: {parquet_path.stat().st_size / 1024 / 1024:.1f} MB")
def main() -> None:
    """Fetch the IoD2025 workbook unless it is already on disk, then convert it.

    The conversion step runs on every invocation, whether or not the workbook
    had to be downloaded.
    """
    if XLSX_PATH.exists():
        print(f"Excel file already exists at {XLSX_PATH}, skipping download")
    else:
        download_file(URL, XLSX_PATH)

    convert_to_parquet(XLSX_PATH, PARQUET_PATH)


if __name__ == "__main__":
    main()

View file

@ -1,114 +0,0 @@
#!/usr/bin/env python3
"""Download Land Registry price paid data and convert to Parquet."""
# Run it with:
# uv run download_land_registry.py
# Note: the download needs unrestricted network access, so run this script on a machine that can reach the Land Registry bucket. The ~5 GB CSV should compress to roughly 1 GB in Parquet format with ZSTD compression.
import time
import httpx
import polars as pl
from pathlib import Path
from tqdm import tqdm
URL = "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv"
BASE_DATA_PATH = Path("./data_sources")
BASE_DATA_PATH.mkdir(exist_ok=True)
CSV_PATH = BASE_DATA_PATH / "pp-complete.csv"
PARQUET_PATH = BASE_DATA_PATH / "pp-complete.parquet"
MAX_RETRIES = 3
def download_with_progress(url: str, output_path: Path) -> None:
    """Stream ``url`` to ``output_path`` with a progress bar, retrying on transport errors.

    The body is first written to a sibling ``<name>.part`` file and only renamed
    to ``output_path`` once the whole response has been received, so an
    interrupted transfer never leaves a truncated file that a later run would
    mistake for a finished download.

    Args:
        url: Address to fetch; redirects are followed.
        output_path: Final location of the downloaded file.

    Raises:
        httpx.HTTPStatusError: The server answered with a 4xx/5xx status
            (raised immediately, not retried).
        httpx.TransportError: The transfer still failed on the last of
            ``MAX_RETRIES`` attempts.
    """
    partial_path = output_path.with_name(output_path.name + ".part")
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            with httpx.stream(
                "GET",
                url,
                follow_redirects=True,
                # 30s for connect/write/pool; no read timeout for the long body.
                timeout=httpx.Timeout(30.0, read=None),
            ) as response:
                response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
                # A missing Content-Length becomes None so tqdm shows a plain
                # byte counter instead of a bar pinned at 0/0.
                total = int(response.headers.get("content-length", 0)) or None
                with (
                    open(partial_path, "wb") as f,
                    tqdm(
                        total=total,
                        unit="B",
                        unit_scale=True,
                        unit_divisor=1024,
                        desc="Downloading",
                    ) as pbar,
                ):
                    for chunk in response.iter_bytes(chunk_size=8192):
                        f.write(chunk)
                        pbar.update(len(chunk))
            # Publish the file only after the stream finished without error.
            partial_path.replace(output_path)
            return  # Success
        except httpx.TransportError as e:
            # TransportError is the common base of ConnectError and ReadTimeout
            # and also covers connections dropped mid-stream (ReadError).
            if attempt < MAX_RETRIES:
                wait = 2**attempt  # exponential backoff: 2s, 4s, ...
                print(f"Attempt {attempt} failed: {e}. Retrying in {wait}s...")
                time.sleep(wait)
            else:
                raise
def convert_to_parquet(csv_path: Path, parquet_path: Path) -> None:
    """Read the header-less price-paid CSV and save it as ZSTD-compressed Parquet.

    The file has no header row, so the sixteen column names are supplied here.
    Afterwards the row count and both file sizes are printed.
    """
    print("Converting to Parquet...")

    # Column order as listed in the price paid data guidance:
    # https://www.gov.uk/guidance/about-the-price-paid-data
    column_names = [
        "transaction_id",
        "price",
        "date_of_transfer",
        "postcode",
        "property_type",
        "old_new",
        "duration",
        "paon",
        "saon",
        "street",
        "locality",
        "town_city",
        "district",
        "county",
        "ppd_category",
        "record_status",
    ]
    read_options = {
        "has_header": False,
        "new_columns": column_names,
        "try_parse_dates": True,
    }
    df = pl.read_csv(csv_path, **read_options)

    df.write_parquet(parquet_path, compression="zstd")

    print(f"Saved to {parquet_path}")
    print(f"Rows: {df.height:,}")
    print(f"CSV size: {csv_path.stat().st_size / 1024**2:.1f} MB")
    print(f"Parquet size: {parquet_path.stat().st_size / 1024**2:.1f} MB")
def main() -> None:
    """Produce the price-paid Parquet file, downloading the CSV only when needed.

    Returns immediately when the Parquet output already exists; an existing
    CSV is reused instead of being downloaded again.
    """
    if PARQUET_PATH.exists():
        print(f"Parquet already exists at {PARQUET_PATH}, skipping")
        return

    if CSV_PATH.exists():
        print(f"CSV already exists at {CSV_PATH}, skipping download")
    else:
        download_with_progress(URL, CSV_PATH)

    convert_to_parquet(CSV_PATH, PARQUET_PATH)


if __name__ == "__main__":
    main()

View file

@ -1,54 +0,0 @@
"""Download POI data for the UK from Overture Maps."""
from pathlib import Path
import overturemaps
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
# UK bounding box (west, south, east, north)
UK_BBOX = (-8.65, 49.86, 1.77, 60.86)
OUTPUT_DIR = Path("data_sources")
OUTPUT_FILE = OUTPUT_DIR / "uk_pois.parquet"
def main():
    """Download Overture Maps "place" records for the UK bounding box to Parquet.

    Does nothing when ``OUTPUT_FILE`` already exists. Otherwise every record
    batch returned for ``UK_BBOX`` is collected, combined into a single table
    and written out. The table is written to a sibling ``.part`` file and
    renamed at the end, so an interrupted write cannot leave a partial
    ``OUTPUT_FILE`` that would make later runs skip the download.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    if OUTPUT_FILE.exists():
        print(f"POI file already exists: {OUTPUT_FILE}")
        print("Delete it manually to re-download.")
        return

    print("Downloading UK POI data from Overture Maps...")
    print(f"Bounding box: {UK_BBOX}")
    print("This may take several minutes...")
    reader = overturemaps.record_batch_reader("place", bbox=UK_BBOX)

    # Read all batches, keeping a running row total for the progress display
    # instead of re-summing every batch on each iteration.
    batches = []
    total_rows = 0
    with tqdm(desc="Downloading batches", unit=" batches") as pbar:
        for batch in reader:
            batches.append(batch)
            total_rows += batch.num_rows
            pbar.update(1)
            pbar.set_postfix(rows=total_rows)

    if not batches:
        print("No data found in bounding box!")
        return

    # Combine batches into a table and write
    table = pa.Table.from_batches(batches, schema=reader.schema)
    print(f"\nWriting {table.num_rows:,} POIs to {OUTPUT_FILE}...")
    partial_file = OUTPUT_FILE.with_name(OUTPUT_FILE.name + ".part")
    pq.write_table(table, partial_file)
    # Publish the file only once it has been written completely.
    partial_file.replace(OUTPUT_FILE)

    print(f"Download complete: {OUTPUT_FILE}")
    print(f"File size: {OUTPUT_FILE.stat().st_size / 1024 / 1024:.1f} MB")


if __name__ == "__main__":
    main()

View file

@ -11,6 +11,7 @@
"@deck.gl/core": "^9.0.0",
"@deck.gl/geo-layers": "^9.0.0",
"@deck.gl/layers": "^9.0.0",
"@deck.gl/mapbox": "^9.2.6",
"@deck.gl/react": "^9.0.0",
"@radix-ui/react-select": "^2.0.0",
"@radix-ui/react-slider": "^1.1.0",
@ -181,6 +182,22 @@
"@luma.gl/engine": "~9.2.6"
}
},
"node_modules/@deck.gl/mapbox": {
"version": "9.2.6",
"resolved": "https://registry.npmjs.org/@deck.gl/mapbox/-/mapbox-9.2.6.tgz",
"integrity": "sha512-gyqCHZwiZS8LOYY6LILQQp5YCCf++VFk/wRoGskZvhb/kdEPX2Onv8iV8pXe0h9UyMLO6Mj0wl3HlJWg2ILkrg==",
"license": "MIT",
"dependencies": {
"@luma.gl/constants": "^9.2.6",
"@math.gl/web-mercator": "^4.1.0"
},
"peerDependencies": {
"@deck.gl/core": "~9.2.0",
"@luma.gl/constants": "~9.2.6",
"@luma.gl/core": "~9.2.6",
"@math.gl/web-mercator": "^4.1.0"
}
},
"node_modules/@deck.gl/mesh-layers": {
"version": "9.2.6",
"resolved": "https://registry.npmjs.org/@deck.gl/mesh-layers/-/mesh-layers-9.2.6.tgz",

View file

@ -11,41 +11,42 @@
"format:check": "prettier --check \"src/**/*.{ts,tsx,css}\""
},
"dependencies": {
"react": "^18.2.0",
"react-dom": "^18.2.0",
"@deck.gl/core": "^9.0.0",
"@deck.gl/layers": "^9.0.0",
"@deck.gl/geo-layers": "^9.0.0",
"@deck.gl/layers": "^9.0.0",
"@deck.gl/mapbox": "^9.2.6",
"@deck.gl/react": "^9.0.0",
"maplibre-gl": "^4.0.0",
"react-map-gl": "^7.1.0",
"@radix-ui/react-slider": "^1.1.0",
"@radix-ui/react-select": "^2.0.0",
"@radix-ui/react-slider": "^1.1.0",
"class-variance-authority": "^0.7.0",
"clsx": "^2.1.0",
"maplibre-gl": "^4.0.0",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"react-map-gl": "^7.1.0",
"tailwind-merge": "^2.2.0",
"tailwindcss-animate": "^1.0.7"
},
"devDependencies": {
"webpack": "^5.90.0",
"webpack-cli": "^5.1.0",
"webpack-dev-server": "^5.0.0",
"html-webpack-plugin": "^5.6.0",
"css-loader": "^7.0.0",
"style-loader": "^4.0.0",
"postcss-loader": "^8.0.0",
"ts-loader": "^9.5.0",
"typescript": "^5.4.0",
"@types/react": "^18.2.0",
"@types/react-dom": "^18.2.0",
"tailwindcss": "^3.4.0",
"autoprefixer": "^10.4.0",
"postcss": "^8.4.0",
"eslint": "^8.57.0",
"@typescript-eslint/eslint-plugin": "^7.0.0",
"@typescript-eslint/parser": "^7.0.0",
"autoprefixer": "^10.4.0",
"css-loader": "^7.0.0",
"eslint": "^8.57.0",
"eslint-plugin-react": "^7.34.0",
"eslint-plugin-react-hooks": "^4.6.0",
"prettier": "^3.2.0"
"html-webpack-plugin": "^5.6.0",
"postcss": "^8.4.0",
"postcss-loader": "^8.0.0",
"prettier": "^3.2.0",
"style-loader": "^4.0.0",
"tailwindcss": "^3.4.0",
"ts-loader": "^9.5.0",
"typescript": "^5.4.0",
"webpack": "^5.90.0",
"webpack-cli": "^5.1.0",
"webpack-dev-server": "^5.0.0"
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,243 @@
import { useMemo } from 'react';
import type { FeatureMeta, HexagonStatsResponse } from '../types';
/** Props accepted by the AreaPane side panel. */
interface AreaPaneProps {
  /** Statistics for the current hexagon; while null no counts or charts are rendered. */
  stats: HexagonStatsResponse | null;
  /** Feature metadata used to group and order the per-feature cards. */
  globalFeatures: FeatureMeta[];
  /** When true and `stats` is null, a "Loading..." placeholder is shown. */
  loading: boolean;
  /** Identifier of the hexagon being shown; null renders the "Click a hexagon" hint instead. */
  hexagonId: string | null;
  /** Shows the "Preview" badge next to the title when true. */
  isHoveredPreview: boolean;
  /** Current state of the live-preview toggle button. */
  hoverMode: boolean;
  /** Called with the negated `hoverMode` when the toggle button is clicked. */
  onHoverModeChange: (enabled: boolean) => void;
  /** Called when the "View N Properties" button is clicked. */
  onViewProperties: () => void;
  /** Called when the close (X) button is clicked. */
  onClose: () => void;
}
/**
 * Format a number compactly for display.
 *
 * - magnitudes that round to one million or more: one decimal with an "M" suffix
 * - magnitudes of one thousand or more: whole thousands with a "k" suffix
 * - other integers: locale-formatted
 * - everything else: one decimal place
 *
 * @param value Number to format.
 * @returns The formatted string.
 */
function formatValue(value: number): string {
  const magnitude = Math.abs(value);
  // Switch to millions from 999,500 upwards: those values would otherwise
  // round to "1000k" in the thousands branch.
  if (magnitude >= 999_500) return `${(value / 1_000_000).toFixed(1)}M`;
  if (magnitude >= 1_000) return `${(value / 1_000).toFixed(0)}k`;
  if (Number.isInteger(value)) return value.toLocaleString();
  return value.toFixed(1);
}
/**
 * Group features by their `group` field, preserving first-seen order.
 *
 * Features with an empty or missing `group` are collected under "Other".
 * Groups appear in the order their first feature occurs in `globalFeatures`,
 * and features keep their original order within each group.
 *
 * @param globalFeatures Features to partition.
 * @returns One `{ name, features }` entry per distinct group name.
 */
function groupFeatures(
  globalFeatures: FeatureMeta[]
): { name: string; features: FeatureMeta[] }[] {
  // A Map iterates in insertion order, so it provides both the constant-time
  // lookup and the first-seen ordering without a separate "seen" set.
  const groupsByName = new Map<string, { name: string; features: FeatureMeta[] }>();
  for (const feature of globalFeatures) {
    const groupName = feature.group || 'Other';
    let group = groupsByName.get(groupName);
    if (!group) {
      group = { name: groupName, features: [] };
      groupsByName.set(groupName, group);
    }
    group.features.push(feature);
  }
  return Array.from(groupsByName.values());
}
/**
 * Small bar chart of histogram counts.
 *
 * Renders nothing when `maxCount` is 0. Otherwise adjacent bins are summed
 * into roughly 20 bars, each scaled against the tallest bar; empty bars are
 * drawn almost transparent.
 */
function MiniHistogram({ counts, maxCount }: { counts: number[]; maxCount: number }) {
  if (maxCount === 0) {
    return null;
  }

  // Merge neighbouring bins so that about 20 bars are drawn.
  const desiredBars = 20;
  const binsPerBar = Math.max(1, Math.floor(counts.length / desiredBars));
  const barTotals = Array.from(
    { length: Math.ceil(counts.length / binsPerBar) },
    (_, barIndex) =>
      counts
        .slice(barIndex * binsPerBar, (barIndex + 1) * binsPerBar)
        .reduce((total, binCount) => total + binCount, 0)
  );

  // Floor of 1 keeps the height calculation away from a division by zero.
  const tallest = Math.max(...barTotals, 1);

  return (
    <div className="flex items-end gap-px h-8 mt-1">
      {barTotals.map((barTotal, barIndex) => {
        const barStyle = {
          height: `${(barTotal / tallest) * 100}%`,
          opacity: barTotal > 0 ? 1 : 0.1,
        };
        return (
          <div
            key={barIndex}
            className="flex-1 bg-teal-500 dark:bg-teal-400 rounded-t-sm min-w-[2px]"
            style={barStyle}
          />
        );
      })}
    </div>
  );
}
/**
 * Horizontal bar chart for a categorical feature.
 *
 * One row per label, sorted by count in descending order, with each bar's
 * width relative to the largest count.
 */
function EnumBarChart({ counts }: { counts: Record<string, number> }) {
  const rows = Object.entries(counts);
  rows.sort((left, right) => right[1] - left[1]);

  // Floor of 1 keeps the width calculation away from a division by zero.
  const largest = Math.max(1, ...rows.map((row) => row[1]));

  return (
    <div className="space-y-1 mt-1">
      {rows.map((row) => {
        const [label, count] = row;
        const widthPercent = (count / largest) * 100;
        return (
          <div key={label} className="flex items-center gap-2 text-xs">
            <span className="w-16 truncate text-warm-500 dark:text-warm-400 text-right shrink-0">
              {label}
            </span>
            <div className="flex-1 h-3 bg-warm-100 dark:bg-navy-700 rounded overflow-hidden">
              <div
                className="h-full bg-teal-500 dark:bg-teal-400 rounded"
                style={{ width: `${widthPercent}%` }}
              />
            </div>
            <span className="w-8 text-warm-500 dark:text-warm-400 text-right shrink-0">{count}</span>
          </div>
        );
      })}
    </div>
  );
}
/**
 * Side panel showing aggregate statistics for one hexagon.
 *
 * With no `hexagonId` a hint to click a hexagon is rendered. Otherwise the
 * panel shows a header (title, optional "Preview" badge, live-preview toggle,
 * close button, property count and "View Properties" button) followed by one
 * card per feature that has statistics, grouped by the feature's group name.
 * Numeric features get mean/min/max plus a mini histogram; enum features get
 * a bar chart of their value counts.
 */
export default function AreaPane({
  stats,
  globalFeatures,
  loading,
  hexagonId,
  isHoveredPreview,
  hoverMode,
  onHoverModeChange,
  onViewProperties,
  onClose,
}: AreaPaneProps) {
  const featureGroups = useMemo(() => groupFeatures(globalFeatures), [globalFeatures]);

  // Build lookup maps from stats
  const numericByName = useMemo(() => {
    if (!stats) return new Map();
    return new Map(stats.numeric_features.map((feature) => [feature.name, feature]));
  }, [stats]);

  const enumByName = useMemo(() => {
    if (!stats) return new Map();
    return new Map(stats.enum_features.map((feature) => [feature.name, feature]));
  }, [stats]);

  if (!hexagonId) {
    return (
      <div className="flex items-center justify-center h-full text-warm-500 dark:text-warm-400 px-4 text-center text-sm">
        Click a hexagon to view area statistics
      </div>
    );
  }

  return (
    <div className="flex flex-col h-full">
      {/* Header */}
      <div className="p-3 border-b border-warm-200 dark:border-navy-700">
        <div className="flex justify-between items-center">
          <div className="flex items-center gap-2">
            <h2 className="text-sm font-semibold dark:text-warm-100">Area Statistics</h2>
            {isHoveredPreview && (
              <span className="text-xs px-1.5 py-0.5 rounded bg-teal-50 dark:bg-teal-900/30 text-teal-600 dark:text-teal-400">
                Preview
              </span>
            )}
          </div>
          <div className="flex items-center gap-1">
            {/* Toggle button: aria-pressed exposes the on/off state to assistive tech */}
            <button
              type="button"
              onClick={() => onHoverModeChange(!hoverMode)}
              aria-pressed={hoverMode}
              className={`p-1 rounded ${
                hoverMode
                  ? 'text-teal-600 dark:text-teal-400 bg-teal-50 dark:bg-teal-900/30'
                  : 'text-warm-400 hover:text-warm-700 dark:hover:text-warm-300'
              }`}
              title={hoverMode ? 'Live preview on (click to lock)' : 'Live preview off (click to enable)'}
            >
              <svg
                className="w-4 h-4"
                viewBox="0 0 24 24"
                fill="none"
                stroke="currentColor"
                strokeWidth={2}
                aria-hidden="true"
              >
                <path strokeLinecap="round" strokeLinejoin="round" d="M15 12a3 3 0 11-6 0 3 3 0 016 0z" />
                <path
                  strokeLinecap="round"
                  strokeLinejoin="round"
                  d="M2.458 12C3.732 7.943 7.523 5 12 5c4.478 0 8.268 2.943 9.542 7-1.274 4.057-5.064 7-9.542 7-4.477 0-8.268-2.943-9.542-7z"
                />
              </svg>
            </button>
            {/* Icon-only button, so the accessible name has to come from aria-label */}
            <button
              type="button"
              onClick={onClose}
              aria-label="Close"
              className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 p-1"
            >
              <svg
                className="w-4 h-4"
                viewBox="0 0 24 24"
                fill="none"
                stroke="currentColor"
                strokeWidth={2}
                aria-hidden="true"
              >
                <path strokeLinecap="round" strokeLinejoin="round" d="M6 18L18 6M6 6l12 12" />
              </svg>
            </button>
          </div>
        </div>
        {stats && (
          <p className="text-sm text-warm-600 dark:text-warm-400 mt-1">
            {stats.count.toLocaleString()} properties
          </p>
        )}
        {stats && (
          <button
            type="button"
            onClick={onViewProperties}
            className="mt-2 w-full text-sm py-1.5 rounded bg-teal-600 hover:bg-teal-700 text-white font-medium"
          >
            View {stats.count.toLocaleString()} Properties
          </button>
        )}
      </div>

      {/* Stats content */}
      <div className="flex-1 overflow-y-auto">
        {loading && !stats ? (
          <div className="p-4 text-warm-500 dark:text-warm-400 text-sm">Loading...</div>
        ) : stats ? (
          <div className="p-3 space-y-4">
            {featureGroups.map((group) => {
              // Check if any feature in this group has data
              const hasData = group.features.some(
                (feature) => numericByName.has(feature.name) || enumByName.has(feature.name)
              );
              if (!hasData) return null;

              return (
                <div key={group.name}>
                  <h3 className="text-xs font-semibold text-warm-500 dark:text-warm-400 uppercase tracking-wider mb-2">
                    {group.name}
                  </h3>
                  <div className="space-y-3">
                    {group.features.map((feature) => {
                      const numericStats = numericByName.get(feature.name);
                      const enumStats = enumByName.get(feature.name);

                      if (numericStats) {
                        // The leading 0 makes an empty histogram yield 0 rather
                        // than -Infinity, so MiniHistogram's zero check applies.
                        const maxCount = Math.max(0, ...numericStats.histogram.counts);
                        return (
                          <div key={feature.name} className="bg-warm-50 dark:bg-navy-800 rounded p-2">
                            <div className="flex justify-between items-baseline">
                              <span className="text-xs text-warm-700 dark:text-warm-300 truncate mr-2">
                                {feature.name}
                              </span>
                              <span className="text-xs font-semibold text-teal-700 dark:text-teal-400 whitespace-nowrap">
                                {formatValue(numericStats.mean)}
                              </span>
                            </div>
                            <div className="flex justify-between text-[10px] text-warm-400 dark:text-warm-500 mt-0.5">
                              <span>{formatValue(numericStats.min)}</span>
                              <span>{formatValue(numericStats.max)}</span>
                            </div>
                            <MiniHistogram counts={numericStats.histogram.counts} maxCount={maxCount} />
                          </div>
                        );
                      }

                      if (enumStats) {
                        return (
                          <div key={feature.name} className="bg-warm-50 dark:bg-navy-800 rounded p-2">
                            <span className="text-xs text-warm-700 dark:text-warm-300">
                              {feature.name}
                            </span>
                            <EnumBarChart counts={enumStats.counts} />
                          </div>
                        );
                      }

                      return null;
                    })}
                  </div>
                </div>
              );
            })}
          </div>
        ) : null}
      </div>
    </div>
  );
}

View file

@ -0,0 +1,10 @@
/**
 * "Data Sources" button pinned to the bottom-right corner of its container.
 * Clicking it invokes `onNavigate`.
 */
export default function DataSources(props: { onNavigate: () => void }) {
  const { onNavigate } = props;

  const buttonClasses =
    'absolute bottom-2 right-2 bg-white/90 dark:bg-navy-800/90 backdrop-blur-sm px-3 py-2 rounded shadow-lg text-xs text-teal-600 dark:text-teal-400 hover:text-teal-800 dark:hover:text-teal-300 hover:underline font-semibold transition-colors';

  return (
    <button onClick={onNavigate} className={buttonClasses}>
      Data Sources
    </button>
  );
}

View file

@ -0,0 +1,214 @@
import { useEffect, useState, useRef } from 'react';
const DATA_SOURCES = [
{
id: 'price-paid',
name: 'Price Paid Data',
origin: 'HM Land Registry',
use: 'Complete historical property sale prices for England and Wales. Used for the last known sale price of each property.',
url: 'https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads',
license: 'Open Government Licence v3.0',
},
{
id: 'epc',
name: 'Energy Performance Certificates (EPC)',
origin: 'Ministry of Housing, Communities & Local Government',
use: 'Domestic Energy Performance Certificates providing floor area, number of rooms, construction age, energy ratings, property type, and built form. Fuzzy-joined with Price Paid records by address within postcode buckets.',
url: 'https://epc.opendatacommunities.org/downloads/domestic',
license: 'Open Government Licence v3.0',
},
{
id: 'nspl',
name: 'National Statistics Postcode Lookup (NSPL)',
origin: 'ONS / ArcGIS',
use: 'Maps postcodes to latitude/longitude, LSOA, and Output Area codes for geolocation and joining area-level datasets.',
url: 'https://www.arcgis.com/sharing/rest/content/items/077631e063eb4e1ab43575d01381ec33/data',
license: 'Open Government Licence v3.0',
},
{
id: 'iod',
name: 'English Indices of Deprivation 2025',
origin: 'Ministry of Housing, Communities & Local Government',
use: 'Relative deprivation scores for 33,755 LSOAs across domains: Income, Employment, Education, Health, Crime, Living Environment, and sub-domains. Joined to properties via LSOA code.',
url: 'https://www.gov.uk/government/statistics/english-indices-of-deprivation-2025',
license: 'Open Government Licence v3.0',
},
{
id: 'ethnicity',
name: 'Population by Ethnicity (2021 Census)',
origin: 'ONS',
use: 'Population percentages by ethnic group (Asian, Black, Mixed, White, Other) per Local Authority. Joined via Local Authority District code.',
url: 'https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/#download-the-data',
license: 'Open Government Licence v3.0',
},
{
id: 'crime',
name: 'Street-level Crime Data',
origin: 'data.police.uk',
use: 'Street-level crime data from 2023 to 2025, aggregated into yearly averages by LSOA and crime type (violence, burglary, anti-social behaviour, drugs, vehicle crime, etc.).',
url: 'https://data.police.uk/data/',
license: 'Open Government Licence v3.0',
},
{
id: 'tfl-journey-times',
name: 'TfL Journey Times',
origin: 'Transport for London',
use: "Journey time calculations from postcodes to central London destinations (Bank, Waterloo, King's Cross, etc.) via public transport and cycling.",
url: 'https://api-portal.tfl.gov.uk/',
license: 'Powered by TfL Open Data',
},
{
id: 'osm-pois',
name: 'OpenStreetMap POIs',
origin: 'OpenStreetMap contributors / Geofabrik',
use: 'Points of interest extracted from the Great Britain PBF extract. Covers amenities, shops, healthcare, leisure, tourism, and more. Filtered and remapped to friendly category names.',
url: 'https://download.geofabrik.de/europe/great-britain-latest.osm.pbf',
license: 'Open Data Commons Open Database License (ODbL)',
},
{
id: 'naptan',
name: 'NaPTAN (Public Transport Stops)',
origin: 'Department for Transport',
use: 'National Public Transport Access Nodes providing station and stop locations (rail, bus, metro/tram, ferry, airport), merged into the POI dataset.',
url: 'https://naptan.dft.gov.uk/naptan/schema/2.4/doc/NaPTANSchemaGuide-2.4-v0.57.pdf',
license: 'Open Government Licence v3.0',
},
{
id: 'noise',
name: 'Defra Noise Mapping',
origin: 'Defra / Environment Agency',
use: 'Strategic noise mapping Round 4 (2022) for road, rail, and airport sources. Lden (day-evening-night 24h weighted average) at 10m grid resolution, modelled at 4m above ground. Sampled at postcode centroids via WCS GeoTIFF tiles.',
url: 'https://environment.data.gov.uk/spatialdata/road-noise-all-metrics-england-round-4/wcs',
license: 'Open Government Licence v3.0',
},
{
id: 'ofsted',
name: 'Ofsted School Inspections',
origin: 'Ofsted',
use: 'Latest inspection outcomes for state-funded schools (as at April 2025). Averaged per postcode to give a local school quality score (1=Outstanding to 4=Inadequate).',
url: 'https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes',
license: 'Open Government Licence v3.0',
},
{
id: 'broadband',
name: 'Ofcom Broadband Performance',
origin: 'Ofcom',
use: 'Fixed broadband coverage and speeds by Output Area from Connected Nations 2025. Includes max download/upload speeds across different speed tiers.',
url: 'https://www.ofcom.org.uk/phones-and-broadband/coverage-and-speeds/connected-nations-20252/data-downloads-2025',
license: 'Open Government Licence v3.0',
},
];
/**
 * Page listing every entry of DATA_SOURCES as a card, followed by an
 * attribution footer.
 *
 * When the URL hash matches a source id, that card is highlighted and
 * scrolled into view; this is re-evaluated on every `hashchange` event.
 */
export default function DataSourcesPage() {
  const [highlightedId, setHighlightedId] = useState<string | null>(null);
  const cardRefs = useRef<Record<string, HTMLDivElement | null>>({});

  useEffect(() => {
    // Handle of the pending scroll, kept so it can be cancelled when the hash
    // changes again or the page unmounts before the delay has elapsed.
    let scrollTimer: number | undefined;

    function handleHash() {
      window.clearTimeout(scrollTimer);
      const hash = window.location.hash.replace('#', '');
      if (hash && DATA_SOURCES.some((s) => s.id === hash)) {
        setHighlightedId(hash);
        // Scroll after a brief delay to allow render
        scrollTimer = window.setTimeout(() => {
          cardRefs.current[hash]?.scrollIntoView({ behavior: 'smooth', block: 'center' });
        }, 100);
      } else {
        setHighlightedId(null);
      }
    }

    handleHash();
    window.addEventListener('hashchange', handleHash);
    return () => {
      window.clearTimeout(scrollTimer);
      window.removeEventListener('hashchange', handleHash);
    };
  }, []);

  return (
    <div className="flex-1 overflow-y-auto bg-warm-50 dark:bg-navy-950 flex flex-col">
      <div className="flex-1">
        <div className="max-w-5xl mx-auto px-6 py-8">
          <h1 className="text-2xl font-bold text-warm-900 dark:text-warm-100 mb-2">Data Sources</h1>
          <p className="text-warm-600 dark:text-warm-400 mb-6">
            This application combines {DATA_SOURCES.length} open datasets covering property prices,
            energy performance, transport, demographics, crime, environment, and more.
          </p>

          <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
            {DATA_SOURCES.map((source) => (
              <div
                key={source.id}
                id={source.id}
                ref={(el) => { cardRefs.current[source.id] = el; }}
                className={`bg-white dark:bg-navy-800 rounded-lg border p-5 ${
                  highlightedId === source.id
                    ? 'border-teal-400 ring-2 ring-teal-400'
                    : 'border-warm-200 dark:border-navy-700'
                }`}
              >
                <div className="flex items-start justify-between gap-4 mb-2">
                  <h2 className="text-lg font-semibold text-warm-900 dark:text-warm-100">{source.name}</h2>
                  <span className="shrink-0 text-xs bg-warm-100 dark:bg-navy-700 text-warm-600 dark:text-warm-300 px-2 py-1 rounded">
                    {source.license}
                  </span>
                </div>
                <p className="text-sm text-warm-500 dark:text-warm-400 mb-2">Source: {source.origin}</p>
                <p className="text-sm text-warm-700 dark:text-warm-300 mb-3">{source.use}</p>
                <a
                  href={source.url}
                  target="_blank"
                  rel="noopener noreferrer"
                  className="text-sm text-teal-600 dark:text-teal-400 hover:text-teal-800 dark:hover:text-teal-300 hover:underline break-all"
                >
                  {source.url}
                </a>
              </div>
            ))}
          </div>
        </div>
      </div>

      <footer className="bg-navy-900 text-warm-400 px-6 py-6">
        <div className="max-w-5xl mx-auto">
          <h2 className="text-sm font-semibold text-warm-300 uppercase tracking-wide mb-3">
            Attribution
          </h2>
          <ul className="space-y-1.5 text-sm">
            <li>Contains HM Land Registry data &copy; Crown copyright and database right 2025.</li>
            <li>
              Contains public sector information licensed under the{' '}
              <a
                href="https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/"
                target="_blank"
                rel="noopener noreferrer"
                className="text-teal-400 hover:text-teal-300 hover:underline"
              >
                Open Government Licence v3.0
              </a>
              .
            </li>
            <li>Contains OS data &copy; Crown copyright and database rights 2025.</li>
            <li>Powered by TfL Open Data.</li>
            <li>
              Contains data from{' '}
              <a
                href="https://www.openstreetmap.org/copyright"
                target="_blank"
                rel="noopener noreferrer"
                className="text-teal-400 hover:text-teal-300 hover:underline"
              >
                &copy; OpenStreetMap contributors
              </a>
              , available under the{' '}
              <a
                href="https://opendatacommons.org/licenses/odbl/"
                target="_blank"
                rel="noopener noreferrer"
                className="text-teal-400 hover:text-teal-300 hover:underline"
              >
                Open Data Commons Open Database License (ODbL)
              </a>
              .
            </li>
          </ul>
        </div>
      </footer>
    </div>
  );
}

View file

@ -0,0 +1,119 @@
import { useState } from 'react';
/** One entry of the FAQ list. */
interface FAQItem {
  /** Text shown on the always-visible toggle button of the card. */
  question: string;
  /** Paragraph rendered below the question while the card is open. */
  answer: string;
}
const FAQ_ITEMS: FAQItem[] = [
{
question: 'What is this application?',
answer:
'Narrowit is an interactive map that visualises property-level data across England and Wales. It combines Land Registry sale prices, EPC energy certificates, TfL journey times, deprivation indices, crime statistics, broadband speeds, school ratings, road noise levels, ethnicity demographics, and OpenStreetMap points of interest into a single explorable view.',
},
{
question: 'Where does the data come from?',
answer:
'All data comes from open government and community sources. Property prices are from HM Land Registry, energy certificates from MHCLG, transport times from TfL, deprivation scores from the English Indices of Deprivation 2025, crime data from data.police.uk, school ratings from Ofsted, broadband from Ofcom, noise from Defra, ethnicity from the 2021 Census, and points of interest from OpenStreetMap. See the Data Sources page for full details and links.',
},
{
question: 'What are the coloured hexagons on the map?',
answer:
'The map uses H3 hexagons to aggregate property data at different zoom levels. Each hexagon summarises the properties within it. The colour represents the value of whichever feature you have pinned or are actively filtering — for example, average price or energy rating. Zoom in to see smaller, more detailed hexagons; zoom out for a broader overview.',
},
{
question: 'How do filters work?',
answer:
'Use the Filters panel on the left to narrow down properties. Add a filter by clicking a feature name, then drag the range slider to set minimum and maximum values. For categorical features like property type, select or deselect individual values. Only hexagons containing properties that match all active filters are shown. Filters are combined with AND logic — every property must satisfy every filter.',
},
{
question: 'What does the eye icon do on a filter?',
answer:
'The eye icon pins a feature as the colour source for the hexagon layer. When pinned, hexagons are coloured by that feature\'s value range even when you are not actively dragging its slider. This lets you visualise one feature while filtering on others. Click the eye icon again to unpin.',
},
{
question: 'How fresh is the data?',
answer:
'Property prices cover all Land Registry transactions up to the most recent quarterly release. EPC data includes certificates issued up to the latest available download. Crime data spans 2023–2025 as yearly averages. TfL journey times are computed from current timetables. Deprivation indices are from the 2025 release. School ratings reflect the latest Ofsted inspections as at April 2025. Broadband data is from Ofcom Connected Nations 2025.',
},
{
question: 'How are EPC records matched to Land Registry sales?',
answer:
'EPC and Land Registry records don\'t share a common identifier, so they are fuzzy-joined by address within each postcode bucket. The pipeline uses token-sorted string similarity with special handling for numeric tokens (house numbers, flat numbers). Matches are assigned greedily from highest similarity score downward so each record is used at most once.',
},
{
question: 'What are Points of Interest (POIs)?',
answer:
'POIs are places like cafes, schools, supermarkets, GP surgeries, parks, and train stations extracted from OpenStreetMap and the NaPTAN public transport dataset. Use the POI panel on the right to toggle categories on and off. POIs appear as markers on the map when you are zoomed in far enough.',
},
{
question: 'Can I share a specific view with someone?',
answer:
'Yes. The URL updates automatically as you pan, zoom, and change filters. Click the Share button in the header to copy the current URL to your clipboard. Anyone who opens that link will see the same view, filters, and active POI categories.',
},
{
question: 'How do I see individual properties?',
answer:
'Click on a hexagon to open the Properties panel on the right. It lists all matching properties within that hexagon, showing address, price, and key features. Use "Load more" at the bottom to paginate through large hexagons.',
},
{
question: 'Why are some hexagons grey?',
answer:
'Grey hexagons contain properties that have data but fall outside the range of your currently pinned or active feature. This gives you a sense of where properties exist even when their values are outside your selected range.',
},
{
question: 'Does this work on mobile?',
answer:
'The app is designed for desktop browsers where you have enough screen space for the map, filter panel, and POI/properties panel side by side. It will load on mobile but the experience is best on a larger screen.',
},
];
/**
 * Collapsible card for a single FAQ entry.
 *
 * The question is always visible on a toggle button; the answer is rendered
 * only while the card is open. Cards start closed.
 */
function FAQItemCard({ item }: { item: FAQItem }) {
  const [open, setOpen] = useState(false);

  return (
    <div className="bg-white dark:bg-navy-800 rounded-lg border border-warm-200 dark:border-navy-700">
      {/* aria-expanded tells assistive technology whether the answer is shown */}
      <button
        type="button"
        aria-expanded={open}
        className="w-full text-left px-5 py-4 flex items-center justify-between gap-4"
        // Functional update so the toggle never acts on a stale `open` value.
        onClick={() => setOpen((wasOpen) => !wasOpen)}
      >
        <span className="font-medium text-warm-900 dark:text-warm-100">{item.question}</span>
        <svg
          className={`w-5 h-5 shrink-0 text-warm-400 dark:text-warm-500 transform ${open ? 'rotate-180' : ''}`}
          fill="none"
          stroke="currentColor"
          viewBox="0 0 24 24"
          strokeWidth={2}
          aria-hidden="true"
        >
          <path strokeLinecap="round" strokeLinejoin="round" d="M19 9l-7 7-7-7" />
        </svg>
      </button>
      {open && (
        <div className="px-5 pb-4">
          <p className="text-sm text-warm-700 dark:text-warm-300 leading-relaxed">{item.answer}</p>
        </div>
      )}
    </div>
  );
}
/**
 * FAQ page: a heading, a short introduction and one collapsible card per
 * entry in FAQ_ITEMS.
 */
export default function FAQPage() {
  // FAQ_ITEMS is a fixed list, so the position in the list serves as the key.
  const cards = FAQ_ITEMS.map((faq, position) => <FAQItemCard key={position} item={faq} />);

  return (
    <div className="flex-1 overflow-y-auto bg-warm-50 dark:bg-navy-950">
      <div className="max-w-3xl mx-auto px-6 py-8">
        <h1 className="text-2xl font-bold text-warm-900 dark:text-warm-100 mb-2">
          Frequently Asked Questions
        </h1>
        <p className="text-warm-600 dark:text-warm-400 mb-6">
          Common questions about how Narrowit works, where the data comes from, and how to use the
          map.
        </p>

        <div className="space-y-3">{cards}</div>
      </div>
    </div>
  );
}

View file

@ -1,119 +1,466 @@
import { memo, useState, useRef, useCallback, useMemo, useEffect } from 'react';
import { Slider } from './ui/slider';
import { Label } from './ui/label';
import { YEAR_MIN, YEAR_MAX, YEAR_STEP, PRICE_MIN, PRICE_MAX, PRICE_STEP } from '../lib/constants';
import type { Filters as FiltersType, POICategoryGroup } from '../types';
import { POI_CATEGORY_GROUPS } from '../types';
import type { FeatureMeta, FeatureFilters } from '../types';
interface FiltersProps {
filters: FiltersType;
onChange: (filters: FiltersType) => void;
features: FeatureMeta[];
filters: FeatureFilters;
activeFeature: string | null;
dragValue: [number, number] | null;
enabledFeatures: Set<string>;
onAddFilter: (name: string) => void;
onRemoveFilter: (name: string) => void;
onFilterChange: (name: string, value: [number, number] | string[]) => void;
onDragStart: (name: string) => void;
onDragChange: (value: [number, number]) => void;
onDragEnd: () => void;
zoom: number;
selectedPOICategories: Set<POICategoryGroup>;
onPOICategoriesChange: (categories: Set<POICategoryGroup>) => void;
pinnedFeature: string | null;
onTogglePin: (name: string) => void;
onCancelPin: () => void;
onNavigateToSource?: (slug: string, featureName: string) => void;
openInfoFeature?: string | null;
onClearOpenInfoFeature?: () => void;
}
const POI_LABELS: Record<POICategoryGroup, string> = {
schools: '🏫 Schools',
healthcare: '🏥 Healthcare',
transport: '🚉 Transport',
parks: '🌳 Parks',
emergency: '🚨 Emergency',
supermarkets: '🛒 Supermarkets',
};
/**
 * Eye glyph used by the "color map by this feature" toggle buttons.
 *
 * @param filled - When true the shape is filled with the current text colour,
 *   otherwise only the outline is drawn.
 * @param className - Optional CSS classes; falls back to `w-3.5 h-3.5`.
 */
function EyeIcon({ filled, className }: { filled: boolean; className?: string }) {
  const svgClasses = className ? className : 'w-3.5 h-3.5';
  const fillPaint = filled ? 'currentColor' : 'none';

  return (
    <svg
      className={svgClasses}
      viewBox="0 0 24 24"
      fill={fillPaint}
      stroke="currentColor"
      strokeWidth={2}
    >
      <path d="M1 12s4-8 11-8 11 8 11 8-4 8-11 8-11-8-11-8z" />
      <circle cx="12" cy="12" r="3" />
    </svg>
  );
}
export default function Filters({
filters,
onChange,
zoom,
selectedPOICategories,
onPOICategoriesChange,
}: FiltersProps) {
const update = (key: keyof FiltersType, value: number) => onChange({ ...filters, [key]: value });
function InfoPopup({
feature,
onClose,
onNavigateToSource,
}: {
feature: FeatureMeta;
onClose: () => void;
onNavigateToSource?: (slug: string, featureName: string) => void;
}) {
const popupRef = useRef<HTMLDivElement>(null);
const togglePOICategory = (category: POICategoryGroup) => {
const newSet = new Set(selectedPOICategories);
if (newSet.has(category)) {
newSet.delete(category);
} else {
newSet.add(category);
useEffect(() => {
function handleClickOutside(e: MouseEvent) {
if (popupRef.current && !popupRef.current.contains(e.target as Node)) {
onClose();
}
}
onPOICategoriesChange(newSet);
};
document.addEventListener('mousedown', handleClickOutside);
return () => document.removeEventListener('mousedown', handleClickOutside);
}, [onClose]);
return (
<div className="w-72 p-4 bg-white shadow-lg space-y-6 overflow-y-auto max-h-screen">
<h1 className="text-xl font-bold">UK Property Prices</h1>
<div className="text-sm text-slate-500">Zoom: {zoom.toFixed(1)}</div>
<div className="space-y-2">
<Label>
Year Range: {filters.minYear} - {filters.maxYear}
</Label>
<Slider
min={YEAR_MIN}
max={YEAR_MAX}
step={YEAR_STEP}
value={[filters.minYear, filters.maxYear]}
onValueChange={([min, max]) => onChange({ ...filters, minYear: min, maxYear: max })}
/>
</div>
<div className="space-y-2">
<Label>Min Price: £{filters.minPrice.toLocaleString()}</Label>
<Slider
min={PRICE_MIN}
max={PRICE_MAX}
step={PRICE_STEP}
value={[filters.minPrice]}
onValueChange={([v]) => update('minPrice', v)}
/>
</div>
<div className="space-y-2">
<Label>Max Price: £{filters.maxPrice.toLocaleString()}</Label>
<Slider
min={PRICE_MIN}
max={PRICE_MAX}
step={PRICE_STEP}
value={[filters.maxPrice]}
onValueChange={([v]) => update('maxPrice', v)}
/>
</div>
<div className="mt-6 p-3 bg-slate-100 rounded text-xs">
<div className="mb-2 font-medium">Average Price</div>
<div
className="h-4 rounded"
style={{
background:
'linear-gradient(to right, rgb(46, 204, 113), rgb(241, 196, 15), rgb(231, 76, 60), rgb(142, 68, 173))',
}}
></div>
<div className="flex justify-between mt-1">
<span>£0</span>
<span>£200k</span>
<span>£400k</span>
<span>£800k+</span>
</div>
</div>
<div className="space-y-2">
<Label>Points of Interest</Label>
<div className="space-y-1">
{POI_CATEGORY_GROUPS.map((category) => (
<label key={category} className="flex items-center gap-2 cursor-pointer">
<input
type="checkbox"
checked={selectedPOICategories.has(category)}
onChange={() => togglePOICategory(category)}
className="rounded"
/>
<span className="text-sm">{POI_LABELS[category]}</span>
</label>
))}
<div className="fixed inset-0 z-50 flex items-center justify-center bg-black/30">
<div
ref={popupRef}
className="bg-white dark:bg-navy-800 border border-warm-200 dark:border-navy-700 rounded-lg shadow-xl max-w-md w-full mx-4 p-5"
>
<div className="flex items-start justify-between mb-3">
<h3 className="text-sm font-semibold text-warm-900 dark:text-warm-100 pr-4">
{feature.name}
</h3>
<button
onClick={onClose}
className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 shrink-0"
>
<svg className="w-4 h-4" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth={2}>
<path strokeLinecap="round" strokeLinejoin="round" d="M6 18L18 6M6 6l12 12" />
</svg>
</button>
</div>
{feature.description && (
<p className="text-xs text-warm-500 dark:text-warm-400 mb-2">{feature.description}</p>
)}
{feature.detail && (
<p className="text-sm text-warm-700 dark:text-warm-300 mb-4 leading-relaxed">{feature.detail}</p>
)}
{feature.source && onNavigateToSource && (
<button
onClick={() => {
onNavigateToSource(feature.source!, feature.name);
onClose();
}}
className="text-sm text-teal-600 dark:text-teal-400 hover:text-teal-800 dark:hover:text-teal-300 hover:underline"
>
View data source
</button>
)}
</div>
</div>
);
}
/**
 * Searchable, grouped list of the features that can still be added as filters.
 *
 * Each row offers up to three actions: open the info popup (only when the
 * feature has `detail` text), toggle the colour-by-feature pin, and add the
 * feature as a filter.
 *
 * @param availableFeatures - Features shown in the list.
 * @param allFeatures - Full feature list, used to resolve `openInfoFeature` by name.
 * @param pinnedFeature - Name of the currently pinned feature, or null.
 * @param onAddFilter - Called with the feature name when "+" is clicked.
 * @param onTogglePin - Called with the feature name when the eye is clicked.
 * @param onNavigateToSource - Forwarded to the info popup.
 * @param openInfoFeature - When set, the popup for that feature name is opened.
 * @param onClearOpenInfoFeature - Called after `openInfoFeature` has been handled.
 */
function FeatureBrowser({
  availableFeatures,
  allFeatures,
  pinnedFeature,
  onAddFilter,
  onTogglePin,
  onNavigateToSource,
  openInfoFeature,
  onClearOpenInfoFeature,
}: {
  availableFeatures: FeatureMeta[];
  allFeatures: FeatureMeta[];
  pinnedFeature: string | null;
  onAddFilter: (name: string) => void;
  onTogglePin: (name: string) => void;
  onNavigateToSource?: (slug: string, featureName: string) => void;
  openInfoFeature?: string | null;
  onClearOpenInfoFeature?: () => void;
}) {
  const [search, setSearch] = useState('');
  const [infoFeature, setInfoFeature] = useState<FeatureMeta | null>(null);

  // Auto-open the info popup for a requested feature, then acknowledge the
  // request so the parent can reset it.
  useEffect(() => {
    if (!openInfoFeature) return;
    const requested = allFeatures.find((candidate) => candidate.name === openInfoFeature);
    if (requested) setInfoFeature(requested);
    onClearOpenInfoFeature?.();
  }, [openInfoFeature, allFeatures, onClearOpenInfoFeature]);

  // Case-insensitive substring match on the feature name.
  const filtered = useMemo(() => {
    if (!search) return availableFeatures;
    const needle = search.toLowerCase();
    return availableFeatures.filter((feature) => feature.name.toLowerCase().includes(needle));
  }, [availableFeatures, search]);

  // Bucket by group name; a Map keeps groups in first-appearance order.
  const grouped = useMemo(() => {
    const buckets = new Map<string, FeatureMeta[]>();
    for (const feature of filtered) {
      const groupName = feature.group || 'Other';
      const bucket = buckets.get(groupName);
      if (bucket) {
        bucket.push(feature);
      } else {
        buckets.set(groupName, [feature]);
      }
    }
    const ordered: { name: string; features: FeatureMeta[] }[] = [];
    buckets.forEach((features, name) => {
      ordered.push({ name, features });
    });
    return ordered;
  }, [filtered]);

  const emptyMessage = search ? 'No matching features' : 'All features are active';

  return (
    <>
      <div className="p-2 border-b border-warm-200 dark:border-navy-700">
        <input
          type="text"
          placeholder="Search features..."
          value={search}
          onChange={(event) => setSearch(event.target.value)}
          className="w-full px-2 py-1 text-sm border rounded bg-white dark:bg-navy-800 dark:text-warm-200 border-warm-200 dark:border-navy-700 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-1 focus:ring-teal-400"
        />
      </div>
      <div className="flex-1 overflow-y-auto">
        {grouped.map((section) => (
          <div key={section.name}>
            <div className="px-3 py-1.5 text-xs font-bold text-warm-500 bg-warm-50 dark:bg-navy-950 dark:text-warm-400 sticky top-0">
              {section.name}
            </div>
            {section.features.map((feature) => {
              const pinned = pinnedFeature === feature.name;
              const pinClasses = pinned
                ? 'text-teal-600 dark:text-teal-400'
                : 'text-warm-400 hover:text-warm-700 dark:hover:text-warm-300';
              return (
                <div
                  key={feature.name}
                  className="flex items-start justify-between px-3 py-1.5 hover:bg-teal-50 dark:hover:bg-teal-900/30 dark:text-warm-300"
                >
                  <div className="min-w-0 mr-2">
                    <span className="text-sm truncate block">{feature.name}</span>
                    {feature.description && (
                      <span className="text-xs text-warm-400 dark:text-warm-500 truncate block">{feature.description}</span>
                    )}
                  </div>
                  <div className="flex items-center gap-1 shrink-0 mt-0.5">
                    {feature.detail && (
                      <button
                        onClick={() => setInfoFeature(feature)}
                        className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 p-0.5 rounded"
                        title="Feature info"
                      >
                        <svg className="w-3.5 h-3.5" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth={2}>
                          <circle cx="12" cy="12" r="10" />
                          <path strokeLinecap="round" d="M12 16v-4m0-4h.01" />
                        </svg>
                      </button>
                    )}
                    <button
                      onClick={() => onTogglePin(feature.name)}
                      className={`p-0.5 rounded ${pinClasses}`}
                      title={pinned ? 'Unpin color view' : 'Color map by this feature'}
                    >
                      <EyeIcon filled={pinned} />
                    </button>
                    <button
                      onClick={() => onAddFilter(feature.name)}
                      className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 p-0.5 rounded"
                      title="Add filter"
                    >
                      <svg className="w-3.5 h-3.5" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth={2}>
                        <path strokeLinecap="round" strokeLinejoin="round" d="M12 5v14m-7-7h14" />
                      </svg>
                    </button>
                  </div>
                </div>
              );
            })}
          </div>
        ))}
        {grouped.length === 0 && (
          <div className="px-3 py-4 text-sm text-warm-400 dark:text-warm-500 text-center">
            {emptyMessage}
          </div>
        )}
      </div>
      {infoFeature && (
        <InfoPopup
          feature={infoFeature}
          onClose={() => setInfoFeature(null)}
          onNavigateToSource={onNavigateToSource}
        />
      )}
    </>
  );
}
/**
 * Format a number compactly for slider labels.
 *
 * - |value| >= 1,000,000 -> one decimal with an "M" suffix (e.g. "2.5M")
 * - |value| >= 1,000     -> one decimal with a "k" suffix (e.g. "1.5k")
 * - integers             -> printed as-is
 * - other values         -> two decimals
 *
 * When rounding carries a value up to the next unit (999,999 would otherwise
 * print as "1000.0k", 999.999 as "1000.00") the result is promoted to that
 * unit. Non-finite input is returned as its plain string form ("NaN",
 * "Infinity", "-Infinity") with no suffix.
 *
 * @param value - Number to format.
 * @returns The compact display string.
 */
function formatValue(value: number): string {
  if (!Number.isFinite(value)) return String(value);

  const magnitude = Math.abs(value);
  const asMillions = (): string => `${(value / 1_000_000).toFixed(1)}M`;
  const asThousands = (): string => `${(value / 1_000).toFixed(1)}k`;

  if (magnitude >= 1_000_000) return asMillions();

  if (magnitude >= 1_000) {
    // toFixed rounds, so values just under one million can print as "1000.0k".
    const rounded = Number((value / 1_000).toFixed(1));
    return Math.abs(rounded) >= 1_000 ? asMillions() : asThousands();
  }

  if (Number.isInteger(value)) return value.toString();

  // Same carry problem one unit down: 999.999 rounds to "1000.00".
  const plain = value.toFixed(2);
  return Math.abs(Number(plain)) >= 1_000 ? asThousands() : plain;
}
/**
 * Left-hand filter panel.
 *
 * The panel is split vertically by a draggable separator:
 *  - top: one control per enabled feature (checkbox list for `enum`
 *    features, range slider for everything else);
 *  - bottom: the FeatureBrowser listing features that are not enabled yet.
 *
 * Slider movement is reported through onDragStart / onDragChange / onDragEnd;
 * while a feature is the `activeFeature`, its label and thumbs follow
 * `dragValue` instead of the stored filter value.
 *
 * `onCancelPin` is accepted as part of FiltersProps but is not used here.
 */
export default memo(function Filters({
  features,
  filters,
  activeFeature,
  dragValue,
  enabledFeatures,
  onAddFilter,
  onRemoveFilter,
  onFilterChange,
  onDragStart,
  onDragChange,
  onDragEnd,
  zoom,
  pinnedFeature,
  onTogglePin,
  onCancelPin,
  onNavigateToSource,
  openInfoFeature,
  onClearOpenInfoFeature,
}: FiltersProps) {
  // Memoised so FeatureBrowser's own memos are not invalidated on every
  // re-render (for example on each slider drag tick). This relies on
  // `features` and `enabledFeatures` being replaced rather than mutated in place.
  const availableFeatures = useMemo(
    () => features.filter((f) => !enabledFeatures.has(f.name)),
    [features, enabledFeatures]
  );
  const enabledFeatureList = useMemo(
    () => features.filter((f) => enabledFeatures.has(f.name)),
    [features, enabledFeatures]
  );

  const containerRef = useRef<HTMLDivElement>(null);
  // Fraction of the panel height given to the "Active Filters" section.
  const [splitFraction, setSplitFraction] = useState(0.65);
  const draggingRef = useRef(false);

  const handleSeparatorPointerDown = useCallback((e: React.PointerEvent<HTMLElement>) => {
    e.preventDefault();
    // Capture on the separator itself (currentTarget), not on whichever child
    // was hit, so all follow-up pointer events are delivered to its handlers.
    e.currentTarget.setPointerCapture(e.pointerId);
    draggingRef.current = true;
  }, []);

  const handleSeparatorPointerMove = useCallback((e: React.PointerEvent) => {
    if (!draggingRef.current || !containerRef.current) return;
    const rect = containerRef.current.getBoundingClientRect();
    // A collapsed container would turn the division below into NaN/Infinity.
    if (rect.height <= 0) return;
    const y = e.clientY - rect.top;
    // Keep both sections usable: top section stays within 15%..80%.
    const fraction = Math.min(0.8, Math.max(0.15, y / rect.height));
    setSplitFraction(fraction);
  }, []);

  // Also wired to pointercancel / lostpointercapture: without that, an
  // interrupted drag leaves the flag set and later hover moves resize the panel.
  const handleSeparatorPointerUp = useCallback(() => {
    draggingRef.current = false;
  }, []);

  return (
    <div ref={containerRef} className="w-80 flex flex-col bg-white dark:bg-navy-950 shadow-lg overflow-hidden">
      {/* Top: Active filters — user-resizable, scrollable */}
      <div className="min-h-0 flex flex-col" style={{ height: `${splitFraction * 100}%` }}>
        {/* Active Filters header */}
        <div className="shrink-0 flex items-center justify-between px-3 py-2 border-b border-warm-200 dark:border-navy-700">
          <div className="flex items-center gap-2">
            <span className="text-sm font-semibold text-navy-950 dark:text-warm-100">Active Filters</span>
            {enabledFeatureList.length > 0 && (
              <span className="text-xs font-medium px-1.5 py-0.5 rounded-full bg-teal-50 dark:bg-teal-900/30 text-teal-600 dark:text-teal-400">
                {enabledFeatureList.length}
              </span>
            )}
          </div>
          <span className="text-xs text-warm-500 dark:text-warm-400">Zoom {zoom.toFixed(1)}</span>
        </div>
        <div className="flex-1 overflow-y-auto p-3 space-y-3">
          {enabledFeatureList.length === 0 && (
            <div className="flex flex-col items-center justify-center py-8 text-center">
              <svg className="w-8 h-8 text-warm-300 dark:text-warm-600 mb-2" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth={1.5}>
                <path strokeLinecap="round" strokeLinejoin="round" d="M12 3c2.755 0 5.455.232 8.083.678.533.09.917.556.917 1.096v1.044a2.25 2.25 0 01-.659 1.591l-5.432 5.432a2.25 2.25 0 00-.659 1.591v2.927a2.25 2.25 0 01-1.244 2.013L9.75 21v-6.568a2.25 2.25 0 00-.659-1.591L3.659 7.409A2.25 2.25 0 013 5.818V4.774c0-.54.384-1.006.917-1.096A48.32 48.32 0 0112 3z" />
              </svg>
              <span className="text-sm font-medium text-warm-400 dark:text-warm-500">No active filters</span>
              <span className="text-xs text-warm-400 dark:text-warm-500 mt-1">Browse features below and click + to add a filter</span>
            </div>
          )}
          {enabledFeatureList.map((feature) => {
            const isPinned = pinnedFeature === feature.name;

            if (feature.type === 'enum') {
              const selectedValues = (filters[feature.name] as string[]) || [];
              const allValues = feature.values || [];
              return (
                <div key={feature.name} className={`space-y-1 p-3 rounded ${isPinned ? 'ring-2 ring-teal-400 bg-teal-50/50 dark:bg-teal-900/20' : ''}`}>
                  <div className="flex items-center justify-between">
                    <Label>{feature.name}</Label>
                    <div className="flex items-center gap-0.5">
                      <button
                        onClick={() => onTogglePin(feature.name)}
                        className={`p-0.5 rounded ${isPinned ? 'text-teal-600 dark:text-teal-400' : 'text-warm-400 hover:text-warm-700 dark:hover:text-warm-300'}`}
                        title={isPinned ? 'Unpin color view' : 'Color map by this feature'}
                      >
                        <EyeIcon filled={isPinned} />
                      </button>
                      <button
                        onClick={() => onRemoveFilter(feature.name)}
                        className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 text-sm px-1"
                        title="Remove filter"
                      >
                        x
                      </button>
                    </div>
                  </div>
                  <div className="flex gap-2 text-sm mb-1">
                    <button
                      className="text-teal-600 dark:text-teal-400 hover:underline"
                      onClick={() => onFilterChange(feature.name, [...allValues])}
                    >
                      All
                    </button>
                    <button
                      className="text-teal-600 dark:text-teal-400 hover:underline"
                      onClick={() => onFilterChange(feature.name, [])}
                    >
                      None
                    </button>
                  </div>
                  <div className="space-y-0.5 max-h-40 overflow-y-auto">
                    {allValues.map((val) => (
                      <label key={val} className="flex items-center gap-1.5 text-sm cursor-pointer dark:text-warm-300">
                        <input
                          type="checkbox"
                          checked={selectedValues.includes(val)}
                          onChange={() => {
                            // Toggle membership without mutating the stored array.
                            const next = selectedValues.includes(val)
                              ? selectedValues.filter((v) => v !== val)
                              : [...selectedValues, val];
                            onFilterChange(feature.name, next);
                          }}
                          className="rounded accent-teal-600"
                        />
                        {val}
                      </label>
                    ))}
                  </div>
                </div>
              );
            }

            // Numeric feature
            const isActive = activeFeature === feature.name;
            // While this slider is being dragged, show the in-flight value;
            // otherwise the stored filter, falling back to the full range.
            const displayValue =
              isActive && dragValue
                ? dragValue
                : (filters[feature.name] as [number, number]) || [feature.min!, feature.max!];
            // Default to 100 steps across the range when no step is declared.
            const step = feature.step ?? (feature.max! - feature.min!) / 100;
            return (
              <div
                key={feature.name}
                className={`space-y-1 p-3 rounded ${isActive ? 'ring-2 ring-teal-400 bg-teal-50 dark:bg-teal-900/30' : isPinned ? 'ring-2 ring-teal-400 bg-teal-50/50 dark:bg-teal-900/20' : ''}`}
              >
                <div className="flex items-center justify-between">
                  <Label>
                    {feature.name}: {formatValue(displayValue[0])} - {formatValue(displayValue[1])}
                  </Label>
                  <div className="flex items-center gap-0.5">
                    <button
                      onClick={() => onTogglePin(feature.name)}
                      className={`p-0.5 rounded ${isPinned ? 'text-teal-600 dark:text-teal-400' : 'text-warm-400 hover:text-warm-700 dark:hover:text-warm-300'}`}
                      title={isPinned ? 'Unpin color view' : 'Color map by this feature'}
                    >
                      <EyeIcon filled={isPinned} />
                    </button>
                    <button
                      onClick={() => onRemoveFilter(feature.name)}
                      className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 text-sm px-1"
                      title="Remove filter"
                    >
                      x
                    </button>
                  </div>
                </div>
                <Slider
                  min={feature.min!}
                  max={feature.max!}
                  step={step}
                  value={[displayValue[0], displayValue[1]]}
                  onValueChange={([min, max]) => onDragChange([min, max])}
                  onPointerDown={() => onDragStart(feature.name)}
                  onPointerUp={() => onDragEnd()}
                />
              </div>
            );
          })}
        </div>
      </div>
      {/* Draggable separator */}
      <div
        className="shrink-0 h-1.5 cursor-row-resize flex items-center justify-center bg-warm-100 dark:bg-navy-800 hover:bg-warm-200 dark:hover:bg-navy-700 border-y border-warm-200 dark:border-navy-700"
        onPointerDown={handleSeparatorPointerDown}
        onPointerMove={handleSeparatorPointerMove}
        onPointerUp={handleSeparatorPointerUp}
        onPointerCancel={handleSeparatorPointerUp}
        onLostPointerCapture={handleSeparatorPointerUp}
      >
        <div className="w-8 h-0.5 rounded bg-warm-300 dark:bg-navy-600" />
      </div>
      {/* Bottom: Feature browser — fills remaining space */}
      <div className="min-h-0 flex-1 flex flex-col">
        <div className="shrink-0 px-3 py-2 border-b border-warm-200 dark:border-navy-700">
          <span className="text-sm font-semibold text-navy-950 dark:text-warm-100">Add Filter</span>
        </div>
        <div className="min-h-0 flex-1 flex flex-col">
          <FeatureBrowser
            availableFeatures={availableFeatures}
            allFeatures={features}
            pinnedFeature={pinnedFeature}
            onAddFilter={onAddFilter}
            onTogglePin={onTogglePin}
            onNavigateToSource={onNavigateToSource}
            openInfoFeature={openInfoFeature}
            onClearOpenInfoFeature={onClearOpenInfoFeature}
          />
        </div>
      </div>
    </div>
  );
});

View file

@ -0,0 +1,367 @@
import { useRef, useState, useEffect, useCallback } from 'react';
// --- Floating hex particle canvas that reacts to scroll ---
const HEX_COUNT = 60;
const TAU = Math.PI * 2;
/** State of one drifting background hexagon animated by HexCanvas. */
interface Hex {
// Current horizontal position; advanced by `speed` each frame and wrapped.
x: number;
// Current vertical position; recomputed every frame from `baseY`.
y: number;
// Vertical position assigned at creation; bob and scroll parallax are applied relative to it.
baseY: number;
// Radius passed to drawHex (centre-to-corner distance).
size: number;
// Base alpha; multiplied by the scroll fade (and a dark-theme factor) when drawn.
opacity: number;
speed: number; // horizontal drift px/s
phase: number; // for gentle bob
}
/**
 * Create HEX_COUNT hexagons with random position, size, opacity, drift speed
 * and bob phase.
 *
 * @param w - Width of the area x positions are drawn from.
 * @param h - Height of the area y positions are drawn from.
 * @returns A new array of hexagons; `y` and `baseY` start out equal.
 */
function initHexes(w: number, h: number): Hex[] {
  return Array.from({ length: HEX_COUNT }, (): Hex => {
    const startY = Math.random() * h;
    const startX = Math.random() * w;
    const radius = 8 + Math.random() * 20;
    const alpha = 0.06 + Math.random() * 0.12;
    const drift = 6 + Math.random() * 14;
    const bobPhase = Math.random() * TAU;
    return {
      x: startX,
      y: startY,
      baseY: startY,
      size: radius,
      opacity: alpha,
      speed: drift,
      phase: bobPhase,
    };
  });
}
/**
 * Trace a closed regular hexagon path on `ctx`, without filling or stroking it.
 *
 * Corners are placed every 60 degrees starting at -30 degrees, which gives a
 * hexagon with a corner at the top and bottom.
 *
 * @param ctx - 2D context that receives the path.
 * @param cx - Centre x.
 * @param cy - Centre y.
 * @param r - Distance from the centre to each corner.
 */
function drawHex(ctx: CanvasRenderingContext2D, cx: number, cy: number, r: number) {
  const corners = [0, 1, 2, 3, 4, 5].map((corner): [number, number] => {
    const angle = (TAU / 6) * corner - Math.PI / 6;
    return [cx + r * Math.cos(angle), cy + r * Math.sin(angle)];
  });
  const [start, ...others] = corners;

  ctx.beginPath();
  ctx.moveTo(start[0], start[1]);
  others.forEach(([px, py]) => {
    ctx.lineTo(px, py);
  });
  ctx.closePath();
}
/**
 * Decorative canvas of drifting hexagons that fades out as the page scrolls.
 *
 * The canvas is sized to its parent element (re-measured through a
 * ResizeObserver, which also re-seeds the hexagons) and animated with
 * requestAnimationFrame for the lifetime of the component.
 *
 * @param scrollProgress - Scroll position as a 0..1 fraction; the layer is
 *   fully transparent from 0.5 onwards.
 * @param isDark - Selects the darker palette and a lower overall alpha.
 */
function HexCanvas({ scrollProgress, isDark = false }: { scrollProgress: number; isDark?: boolean }) {
  const canvasRef = useRef<HTMLCanvasElement>(null);
  const hexesRef = useRef<Hex[]>([]);
  const animRef = useRef(0);
  // Props are mirrored into refs so the animation loop, which is set up once,
  // always reads the latest values without being restarted.
  const scrollRef = useRef(scrollProgress);
  scrollRef.current = scrollProgress;
  const isDarkRef = useRef(isDark);
  isDarkRef.current = isDark;

  useEffect(() => {
    const canvas = canvasRef.current;
    if (!canvas) return;
    const ctx = canvas.getContext('2d');
    if (!ctx) return;

    // Logical (CSS pixel) size of the drawing area.
    let w = 0;
    let h = 0;

    function resize() {
      const dpr = window.devicePixelRatio || 1;
      const rect = canvas!.parentElement!.getBoundingClientRect();
      w = rect.width;
      h = rect.height;
      // Back the canvas with device pixels, then scale so drawing code can
      // keep working in CSS pixels.
      canvas!.width = w * dpr;
      canvas!.height = h * dpr;
      canvas!.style.width = `${w}px`;
      canvas!.style.height = `${h}px`;
      ctx!.setTransform(dpr, 0, 0, dpr, 0, 0);
      hexesRef.current = initHexes(w, h);
    }

    resize();
    const ro = new ResizeObserver(resize);
    ro.observe(canvas.parentElement!);

    let prev = performance.now();

    function frame(now: number) {
      const dt = (now - prev) / 1000;
      prev = now;
      const scroll = scrollRef.current;

      ctx!.clearRect(0, 0, w, h);

      // Teal accent color, fade to 0 as user scrolls down
      const globalAlpha = Math.max(0, 1 - scroll * 2);
      const dark = isDarkRef.current;
      const themeAlpha = dark ? 0.6 : 1;

      for (const hex of hexesRef.current) {
        // drift right, wrap
        hex.x = (hex.x + hex.speed * dt) % (w + hex.size * 2);

        // gentle vertical bob + parallax push from scroll
        const bob = Math.sin(now / 1000 + hex.phase) * 8;
        const parallax = scroll * h * 0.3 * (hex.speed / 20);
        hex.y = hex.baseY + bob - parallax;

        // wrap vertically
        if (hex.y < -hex.size * 2) hex.y += h + hex.size * 4;
        if (hex.y > h + hex.size * 2) hex.y -= h + hex.size * 4;

        // Positions keep advancing, but nothing needs drawing once faded out.
        if (globalAlpha === 0) continue;

        // hex.x cycles through [0, w + 2*size). Drawing one radius to the left
        // maps that to [-size, w + size), so a wrapped hexagon re-enters from
        // fully off-screen instead of popping in half-visible at x = 0.
        const drawX = hex.x - hex.size;

        ctx!.globalAlpha = hex.opacity * globalAlpha * themeAlpha;
        ctx!.fillStyle = dark ? '#058172' : '#00a28c';
        drawHex(ctx!, drawX, hex.y, hex.size);
        ctx!.fill();

        ctx!.globalAlpha = hex.opacity * 0.5 * globalAlpha * themeAlpha;
        ctx!.strokeStyle = dark ? '#0a665b' : '#05c9aa';
        ctx!.lineWidth = 1;
        drawHex(ctx!, drawX, hex.y, hex.size);
        ctx!.stroke();
      }

      animRef.current = requestAnimationFrame(frame);
    }

    animRef.current = requestAnimationFrame(frame);

    return () => {
      cancelAnimationFrame(animRef.current);
      ro.disconnect();
    };
  }, []);

  return (
    <canvas
      ref={canvasRef}
      className="absolute inset-0 pointer-events-none"
      style={{ zIndex: 0 }}
    />
  );
}
// --- Fade-in hook ---
/**
 * Hook returning a ref for a `<div>` that should fade in once.
 *
 * The first time at least 15% of the element intersects the viewport, the
 * `fade-in-visible` class is added and the element stops being observed.
 * The observer is disconnected on unmount.
 */
function useFadeInRef() {
  const ref = useRef<HTMLDivElement>(null);

  useEffect(() => {
    const target = ref.current;
    if (!target) return;

    const reveal: IntersectionObserverCallback = (entries, self) => {
      const [entry] = entries;
      if (!entry.isIntersecting) return;
      target.classList.add('fade-in-visible');
      // One-shot: no need to keep watching after the reveal.
      self.unobserve(target);
    };

    const observer = new IntersectionObserver(reveal, { threshold: 0.15 });
    observer.observe(target);

    return () => observer.disconnect();
  }, []);

  return ref;
}
// --- Page ---
/**
 * Landing page: hero, old-way/new-way comparison, filter showcase, how-it-works
 * steps, headline numbers and a closing call to action, layered over the
 * animated HexCanvas background.
 *
 * @param onOpenDashboard - Called by both call-to-action buttons.
 * @param theme - 'light' (default) or 'dark'; selects the HexCanvas palette.
 */
export default function HomePage({ onOpenDashboard, theme = 'light' }: { onOpenDashboard: () => void; theme?: 'light' | 'dark' }) {
  const scrollRef = useRef<HTMLDivElement>(null);
  const [scrollProgress, setScrollProgress] = useState(0);

  // Publish the scroll position as a 0..1 fraction; nothing is reported while
  // the content does not overflow.
  const handleScroll = useCallback(() => {
    const container = scrollRef.current;
    if (!container) return;
    const scrollable = container.scrollHeight - container.clientHeight;
    if (scrollable > 0) {
      setScrollProgress(container.scrollTop / scrollable);
    }
  }, []);

  useEffect(() => {
    const container = scrollRef.current;
    if (!container) return;
    container.addEventListener('scroll', handleScroll, { passive: true });
    return () => {
      container.removeEventListener('scroll', handleScroll);
    };
  }, [handleScroll]);

  // One fade-in ref per page section, in page order.
  const heroRef = useFadeInRef();
  const problemRef = useFadeInRef();
  const filtersRef = useFadeInRef();
  const howRef = useFadeInRef();
  const numbersRef = useFadeInRef();
  const ctaRef = useFadeInRef();

  const darkTheme = theme === 'dark';

  return (
    <div ref={scrollRef} className="flex-1 overflow-y-auto bg-warm-50 dark:bg-navy-950 relative">
      <HexCanvas scrollProgress={scrollProgress} isDark={darkTheme} />
      <div className="relative" style={{ zIndex: 1 }}>
        {/* Hero */}
        <div className="max-w-3xl mx-auto px-6 pt-20 pb-24">
          <div
            ref={heroRef}
            className="fade-in-section backdrop-blur-sm bg-warm-50/60 dark:bg-navy-950/60 rounded-2xl p-8 -mx-2"
          >
            <p className="text-teal-600 font-semibold tracking-wide uppercase text-sm mb-4">
              Find where to live, not just what&apos;s for sale
            </p>
            <h1 className="text-5xl font-extrabold text-navy-950 dark:text-warm-100 mb-6 leading-[1.1] tracking-tight">
              Every neighbourhood
              <br />
              in England &amp; Wales.
              <br />
              <span className="text-teal-600">One map. Your&nbsp;rules.</span>
            </h1>
            <p className="text-xl text-warm-600 dark:text-warm-400 mb-8 leading-relaxed max-w-xl">
              Set the commute, budget, school rating, noise level, and crime threshold you&apos;ll
              accept. Narrowit shows you every area that qualifies &mdash; instantly.
            </p>
            <div className="flex items-center gap-4">
              <button
                onClick={onOpenDashboard}
                className="px-7 py-3.5 bg-coral-500 text-white rounded-lg font-semibold hover:bg-coral-600 transition-colors text-base shadow-lg shadow-coral-500/25"
              >
                Explore the map
              </button>
              <span className="text-warm-400 text-sm">
                No signup &middot; Free &middot; Open data
              </span>
            </div>
          </div>
        </div>
        {/* The flip */}
        <div className="max-w-3xl mx-auto px-6 pb-20">
          <div ref={problemRef} className="fade-in-section">
            <div className="rounded-2xl backdrop-blur-sm bg-warm-50/40 dark:bg-navy-800/40 border border-warm-200/50 dark:border-navy-700/50 p-8">
              <div className="grid md:grid-cols-2 gap-8">
                <div>
                  <h3 className="text-sm font-semibold text-warm-400 uppercase tracking-wide mb-2">
                    The old way
                  </h3>
                  <p className="text-warm-700 dark:text-warm-300 leading-relaxed">
                    Pick a postcode. Google the schools. Check crime stats on another site. Look up
                    commute times. Realise it&apos;s too expensive. Start over. Repeat 40 times.
                  </p>
                </div>
                <div>
                  <h3 className="text-sm font-semibold text-teal-600 uppercase tracking-wide mb-2">
                    With Narrowit
                  </h3>
                  <p className="text-warm-700 dark:text-warm-300 leading-relaxed">
                    Tell the map what you need. Every hexagon that lights up is a place worth
                    looking at. Drill into any one to see individual properties, prices, and energy
                    ratings.
                  </p>
                </div>
              </div>
            </div>
          </div>
        </div>
        {/* Filter showcase */}
        <div className="max-w-4xl mx-auto px-6 pb-20">
          <div ref={filtersRef} className="fade-in-section">
            <h2 className="text-3xl font-bold text-navy-950 dark:text-warm-100 mb-2 text-center">
              12 datasets. One slider&nbsp;each.
            </h2>
            <p className="text-warm-500 dark:text-warm-400 text-center mb-10 max-w-lg mx-auto">
              Every filter narrows the map in real time. Combine as many as you like.
            </p>
            <div className="grid grid-cols-2 md:grid-cols-4 gap-3">
              {FILTERS.map((filter) => (
                <div
                  key={filter.label}
                  className="rounded-xl bg-white dark:bg-navy-800 border border-warm-200 dark:border-navy-700 p-4 shadow-sm hover:shadow-md hover:border-teal-300 dark:hover:border-teal-600 transition-all"
                >
                  <div className="text-2xl mb-2">{filter.icon}</div>
                  <div className="font-semibold text-navy-950 dark:text-warm-100 text-sm">{filter.label}</div>
                  <div className="text-xs text-warm-500 dark:text-warm-400 mt-0.5">{filter.example}</div>
                </div>
              ))}
            </div>
          </div>
        </div>
        {/* How it works */}
        <div className="max-w-3xl mx-auto px-6 pb-20">
          <div ref={howRef} className="fade-in-section">
            <h2 className="text-3xl font-bold text-navy-950 dark:text-warm-100 mb-10 text-center">
              Three clicks to clarity
            </h2>
            <div className="space-y-6">
              {STEPS.map((step, position) => (
                <div key={position} className="flex gap-5 items-start">
                  <span className="shrink-0 w-10 h-10 rounded-full bg-teal-600 text-white flex items-center justify-center text-lg font-bold">
                    {position + 1}
                  </span>
                  <div>
                    <h3 className="font-semibold text-navy-950 dark:text-warm-100 text-lg">{step.title}</h3>
                    <p className="text-warm-600 dark:text-warm-400 mt-0.5">{step.body}</p>
                  </div>
                </div>
              ))}
            </div>
          </div>
        </div>
        {/* Numbers */}
        <div className="max-w-3xl mx-auto px-6 pb-20">
          <div ref={numbersRef} className="fade-in-section">
            <div className="grid grid-cols-3 gap-6 text-center">
              {STATS.map((stat) => (
                <div key={stat.label}>
                  <div className="text-3xl font-extrabold text-teal-600">{stat.value}</div>
                  <div className="text-sm text-warm-500 dark:text-warm-400 mt-1">{stat.label}</div>
                </div>
              ))}
            </div>
          </div>
        </div>
        {/* Final CTA */}
        <div className="max-w-3xl mx-auto px-6 pb-24">
          <div ref={ctaRef} className="fade-in-section text-center">
            <h2 className="text-3xl font-bold text-navy-950 dark:text-warm-100 mb-3">Ready to narrow it down?</h2>
            <p className="text-warm-500 dark:text-warm-400 mb-8 max-w-md mx-auto">
              100% open data. No account required. Just set your filters and go.
            </p>
            <button
              onClick={onOpenDashboard}
              className="px-8 py-4 bg-coral-500 text-white rounded-lg font-semibold hover:bg-coral-600 transition-colors text-lg shadow-lg shadow-coral-500/25"
            >
              Open the map
            </button>
          </div>
        </div>
      </div>
    </div>
  );
}
// --- Data ---
// Cards shown in the "Filter showcase" grid of HomePage. `label` doubles as the
// React key, so labels must stay unique. Icons are written as Unicode escapes
// (a pound sign, a lightning bolt, and emoji given as surrogate pairs).
const FILTERS = [
{ icon: '\u00A3', label: 'Sale price', example: 'e.g. under \u00A3400k' },
{ icon: '\uD83D\uDE86', label: 'Commute time', example: 'e.g. < 45 min to Bank' },
{ icon: '\uD83C\uDFEB', label: 'School quality', example: 'Ofsted Outstanding' },
{ icon: '\uD83D\uDEA8', label: 'Crime rate', example: 'Low burglary areas' },
{ icon: '\u26A1', label: 'Energy rating', example: 'EPC band A\u2013C' },
{ icon: '\uD83D\uDCCF', label: 'Floor area', example: 'e.g. 80+ sqm' },
{ icon: '\uD83D\uDD07', label: 'Road noise', example: 'Below 55 dB Lden' },
{ icon: '\uD83C\uDF10', label: 'Broadband speed', example: '100+ Mbps available' },
];
// Steps of the "How it works" section, rendered in array order; the visible
// step number is the array index plus one.
const STEPS = [
{
title: 'Add your deal-breakers',
body: 'Slide the filters for everything you care about \u2014 price cap, max commute, school quality, noise. The map updates as you drag.',
},
{
title: 'Spot the clusters',
body: 'Hexagons light up where properties match. Zoom in and they split into finer cells. At street level you see individual postcode boundaries.',
},
{
title: 'Dive into a neighbourhood',
body: 'Click any hexagon to see every property inside it \u2014 sale prices, floor plans, energy ratings, tenure. Layer on cafes, GP surgeries, and parks from OpenStreetMap.',
},
];
// Headline figures for the "Numbers" section, laid out in a three-column grid
// and keyed by `label`.
const STATS = [
{ value: '26M+', label: 'property records' },
{ value: '12', label: 'open datasets' },
{ value: '1.7M', label: 'postcodes mapped' },
];

View file

@ -1,88 +1,41 @@
import { useCallback, useRef, useEffect, useState, useMemo } from 'react';
import { Map as MapGL } from 'react-map-gl/maplibre';
import DeckGL from '@deck.gl/react';
import { useCallback, useRef, useEffect, useState, useMemo, memo } from 'react';
import { Map as MapGL, useControl } from 'react-map-gl/maplibre';
import type { MapRef } from 'react-map-gl/maplibre';
import { MapboxOverlay } from '@deck.gl/mapbox';
import { H3HexagonLayer } from '@deck.gl/geo-layers';
import { IconLayer } from '@deck.gl/layers';
import { IconLayer, TextLayer } from '@deck.gl/layers';
import type { PickingInfo } from '@deck.gl/core';
import 'maplibre-gl/dist/maplibre-gl.css';
import type { HexagonData, ViewState, ViewChangeParams, Bounds, POI } from '../types';
import type { HexagonData, ViewState, ViewChangeParams, Bounds, POI, FeatureMeta } from '../types';
/**
 * Props accepted by the map component.
 *
 * NOTE(review): the component body is not visible next to this declaration, so
 * the field notes below are inferred from names and types only — confirm them
 * against the implementation before relying on them.
 */
interface MapProps {
data: HexagonData[];
pois: POI[];
onViewChange: (params: ViewChangeParams) => void;
// Presumably the name of the feature the map is currently coloured by, or null for none — TODO confirm.
viewFeature: string | null;
// Presumably the [min, max] value range mapped onto the colour gradient — TODO confirm.
colorRange: [number, number] | null;
// Presumably the [min, max] filter range of `viewFeature` — TODO confirm.
filterRange: [number, number] | null;
// Presumably what put the map into colour view: a slider drag or the eye (pin) button — TODO confirm.
viewSource: 'drag' | 'eye' | null;
onCancelPin: () => void;
features: FeatureMeta[];
selectedHexagonId: string | null;
hoveredHexagonId: string | null;
// The argument is named `h3`, so hexagons appear to be identified by an H3 cell index — verify.
onHexagonClick: (h3: string) => void;
// Receives null as well as an id; presumably null means nothing is hovered — verify.
onHexagonHover: (h3: string | null) => void;
initialViewState?: ViewState;
theme?: 'light' | 'dark';
}
// Twemoji CDN base URL
const TWEMOJI_BASE = 'https://cdn.jsdelivr.net/gh/twitter/twemoji@14.0.2/assets/72x72/';
// Map category to Twemoji codepoint (emoji unicode -> hex)
const POI_EMOJI_CODES: Record<string, string> = {
// Schools
elementary_school: '1f3eb', // 🏫
school: '1f3eb',
high_school: '1f393', // 🎓
preschool: '1f476', // 👶
college_university: '1f393',
private_school: '1f3eb',
// Healthcare
doctor: '1f3e5', // 🏥
dentist: '1f9b7', // 🦷
pharmacy: '1f48a', // 💊
hospital: '1f3e5',
public_health_clinic: '1f3e5',
// Transport
train_station: '1f689', // 🚉
bus_station: '1f68c', // 🚌
metro_station: '1f687', // 🚇
light_rail_and_subway_stations: '1f687',
// Parks
park: '1f333', // 🌳
national_park: '1f3de', // 🏞
dog_park: '1f415', // 🐕
// Emergency
police_department: '1f694', // 🚔
fire_department: '1f692', // 🚒
// Supermarkets
supermarket: '1f6d2', // 🛒
grocery_store: '1f6d2',
convenience_store: '1f3ea', // 🏪
};
function getPOIIconUrl(category: string): string {
const code = POI_EMOJI_CODES[category] || '1f4cd'; // 📍 default
return `${TWEMOJI_BASE}${code}.png`;
}
// Tooltip emojis (these render fine in HTML)
const TOOLTIP_EMOJIS: Record<string, string> = {
elementary_school: '🏫',
school: '🏫',
high_school: '🎓',
preschool: '👶',
college_university: '🎓',
private_school: '🏫',
doctor: '👨‍⚕️',
dentist: '🦷',
pharmacy: '💊',
hospital: '🏥',
public_health_clinic: '🏥',
train_station: '🚉',
bus_station: '🚌',
metro_station: '🚇',
light_rail_and_subway_stations: '🚇',
park: '🌳',
national_park: '🏞️',
dog_park: '🐕',
police_department: '🚔',
fire_department: '🚒',
supermarket: '🛒',
grocery_store: '🛒',
convenience_store: '🏪',
};
function getTooltipEmoji(category: string): string {
return TOOLTIP_EMOJIS[category] || '📍';
/**
 * Build the Twemoji PNG URL for an emoji.
 *
 * Twemoji asset names are the emoji's code points in lowercase hex joined
 * with "-". The variation selector U+FE0F is dropped unless the sequence
 * contains a zero-width joiner (U+200D), in which case it is kept.
 * Using only the first code point would resolve multi-code-point emoji
 * (ZWJ sequences, keycaps, flags) to the wrong or a non-existent image.
 *
 * @param emoji - A single emoji, possibly made of several code points.
 * @returns URL of the matching PNG under TWEMOJI_BASE; the pin icon
 *   (1f4cd) when `emoji` yields no code points.
 */
function emojiToTwemojiUrl(emoji: string): string {
  const ZERO_WIDTH_JOINER = '\u200d';
  const DEFAULT_PIN = '1f4cd';

  const normalized = emoji.includes(ZERO_WIDTH_JOINER) ? emoji : emoji.replace(/\ufe0f/g, '');
  // Array.from iterates a string by code point, so surrogate pairs stay whole.
  const hex = Array.from(normalized, (symbol) => symbol.codePointAt(0)!.toString(16)).join('-');

  return `${TWEMOJI_BASE}${hex || DEFAULT_PIN}.png`;
}
const INITIAL_VIEW: ViewState = {
@ -92,61 +45,44 @@ const INITIAL_VIEW: ViewState = {
pitch: 0,
};
const MAP_STYLE = 'https://basemaps.cartocdn.com/gl/positron-gl-style/style.json';
const MAP_STYLE_LIGHT = 'https://basemaps.cartocdn.com/gl/voyager-gl-style/style.json';
const MAP_STYLE_DARK = 'https://basemaps.cartocdn.com/gl/dark-matter-gl-style/style.json';
interface ColorStop {
price: number;
color: [number, number, number];
}
// Continuous color scale from green (low) -> yellow -> red -> purple (high)
const COLOR_SCALE: ColorStop[] = [
{ price: 0, color: [46, 204, 113] }, // Green
{ price: 200000, color: [241, 196, 15] }, // Yellow
{ price: 400000, color: [231, 76, 60] }, // Red
{ price: 800000, color: [142, 68, 173] }, // Purple
// Gradient stops for normalized [0,1] values
const GRADIENT: { t: number; color: [number, number, number] }[] = [
{ t: 0, color: [46, 204, 113] }, // Green
{ t: 0.33, color: [241, 196, 15] }, // Yellow
{ t: 0.66, color: [231, 76, 60] }, // Red
{ t: 1, color: [142, 68, 173] }, // Purple
];
/**
 * Linearly blend two RGB colours.
 *
 * @param c1 Colour returned when t is 0.
 * @param c2 Colour returned when t is 1.
 * @param t  Blend factor; each channel is c1 + (c2 - c1) * t, rounded to the
 *           nearest integer. The factor is not clamped.
 * @returns The blended [r, g, b] triple.
 */
function interpolateColor(
  c1: [number, number, number],
  c2: [number, number, number],
  t: number
): [number, number, number] {
  const blend = (from: number, to: number): number => Math.round(from + (to - from) * t);
  const red = blend(c1[0], c2[0]);
  const green = blend(c1[1], c2[1]);
  const blue = blend(c1[2], c2[2]);
  return [red, green, blue];
}
function normalizedToColor(t: number): [number, number, number] {
if (t <= 0) return GRADIENT[0].color;
if (t >= 1) return GRADIENT[GRADIENT.length - 1].color;
function priceToColor(price: number | null | undefined): [number, number, number] {
if (price == null || isNaN(price)) return [128, 128, 128]; // Gray for missing data
// Clamp to scale range
if (price <= COLOR_SCALE[0].price) return COLOR_SCALE[0].color;
if (price >= COLOR_SCALE[COLOR_SCALE.length - 1].price) {
return COLOR_SCALE[COLOR_SCALE.length - 1].color;
}
// Find the two colors to interpolate between
for (let i = 0; i < COLOR_SCALE.length - 1; i++) {
const lower = COLOR_SCALE[i];
const upper = COLOR_SCALE[i + 1];
if (price >= lower.price && price <= upper.price) {
const t = (price - lower.price) / (upper.price - lower.price);
return interpolateColor(lower.color, upper.color, t);
for (let i = 0; i < GRADIENT.length - 1; i++) {
const lo = GRADIENT[i];
const hi = GRADIENT[i + 1];
if (t >= lo.t && t <= hi.t) {
const frac = (t - lo.t) / (hi.t - lo.t);
return [
Math.round(lo.color[0] + (hi.color[0] - lo.color[0]) * frac),
Math.round(lo.color[1] + (hi.color[1] - lo.color[1]) * frac),
Math.round(lo.color[2] + (hi.color[2] - lo.color[2]) * frac),
];
}
}
return COLOR_SCALE[COLOR_SCALE.length - 1].color;
return GRADIENT[GRADIENT.length - 1].color;
}
/**
 * Pick the hexagon grid resolution to request for a map zoom level
 * (presumably an H3 resolution, given the H3 hexagon layer it feeds).
 *
 * Thresholds are tested in ascending order so every branch is reachable;
 * a wider check placed first (e.g. `zoom < 8.5`) would shadow the narrower
 * ones below it, and a bare `return` in the middle would cut off the
 * finer resolutions.
 *
 * @param zoom Current map zoom level.
 * @returns Resolution from 5 (zoom < 6) up to 12 (zoom >= 15).
 */
function zoomToResolution(zoom: number): number {
  if (zoom < 6) return 5;
  if (zoom < 7) return 6;
  if (zoom < 8.5) return 7;
  if (zoom < 9.5) return 8;
  if (zoom < 11) return 9;
  if (zoom < 13) return 10;
  if (zoom < 15) return 11;
  return 12;
}
function getBoundsFromViewState(viewState: ViewState, width: number, height: number): Bounds {
@ -165,7 +101,6 @@ function getBoundsFromViewState(viewState: ViewState, width: number, height: num
const halfWidthDeg = (width / 2) * degreesPerPixelLng;
// Latitude uses Mercator projection (non-linear)
// Convert center lat to pixel y, offset by half height, convert back to lat
const latRad = (clampedLat * Math.PI) / 180;
const mercatorY = (1 - Math.log(Math.tan(latRad) + 1 / Math.cos(latRad)) / Math.PI) / 2;
const centerPixelY = mercatorY * worldSize;
@ -175,7 +110,7 @@ function getBoundsFromViewState(viewState: ViewState, width: number, height: num
// Convert pixel Y back to latitude
const pixelYToLat = (pixelY: number): number => {
const mercY = Math.max(0.001, Math.min(0.999, pixelY / worldSize)); // Clamp to avoid edge cases
const mercY = Math.max(0.001, Math.min(0.999, pixelY / worldSize));
const latRadians = Math.atan(Math.sinh(Math.PI * (1 - 2 * mercY)));
return (latRadians * 180) / Math.PI;
};
@ -193,9 +128,215 @@ interface Dimensions {
height: number;
}
export default function Map({ data, pois, onViewChange }: MapProps) {
/**
 * Attaches a deck.gl MapboxOverlay to the enclosing map as a control and
 * keeps its `layers` / `getTooltip` props in sync with React props.
 *
 * Renders nothing itself.
 *
 * @param layers     deck.gl layers to draw (interleaved with the basemap).
 * @param getTooltip deck.gl tooltip callback, or null to disable tooltips.
 */
function DeckOverlay({
  layers,
  getTooltip,
}: {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  layers: any[];
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  getTooltip: any;
}) {
  // Seed the overlay with the first render's props. The refs below start out
  // equal to those props, so the change check never fires on mount; without
  // this the overlay would stay empty until a prop identity changed.
  const overlay = useControl(() => new MapboxOverlay({ interleaved: true, layers, getTooltip }));

  const prevLayersRef = useRef(layers);
  const prevTooltipRef = useRef(getTooltip);

  // Push props only when an identity actually changed, so re-renders with
  // the same layer array do not trigger a deck.gl update.
  if (layers !== prevLayersRef.current || getTooltip !== prevTooltipRef.current) {
    prevLayersRef.current = layers;
    prevTooltipRef.current = getTooltip;
    overlay.setProps({ layers, getTooltip });
  }

  return null;
}
// Density colour ramp: light cyan → ocean blue → deep indigo
const DENSITY_GRADIENT: { t: number; color: [number, number, number] }[] = [
  { t: 0, color: [130, 234, 220] }, // Light cyan (few)
  { t: 0.5, color: [20, 140, 180] }, // Ocean blue (moderate)
  { t: 1, color: [88, 28, 140] }, // Deep indigo (many)
];

/**
 * Map a normalised density value onto the DENSITY_GRADIENT ramp.
 *
 * @param t Position on the ramp; values <= 0 give the first stop's colour,
 *          values >= 1 the last stop's colour.
 * @returns [r, g, b] linearly interpolated between the two stops that
 *          bracket `t`. If no pair of stops brackets `t` (e.g. NaN) the
 *          last stop's colour is returned.
 */
function countToColor(t: number): [number, number, number] {
  const firstStop = DENSITY_GRADIENT[0];
  const lastStop = DENSITY_GRADIENT[DENSITY_GRADIENT.length - 1];

  if (t <= 0) return firstStop.color;
  if (t >= 1) return lastStop.color;

  // Index of the upper stop of the first segment that contains t.
  const upperIdx = DENSITY_GRADIENT.findIndex(
    (stop, idx) => idx > 0 && t >= DENSITY_GRADIENT[idx - 1].t && t <= stop.t
  );
  if (upperIdx === -1) return lastStop.color;

  const from = DENSITY_GRADIENT[upperIdx - 1];
  const to = DENSITY_GRADIENT[upperIdx];
  const ratio = (t - from.t) / (to.t - from.t);
  const mix = (channel: number): number =>
    Math.round(from.color[channel] + (to.color[channel] - from.color[channel]) * ratio);

  return [mix(0), mix(1), mix(2)];
}
/**
 * Small search box overlaid on the map that looks a postcode up via the
 * postcodes.io API and asks the map to fly to it.
 *
 * @param onFlyTo Called with (latitude, longitude, zoom = 14) after a
 *                successful lookup that returned usable coordinates.
 */
function PostcodeSearch({
  onFlyTo,
}: {
  onFlyTo: (lat: number, lng: number, zoom: number) => void;
}) {
  const [query, setQuery] = useState('');
  const [error, setError] = useState<string | null>(null);
  const [loading, setLoading] = useState(false);

  const handleSubmit = useCallback(
    async (e: React.FormEvent) => {
      e.preventDefault();
      const trimmed = query.trim();
      if (!trimmed) return;

      setError(null);
      setLoading(true);
      try {
        const res = await fetch(
          `https://api.postcodes.io/postcodes/${encodeURIComponent(trimmed)}`
        );
        if (!res.ok) {
          setError('Postcode not found');
          return;
        }
        const json = await res.json();
        const result = json.status === 200 ? json.result : null;
        if (!result) {
          setError('Postcode not found');
          return;
        }
        // postcodes.io can return a valid postcode whose latitude/longitude
        // are null (no geolocation available). Passing those on would put
        // null into the map's view state, so report it instead.
        const { latitude, longitude } = result;
        if (!Number.isFinite(latitude) || !Number.isFinite(longitude)) {
          setError('No location available for this postcode');
          return;
        }
        onFlyTo(latitude, longitude, 14);
        setQuery('');
      } catch {
        // Network failure or a response body that is not valid JSON.
        setError('Lookup failed');
      } finally {
        setLoading(false);
      }
    },
    [query, onFlyTo]
  );

  return (
    <form onSubmit={handleSubmit} className="absolute top-3 left-3 z-10 flex flex-col gap-1">
      <div className="flex shadow-lg rounded overflow-hidden">
        <input
          type="text"
          value={query}
          onChange={(e) => {
            setQuery(e.target.value);
            // Typing again dismisses the previous error message.
            setError(null);
          }}
          placeholder="Search postcode..."
          aria-label="Search postcode"
          className="px-3 py-2 text-sm w-40 border-none outline-none bg-white dark:bg-navy-800 dark:text-warm-100 dark:placeholder-warm-500"
        />
        <button
          type="submit"
          disabled={loading}
          className="px-3 py-2 bg-teal-600 text-white text-sm hover:bg-teal-700 disabled:opacity-50"
        >
          {loading ? '...' : 'Go'}
        </button>
      </div>
      {error && (
        <span className="text-xs text-red-600 dark:text-red-400 bg-white/90 dark:bg-navy-800/90 rounded px-2 py-0.5 shadow">{error}</span>
      )}
    </form>
  );
}
/**
 * Legend card shown in the map's top-right corner: a title, an optional
 * "clear" button, a gradient bar and a label at each end of the bar.
 *
 * @param featureLabel Title text.
 * @param range        [low, high] numeric labels, used in 'feature' mode when
 *                     no enum values are supplied.
 * @param showCancel   Whether to render the clear (×) button.
 * @param onCancel     Click handler for the clear button.
 * @param mode         'density' shows the density ramp with Few/Many labels;
 *                     'feature' shows the green→purple ramp.
 * @param enumValues   When non-empty (feature mode), its first and last
 *                     entries are used as the end labels instead of `range`.
 */
function MapLegend({
  featureLabel,
  range,
  showCancel,
  onCancel,
  mode,
  enumValues,
}: {
  featureLabel: string;
  range: [number, number];
  showCancel: boolean;
  onCancel: () => void;
  mode: 'feature' | 'density';
  enumValues?: string[];
}) {
  // Compact number formatting: 1.2M / 3.4k / integers as-is / one decimal otherwise.
  const formatVal = (v: number) => {
    const magnitude = Math.abs(v);
    if (magnitude >= 1_000_000) return `${(v / 1_000_000).toFixed(1)}M`;
    if (magnitude >= 1_000) return `${(v / 1_000).toFixed(1)}k`;
    return Number.isInteger(v) ? v.toString() : v.toFixed(1);
  };

  const isDensity = mode === 'density';

  let gradientStyle: string;
  if (isDensity) {
    gradientStyle =
      'linear-gradient(to right, rgb(130, 234, 220), rgb(20, 140, 180), rgb(88, 28, 140))';
  } else {
    gradientStyle =
      'linear-gradient(to right, rgb(46, 204, 113), rgb(241, 196, 15), rgb(231, 76, 60), rgb(142, 68, 173))';
  }

  // Work out the two end labels up front so the markup below stays flat.
  let lowLabel: string;
  let highLabel: string;
  if (isDensity) {
    lowLabel = 'Few';
    highLabel = 'Many';
  } else if (enumValues && enumValues.length > 0) {
    lowLabel = enumValues[0];
    highLabel = enumValues[enumValues.length - 1];
  } else {
    lowLabel = formatVal(range[0]);
    highLabel = formatVal(range[1]);
  }

  return (
    <div className="absolute top-3 right-3 z-10 bg-white dark:bg-navy-800 dark:text-warm-200 rounded shadow-lg p-3 text-xs min-w-[160px]">
      <div className="flex items-center justify-between mb-2">
        <span className="font-semibold text-sm">{featureLabel}</span>
        {showCancel && (
          <button
            onClick={onCancel}
            className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 ml-2"
            title="Clear color view"
          >
            <svg
              className="w-4 h-4"
              fill="none"
              stroke="currentColor"
              viewBox="0 0 24 24"
              strokeWidth={2}
            >
              <path strokeLinecap="round" strokeLinejoin="round" d="M6 18L18 6M6 6l12 12" />
            </svg>
          </button>
        )}
      </div>
      <div className="h-3 rounded" style={{ background: gradientStyle }} />
      <div className="flex justify-between mt-1 text-warm-600 dark:text-warm-400">
        <span>{lowLabel}</span>
        <span>{highLabel}</span>
      </div>
    </div>
  );
}
export default memo(function Map({
data,
pois,
onViewChange,
viewFeature,
colorRange,
filterRange,
viewSource,
onCancelPin,
features,
selectedHexagonId,
hoveredHexagonId,
onHexagonClick,
onHexagonHover,
initialViewState,
theme = 'light',
}: MapProps) {
const containerRef = useRef<HTMLDivElement>(null);
const [viewState, setViewState] = useState<ViewState>(INITIAL_VIEW);
const [viewState, setViewState] = useState<ViewState>(initialViewState || INITIAL_VIEW);
const [dimensions, setDimensions] = useState<Dimensions>({ width: 0, height: 0 });
// Track container dimensions with ResizeObserver
@ -218,18 +359,69 @@ export default function Map({ data, pois, onViewChange }: MapProps) {
useEffect(() => {
if (dimensions.width === 0 || dimensions.height === 0) return;
const bounds = getBoundsFromViewState(viewState, dimensions.width, dimensions.height);
const raw = getBoundsFromViewState(viewState, dimensions.width, dimensions.height);
const resolution = zoomToResolution(viewState.zoom);
onViewChange({ resolution, bounds, zoom: viewState.zoom });
// Quantize bounds to 0.01° to reduce state churn and improve backend cache hits
const QUANT = 0.01;
const bounds: Bounds = {
south: Math.floor(raw.south / QUANT) * QUANT,
west: Math.floor(raw.west / QUANT) * QUANT,
north: Math.ceil(raw.north / QUANT) * QUANT,
east: Math.ceil(raw.east / QUANT) * QUANT,
};
onViewChange({
resolution,
bounds,
zoom: viewState.zoom,
latitude: viewState.latitude,
longitude: viewState.longitude,
});
}, [viewState, dimensions, onViewChange]);
const handleViewStateChange = useCallback((params: { viewState: unknown }) => {
const newViewState = params.viewState as ViewState;
setViewState(newViewState);
const handleMove = useCallback((evt: { viewState: ViewState }) => {
setViewState(evt.viewState);
}, []);
// Popup state for POI hover (using screen coordinates)
const handleFlyTo = useCallback((lat: number, lng: number, zoom: number) => {
setViewState((prev) => ({ ...prev, latitude: lat, longitude: lng, zoom }));
}, []);
const themeRef = useRef(theme);
themeRef.current = theme;
// Make place labels more legible over the colored hexagons
const handleMapLoad = useCallback(
(evt: { target: MapRef['getMap'] extends () => infer M ? M : never }) => {
const map = evt.target;
if (themeRef.current === 'light') {
for (const layer of map.getStyle().layers || []) {
if (layer.type !== 'symbol') continue;
map.setPaintProperty(layer.id, 'text-halo-color', 'rgba(255,255,255,1)');
map.setPaintProperty(layer.id, 'text-halo-width', 2);
map.setPaintProperty(layer.id, 'text-color', '#222');
}
// Make water more prominent
for (const layer of map.getStyle().layers || []) {
if (layer.id === 'water' || layer.id.startsWith('water')) {
map.setPaintProperty(layer.id, 'fill-color', '#6baed6');
}
}
}
try {
map.setLayoutProperty('building', 'visibility', 'none');
map.setLayoutProperty('building-top', 'visibility', 'none');
} catch {
// layers may not exist in dark style
}
},
[]
);
const mapStyle = theme === 'dark' ? MAP_STYLE_DARK : MAP_STYLE_LIGHT;
// Popup state for POI hover
const [popupInfo, setPopupInfo] = useState<{
x: number;
y: number;
@ -250,24 +442,149 @@ export default function Map({ data, pois, onViewChange }: MapProps) {
}
}, []);
const layers = useMemo(
() => [
// Compute count range for count-based coloring
const countRange = useMemo(() => {
if (data.length === 0) return { min: 0, max: 1 };
let min = Infinity;
let max = -Infinity;
for (const d of data) {
const c = d.count as number;
if (c < min) min = c;
if (c > max) max = c;
}
if (min === max) return { min, max: min + 1 };
return { min, max };
}, [data]);
// Memoize feature lookup to avoid new reference each render
const colorFeatureMeta = useMemo(
() => (viewFeature ? features.find((f) => f.name === viewFeature) || null : null),
[viewFeature, features]
);
// Use refs for values that change during drag so layers aren't recreated
const viewFeatureRef = useRef(viewFeature);
viewFeatureRef.current = viewFeature;
const colorRangeRef = useRef(colorRange);
colorRangeRef.current = colorRange;
const filterRangeRef = useRef(filterRange);
filterRangeRef.current = filterRange;
const colorFeatureMetaRef = useRef(colorFeatureMeta);
colorFeatureMetaRef.current = colorFeatureMeta;
const countRangeRef = useRef(countRange);
countRangeRef.current = countRange;
const selectedHexagonIdRef = useRef(selectedHexagonId);
selectedHexagonIdRef.current = selectedHexagonId;
const hoveredHexagonIdRef = useRef(hoveredHexagonId);
hoveredHexagonIdRef.current = hoveredHexagonId;
// Stable click handler using ref
const onHexagonClickRef = useRef(onHexagonClick);
onHexagonClickRef.current = onHexagonClick;
const handleHexagonClick = useCallback((info: PickingInfo<HexagonData>) => {
if (info.object && 'h3' in info.object) {
onHexagonClickRef.current(info.object.h3);
}
}, []);
// Stable hover handler using ref
const onHexagonHoverRef = useRef(onHexagonHover);
onHexagonHoverRef.current = onHexagonHover;
const handleHexagonHover = useCallback((info: PickingInfo<HexagonData>) => {
if (info.object && 'h3' in info.object) {
onHexagonHoverRef.current(info.object.h3);
} else {
onHexagonHoverRef.current(null);
}
}, []);
// Stable hover handler using ref
const handlePoiHoverRef = useRef(handlePoiHover);
handlePoiHoverRef.current = handlePoiHover;
const stablePoiHover = useCallback((info: PickingInfo<POI>) => {
handlePoiHoverRef.current(info);
}, []);
// Derive a trigger value from color-affecting state — avoids useEffect+setState double-render
const colorTrigger = `${viewFeature}|${colorRange?.[0]}|${colorRange?.[1]}|${filterRange?.[0]}|${filterRange?.[1]}|${countRange.min}|${countRange.max}|${selectedHexagonId}|${hoveredHexagonId}`;
// Hexagon layer — only recreated when data or color trigger changes
const hexLayer = useMemo(
() =>
new H3HexagonLayer<HexagonData>({
id: 'h3-hexagons',
data,
getHexagon: (d) => d.h3,
getFillColor: (d) => priceToColor(d.avg_price),
getFillColor: (d) => {
const vf = viewFeatureRef.current;
const clr = colorRangeRef.current;
const fr = filterRangeRef.current;
const cfm = colorFeatureMetaRef.current;
if (vf && clr && cfm) {
const val = d[`min_${vf}`];
if (val == null) return [128, 128, 128, 80] as [number, number, number, number];
// Gray out hexagons outside filter range
if (fr) {
const minVal = d[`min_${vf}`] as number;
const maxVal = d[`max_${vf}`] as number;
if (maxVal < fr[0] || minVal > fr[1]) {
return [180, 180, 180, 60] as [number, number, number, number];
}
}
// Color using full slider range
const range = clr[1] - clr[0];
if (range === 0) return [...GRADIENT[0].color, 200] as [number, number, number, number];
const t = ((val as number) - clr[0]) / range;
const rgb = normalizedToColor(Math.max(0, Math.min(1, t)));
return [...rgb, 200] as [number, number, number, number];
}
const cr = countRangeRef.current;
const c = d.count as number;
const t = (c - cr.min) / (cr.max - cr.min);
return [...countToColor(Math.max(0, Math.min(1, t))), 200] as [
number,
number,
number,
number,
];
},
getLineColor: (d) => {
if (d.h3 === selectedHexagonIdRef.current) return [255, 255, 255, 255] as [number, number, number, number];
if (d.h3 === hoveredHexagonIdRef.current) return [29, 228, 195, 200] as [number, number, number, number];
return [0, 0, 0, 0] as [number, number, number, number];
},
getLineWidth: (d) => {
if (d.h3 === selectedHexagonIdRef.current) return 3;
if (d.h3 === hoveredHexagonIdRef.current) return 2;
return 0;
},
lineWidthUnits: 'pixels',
updateTriggers: {
getFillColor: [colorTrigger],
getLineColor: [colorTrigger],
getLineWidth: [colorTrigger],
},
extruded: false,
pickable: true,
opacity: 0.5,
opacity: 1,
highPrecision: true,
onClick: handleHexagonClick,
onHover: handleHexagonHover,
// @ts-expect-error beforeId is a MapboxOverlay interleave prop, not typed in LayerProps
beforeId: 'waterway_label',
}),
[data, colorTrigger, handleHexagonClick, handleHexagonHover]
);
// POI layer — independent, only recreated when POI data changes
const poiLayer = useMemo(
() =>
new IconLayer<POI>({
id: 'poi-icons',
data: pois,
getPosition: (d) => [d.lng, d.lat],
getIcon: (d) => ({
url: getPOIIconUrl(d.category),
url: emojiToTwemojiUrl(d.emoji),
width: 72,
height: 72,
}),
@ -275,48 +592,89 @@ export default function Map({ data, pois, onViewChange }: MapProps) {
sizeMinPixels: 20,
sizeMaxPixels: 40,
pickable: true,
onHover: handlePoiHover,
onHover: stablePoiHover,
}),
],
[data, pois, handlePoiHover]
[pois, stablePoiHover]
);
// Postcode labels on high-res hexagons (resolution 11+, zoom >= 13)
const postcodeData = useMemo(
() => data.filter((d) => d.postcode && d.lat != null && d.lon != null),
[data]
);
// Tooltip for hexagons only (POIs use MapLibre popup)
const getTooltip = useCallback(({ object }: { object?: HexagonData }) => {
if (!object || !('h3' in object)) return null;
const showPostcodes = viewState.zoom >= 13;
const postcodeLayer = useMemo(
() =>
showPostcodes
? new TextLayer<HexagonData>({
id: 'postcode-labels',
data: postcodeData,
getPosition: (d) => [d.lon as number, d.lat as number],
getText: (d) => d.postcode as string,
getSize: 11,
getColor: theme === 'dark' ? [220, 220, 220, 220] : [30, 30, 30, 220],
getTextAnchor: 'middle',
getAlignmentBaseline: 'center',
fontFamily: 'Inter, system-ui, sans-serif',
fontWeight: 600,
outlineWidth: 2,
outlineColor: theme === 'dark' ? [30, 30, 30, 200] : [255, 255, 255, 200],
billboard: false,
sizeUnits: 'pixels',
sizeMinPixels: 10,
sizeMaxPixels: 14,
})
: null,
[postcodeData, showPostcodes, theme]
);
const hex = object as HexagonData;
return {
html: `<div style="padding: 8px; font-size: 14px;">
<strong>Avg: £${hex.avg_price?.toLocaleString() || 'N/A'}</strong>
<div style="color: #666; font-size: 12px;">
${hex.count} sales<br/>
Range: £${hex.min_price?.toLocaleString()} - £${hex.max_price?.toLocaleString()}
</div>
</div>`,
style: {
backgroundColor: 'white',
borderRadius: '4px',
boxShadow: '0 2px 4px rgba(0,0,0,0.2)',
},
};
}, []);
const layers = useMemo(
() => [hexLayer, poiLayer, ...(postcodeLayer ? [postcodeLayer] : [])],
[hexLayer, poiLayer, postcodeLayer]
);
return (
<div className="flex-1 h-full relative" ref={containerRef}>
<DeckGL
viewState={viewState}
controller
layers={layers}
onViewStateChange={handleViewStateChange as never}
getTooltip={getTooltip as never}
<MapGL
{...viewState}
onMove={handleMove}
onLoad={handleMapLoad as never}
mapStyle={mapStyle}
style={{ width: '100%', height: '100%' }}
attributionControl={false}
dragRotate={false}
touchZoomRotate={true}
touchPitch={false}
keyboard={true}
pitchWithRotate={false}
minZoom={5}
maxBounds={[-12, 49, 4, 62]}
>
<MapGL mapStyle={MAP_STYLE} />
</DeckGL>
<DeckOverlay layers={layers} getTooltip={null} />
</MapGL>
<PostcodeSearch onFlyTo={handleFlyTo} />
{viewFeature && colorRange && colorFeatureMeta ? (
<MapLegend
featureLabel={colorFeatureMeta.name}
range={colorRange}
showCancel={viewSource === 'eye'}
onCancel={onCancelPin}
mode="feature"
enumValues={colorFeatureMeta.type === 'enum' ? colorFeatureMeta.values : undefined}
/>
) : (
<MapLegend
featureLabel="Property density"
range={[0, 0]}
showCancel={false}
onCancel={onCancelPin}
mode="density"
/>
)}
{popupInfo && (
<div
className="absolute pointer-events-none bg-white rounded shadow-lg p-2 text-sm"
className="absolute pointer-events-none bg-white dark:bg-navy-800 rounded shadow-lg p-2 text-sm dark:text-warm-200"
style={{
left: popupInfo.x,
top: popupInfo.y - 40,
@ -324,14 +682,10 @@ export default function Map({ data, pois, onViewChange }: MapProps) {
zIndex: 9999,
}}
>
<strong>
{getTooltipEmoji(popupInfo.category)} {popupInfo.name}
</strong>
<div className="text-gray-500 text-xs">
{popupInfo.category.replace(/_/g, ' ')}
</div>
<strong>{popupInfo.name}</strong>
<div className="text-gray-500 dark:text-warm-400 text-xs">{popupInfo.category}</div>
</div>
)}
</div>
);
}
});

View file

@ -0,0 +1,297 @@
import { useState, useRef, useEffect, useCallback } from 'react';
import type { POICategoryGroup } from '../types';
// Props for the POIPane category picker.
interface POIPaneProps {
// Category groups listed in the dropdown; each group has a name and a list of category names.
groups: POICategoryGroup[];
// Category names that are currently ticked.
selectedCategories: Set<string>;
// Called with a new Set whenever the selection changes (the pane never mutates selectedCategories).
onCategoriesChange: (categories: Set<string>) => void;
// Number shown in the "N POIs visible" summary.
poiCount: number;
// Optional; when provided, the info popup shows a "View data source" button that calls this with 'osm-pois'.
onNavigateToSource?: (slug: string) => void;
}
/**
 * Side pane for choosing which point-of-interest categories are displayed.
 *
 * Shows a collapsible, searchable dropdown of category groups with per-group
 * and per-category checkboxes, a summary of the current selection, and an
 * info popup describing the data source. Selection state is owned by the
 * parent: every change is reported through `onCategoriesChange` with a new Set.
 *
 * @param groups              Category groups to list.
 * @param selectedCategories  Currently selected category names.
 * @param onCategoriesChange  Receives the next selection.
 * @param poiCount            Count displayed in the "POIs visible" summary.
 * @param onNavigateToSource  Optional; enables the "View data source" button.
 */
export default function POIPane({
  groups,
  selectedCategories,
  onCategoriesChange,
  poiCount,
  onNavigateToSource,
}: POIPaneProps) {
  const [dropdownOpen, setDropdownOpen] = useState(false);
  const [searchTerm, setSearchTerm] = useState('');
  const [collapsedGroups, setCollapsedGroups] = useState<Set<string>>(new Set());
  const [showInfo, setShowInfo] = useState(false);
  const dropdownRef = useRef<HTMLDivElement>(null);
  const infoPopupRef = useRef<HTMLDivElement>(null);

  // Close dropdown when clicking outside
  useEffect(() => {
    function handleClickOutside(event: MouseEvent) {
      if (dropdownRef.current && !dropdownRef.current.contains(event.target as Node)) {
        setDropdownOpen(false);
      }
    }
    document.addEventListener('mousedown', handleClickOutside);
    return () => document.removeEventListener('mousedown', handleClickOutside);
  }, []);

  // Close info popup when clicking outside (listener only attached while it is open)
  useEffect(() => {
    if (!showInfo) return;
    function handleClickOutside(e: MouseEvent) {
      if (infoPopupRef.current && !infoPopupRef.current.contains(e.target as Node)) {
        setShowInfo(false);
      }
    }
    document.addEventListener('mousedown', handleClickOutside);
    return () => document.removeEventListener('mousedown', handleClickOutside);
  }, [showInfo]);

  const allCategories = groups.flatMap((g) => g.categories);

  const toggleCategory = (category: string) => {
    const newSet = new Set(selectedCategories);
    if (newSet.has(category)) {
      newSet.delete(category);
    } else {
      newSet.add(category);
    }
    onCategoriesChange(newSet);
  };

  const selectAll = () => {
    onCategoriesChange(new Set(allCategories));
  };

  const selectNone = () => {
    onCategoriesChange(new Set());
  };

  // Toggle a group's checkbox. Operates on the categories that are actually
  // displayed for the group (i.e. after search filtering), because the
  // checkbox's checked/indeterminate state and the "x/y" counter are computed
  // from that same list. Acting on the full, unfiltered group would make a
  // click on a checked box select hidden categories instead of clearing the
  // visible ones.
  const toggleGroup = useCallback(
    (visibleCategories: string[]) => {
      const allSelected = visibleCategories.every((c) => selectedCategories.has(c));
      const newSet = new Set(selectedCategories);
      if (allSelected) {
        visibleCategories.forEach((c) => newSet.delete(c));
      } else {
        visibleCategories.forEach((c) => newSet.add(c));
      }
      onCategoriesChange(newSet);
    },
    [selectedCategories, onCategoriesChange]
  );

  const toggleCollapse = (groupName: string) => {
    setCollapsedGroups((prev) => {
      const next = new Set(prev);
      if (next.has(groupName)) {
        next.delete(groupName);
      } else {
        next.add(groupName);
      }
      return next;
    });
  };

  const lowerSearch = searchTerm.toLowerCase();

  // Filter groups and categories by search term. A group whose own name
  // matches is kept whole; otherwise only its matching categories are kept,
  // and groups with no match are dropped.
  const filteredGroups = groups
    .map((group) => {
      if (!searchTerm) return group;
      const matchingCats = group.categories.filter((c) => c.toLowerCase().includes(lowerSearch));
      const groupMatches = group.name.toLowerCase().includes(lowerSearch);
      if (groupMatches) return group;
      if (matchingCats.length === 0) return null;
      return { ...group, categories: matchingCats };
    })
    .filter(Boolean) as POICategoryGroup[];

  const selectedCount = selectedCategories.size;

  return (
    <div className="w-72 p-4 bg-white dark:bg-navy-950 shadow-lg space-y-4 overflow-y-auto max-h-screen">
      <div className="flex items-center gap-2">
        <h2 className="text-xl font-bold dark:text-warm-100">Points of Interest</h2>
        <button
          onClick={() => setShowInfo(true)}
          className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 p-0.5 rounded"
          title="Data source info"
        >
          <svg className="w-3.5 h-3.5" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth={2}>
            <circle cx="12" cy="12" r="10" />
            <path strokeLinecap="round" d="M12 16v-4m0-4h.01" />
          </svg>
        </button>
      </div>

      {showInfo && (
        <div className="fixed inset-0 z-50 flex items-center justify-center bg-black/30">
          <div
            ref={infoPopupRef}
            className="bg-white dark:bg-navy-800 border border-warm-200 dark:border-navy-700 rounded-lg shadow-xl max-w-md w-full mx-4 p-5"
          >
            <div className="flex items-start justify-between mb-3">
              <h3 className="text-sm font-semibold text-warm-900 dark:text-warm-100 pr-4">
                Points of Interest
              </h3>
              <button
                onClick={() => setShowInfo(false)}
                className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 shrink-0"
              >
                <svg className="w-4 h-4" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth={2}>
                  <path strokeLinecap="round" strokeLinejoin="round" d="M6 18L18 6M6 6l12 12" />
                </svg>
              </button>
            </div>
            <p className="text-sm text-warm-700 dark:text-warm-300 mb-4 leading-relaxed">
              Points of interest are sourced from OpenStreetMap via Geofabrik extracts.
              Categories include public transport stops, shops, restaurants, healthcare
              facilities, leisure venues, and more. Data is filtered and mapped to
              friendly names with exhaustive category coverage.
            </p>
            {onNavigateToSource && (
              <button
                onClick={() => {
                  onNavigateToSource('osm-pois');
                  setShowInfo(false);
                }}
                className="text-sm text-teal-600 dark:text-teal-400 hover:text-teal-800 dark:hover:text-teal-300 hover:underline"
              >
                View data source
              </button>
            )}
          </div>
        </div>
      )}

      <div className="space-y-2" ref={dropdownRef}>
        <button
          onClick={() => setDropdownOpen(!dropdownOpen)}
          className="w-full flex items-center justify-between px-3 py-2 text-sm border border-warm-300 dark:border-navy-700 rounded hover:border-warm-400 bg-white dark:bg-navy-800 dark:text-warm-200"
        >
          <span className="truncate text-left">
            {selectedCount === 0
              ? 'Select categories...'
              : selectedCount === allCategories.length
                ? 'All categories'
                : `${selectedCount} selected`}
          </span>
          <svg
            className={`w-4 h-4 ml-2 flex-shrink-0 transition-transform ${dropdownOpen ? 'rotate-180' : ''}`}
            fill="none"
            stroke="currentColor"
            viewBox="0 0 24 24"
          >
            <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 9l-7 7-7-7" />
          </svg>
        </button>

        {dropdownOpen && (
          <div className="border border-warm-300 dark:border-navy-700 rounded shadow-lg bg-white dark:bg-navy-800">
            <div className="flex gap-2 px-3 py-2 border-b border-warm-200 dark:border-navy-700">
              <button onClick={selectAll} className="text-xs text-teal-600 dark:text-teal-400 hover:text-teal-800 dark:hover:text-teal-300">
                All
              </button>
              <span className="text-xs text-warm-300 dark:text-warm-600">|</span>
              <button onClick={selectNone} className="text-xs text-teal-600 dark:text-teal-400 hover:text-teal-800 dark:hover:text-teal-300">
                None
              </button>
            </div>

            <div className="px-3 py-2 border-b border-warm-200 dark:border-navy-700">
              <input
                type="text"
                placeholder="Search categories..."
                value={searchTerm}
                onChange={(e) => setSearchTerm(e.target.value)}
                className="w-full px-2 py-1 text-sm border border-warm-300 dark:border-navy-700 rounded bg-white dark:bg-navy-950 dark:text-warm-200 dark:placeholder-warm-500"
              />
            </div>

            <div className="max-h-96 overflow-y-auto py-1">
              {filteredGroups.map((group) => {
                const groupSelected = group.categories.filter((c) =>
                  selectedCategories.has(c)
                ).length;
                const allInGroupSelected = groupSelected === group.categories.length;
                const someInGroupSelected = groupSelected > 0 && !allInGroupSelected;
                // While searching, groups are always expanded so matches are visible.
                const isCollapsed = collapsedGroups.has(group.name) && !searchTerm;

                return (
                  <div key={group.name}>
                    <div className="flex items-center gap-1 px-3 py-1.5 bg-warm-50 dark:bg-navy-950 border-y border-warm-100 dark:border-navy-700">
                      <button
                        onClick={() => toggleCollapse(group.name)}
                        className="p-0.5 text-warm-400 hover:text-warm-600"
                      >
                        <svg
                          className={`w-3 h-3 transition-transform ${isCollapsed ? '' : 'rotate-90'}`}
                          fill="none"
                          stroke="currentColor"
                          viewBox="0 0 24 24"
                        >
                          <path
                            strokeLinecap="round"
                            strokeLinejoin="round"
                            strokeWidth={2}
                            d="M9 5l7 7-7 7"
                          />
                        </svg>
                      </button>
                      <label className="flex items-center gap-2 flex-1 cursor-pointer">
                        <input
                          type="checkbox"
                          checked={allInGroupSelected}
                          ref={(el) => {
                            // `indeterminate` is a DOM property with no JSX attribute.
                            if (el) el.indeterminate = someInGroupSelected;
                          }}
                          onChange={() => toggleGroup(group.categories)}
                          className="rounded accent-teal-600"
                        />
                        <span className="text-xs font-semibold text-warm-700 dark:text-warm-300">{group.name}</span>
                      </label>
                      <span className="text-xs text-warm-400">
                        {groupSelected}/{group.categories.length}
                      </span>
                    </div>

                    {!isCollapsed &&
                      group.categories.map((category) => (
                        <label
                          key={category}
                          className="flex items-center gap-2 px-3 pl-8 py-1.5 hover:bg-warm-50 dark:hover:bg-navy-700 cursor-pointer dark:text-warm-300"
                        >
                          <input
                            type="checkbox"
                            checked={selectedCategories.has(category)}
                            onChange={() => toggleCategory(category)}
                            className="rounded accent-teal-600"
                          />
                          <span className="text-sm flex-1">{category}</span>
                        </label>
                      ))}
                  </div>
                );
              })}
            </div>
          </div>
        )}
      </div>

      {selectedCount > 0 && (
        <div className="p-3 bg-teal-50 dark:bg-teal-900/30 rounded text-sm">
          <div className="font-medium text-teal-900 dark:text-teal-300">
            {poiCount.toLocaleString()} POI{poiCount !== 1 ? 's' : ''} visible
          </div>
          <div className="text-xs text-teal-700 dark:text-teal-400 mt-1">
            {selectedCount} categor{selectedCount !== 1 ? 'ies' : 'y'} selected
          </div>
        </div>
      )}

      <div className="p-3 bg-warm-100 dark:bg-navy-800 rounded text-xs text-warm-600 dark:text-warm-400">
        <p>Select categories to display POIs on the map.</p>
        <p className="mt-2">Zoom in for better visibility of individual locations.</p>
      </div>
    </div>
  );
}

View file

@ -0,0 +1,316 @@
import React, { useMemo, useState, useRef, useEffect } from 'react';
import { Property } from '../types';
// Props for the PropertiesPane list of properties.
interface PropertiesPaneProps {
// Properties to list; the pane filters them by address/postcode search text and sorts them locally.
properties: Property[];
// NOTE(review): presumably the total number of matching properties (may exceed properties.length) — confirm against caller.
total: number;
// NOTE(review): presumably true while a fetch is in flight — usage is outside this view.
loading: boolean;
// Identifier of the hexagon being shown; when null the pane shows "Click a hexagon to view properties".
hexagonId: string | null;
// NOTE(review): presumably requests the next page of properties — usage is outside this view.
onLoadMore: () => void;
// NOTE(review): presumably dismisses the pane — usage is outside this view.
onClose: () => void;
// Optional navigation callback taking a data-source slug.
onNavigateToSource?: (slug: string) => void;
// When true, a "Preview" badge is shown next to the title.
isHoveredPreview?: boolean;
// Current state of the live-preview toggle.
hoverMode?: boolean;
// Optional; when provided, the toggle button is rendered and calls this with the negated hoverMode.
onHoverModeChange?: (enabled: boolean) => void;
}
type SortBy = 'price' | 'size' | 'energy';
/**
 * Side pane listing the properties loaded for the selected hexagon.
 *
 * Provides a client-side search box (address / postcode substring match), a
 * sort selector, an info dialog describing the data source, and a "Load More"
 * button that is shown while fewer than `total` properties are loaded.
 * Search and sort only apply to the already-loaded `properties` array.
 *
 * When `hexagonId` is null only a placeholder prompt is rendered.
 */
export function PropertiesPane({
  properties,
  total,
  loading,
  hexagonId,
  onLoadMore,
  onClose,
  onNavigateToSource,
  isHoveredPreview,
  hoverMode,
  onHoverModeChange,
}: PropertiesPaneProps) {
  const [sortBy, setSortBy] = useState<SortBy>('price');
  const [search, setSearch] = useState('');
  const [showInfo, setShowInfo] = useState(false);
  const infoPopupRef = useRef<HTMLDivElement>(null);

  // While the info dialog is open, close it on an outside click or on Escape.
  useEffect(() => {
    if (!showInfo) return;
    function handleClickOutside(e: MouseEvent) {
      if (infoPopupRef.current && !infoPopupRef.current.contains(e.target as Node)) {
        setShowInfo(false);
      }
    }
    function handleKeyDown(e: KeyboardEvent) {
      if (e.key === 'Escape') setShowInfo(false);
    }
    document.addEventListener('mousedown', handleClickOutside);
    document.addEventListener('keydown', handleKeyDown);
    return () => {
      document.removeEventListener('mousedown', handleClickOutside);
      document.removeEventListener('keydown', handleKeyDown);
    };
  }, [showInfo]);

  // Filter and sort properties
  const filteredAndSorted = useMemo(() => {
    const query = search.trim().toLowerCase();
    const filtered = query
      ? properties.filter((p) => {
          const addr = (p.address || '').toLowerCase();
          const pc = (p.postcode || '').toLowerCase();
          return addr.includes(query) || pc.includes(query);
        })
      : properties;

    // Read sort values through getNum with the same key fallbacks PropertyCard
    // uses for display, so the list is ordered by the numbers actually shown.
    // Missing (or NaN) values count as 0 and therefore sort last.
    const priceOf = (p: Property) => getNum(p, 'Last known price', 'latest_price') || 0;
    const areaOf = (p: Property) => getNum(p, 'Total floor area (sqm)', 'total_floor_area') || 0;

    // Copy before sorting: Array.prototype.sort mutates, and `filtered` may be the prop itself.
    return [...filtered].sort((a, b) => {
      switch (sortBy) {
        case 'price':
          return priceOf(b) - priceOf(a);
        case 'size':
          return areaOf(b) - areaOf(a);
        case 'energy':
          // Missing ratings are treated as 'Z' so they sort after every real rating letter.
          return (a.current_energy_rating || 'Z').localeCompare(b.current_energy_rating || 'Z');
      }
    });
  }, [properties, sortBy, search]);

  if (!hexagonId) {
    return (
      <div className="flex items-center justify-center h-full text-warm-500 dark:text-warm-400">
        Click a hexagon to view properties
      </div>
    );
  }

  return (
    <div className="flex flex-col h-full">
      {/* Header */}
      <div className="p-4 border-b border-warm-200 dark:border-navy-700">
        <div className="flex justify-between items-center">
          <div className="flex items-center gap-2">
            <h2 className="text-lg font-semibold dark:text-warm-100">Properties</h2>
            {isHoveredPreview && (
              <span className="text-xs px-1.5 py-0.5 rounded bg-teal-50 dark:bg-teal-900/30 text-teal-600 dark:text-teal-400">
                Preview
              </span>
            )}
            <button
              type="button"
              onClick={() => setShowInfo(true)}
              className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 p-0.5 rounded"
              title="Data source info"
              aria-label="Data source info"
            >
              <svg className="w-3.5 h-3.5" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth={2}>
                <circle cx="12" cy="12" r="10" />
                <path strokeLinecap="round" d="M12 16v-4m0-4h.01" />
              </svg>
            </button>
          </div>
          <div className="flex items-center gap-1">
            {onHoverModeChange && (
              <button
                type="button"
                onClick={() => onHoverModeChange(!hoverMode)}
                className={`p-1 rounded ${
                  hoverMode
                    ? 'text-teal-600 dark:text-teal-400 bg-teal-50 dark:bg-teal-900/30'
                    : 'text-warm-400 hover:text-warm-700 dark:hover:text-warm-300'
                }`}
                title={hoverMode ? 'Live preview on (click to lock)' : 'Live preview off (click to enable)'}
                aria-label="Toggle live preview"
                aria-pressed={!!hoverMode}
              >
                <svg className="w-4 h-4" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth={2}>
                  <path strokeLinecap="round" strokeLinejoin="round" d="M15 12a3 3 0 11-6 0 3 3 0 016 0z" />
                  <path strokeLinecap="round" strokeLinejoin="round" d="M2.458 12C3.732 7.943 7.523 5 12 5c4.478 0 8.268 2.943 9.542 7-1.274 4.057-5.064 7-9.542 7-4.477 0-8.268-2.943-9.542-7z" />
                </svg>
              </button>
            )}
            <button
              type="button"
              onClick={onClose}
              className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 p-1"
              aria-label="Close properties pane"
            >
              <svg className="w-4 h-4" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth={2}>
                <path strokeLinecap="round" strokeLinejoin="round" d="M6 18L18 6M6 6l12 12" />
              </svg>
            </button>
          </div>
        </div>
        <p className="text-sm text-warm-600 dark:text-warm-400">
          {search.trim()
            ? `${filteredAndSorted.length} match${filteredAndSorted.length !== 1 ? 'es' : ''} in ${properties.length} loaded`
            : `Showing ${properties.length} of ${total} properties`}
        </p>
        {showInfo && (
          <div className="fixed inset-0 z-50 flex items-center justify-center bg-black/30">
            <div
              ref={infoPopupRef}
              role="dialog"
              aria-modal="true"
              aria-label="Property Data"
              className="bg-white dark:bg-navy-800 border border-warm-200 dark:border-navy-700 rounded-lg shadow-xl max-w-md w-full mx-4 p-5"
            >
              <div className="flex items-start justify-between mb-3">
                <h3 className="text-sm font-semibold text-warm-900 dark:text-warm-100 pr-4">
                  Property Data
                </h3>
                <button
                  type="button"
                  onClick={() => setShowInfo(false)}
                  className="text-warm-400 hover:text-warm-700 dark:hover:text-warm-300 shrink-0"
                  aria-label="Close"
                >
                  <svg className="w-4 h-4" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth={2}>
                    <path strokeLinecap="round" strokeLinejoin="round" d="M6 18L18 6M6 6l12 12" />
                  </svg>
                </button>
              </div>
              <p className="text-sm text-warm-700 dark:text-warm-300 mb-4 leading-relaxed">
                Property data combines Energy Performance Certificates (EPC) with HM Land
                Registry Price Paid records, fuzzy-matched by address within each postcode.
                Includes floor area, energy ratings, construction age, and tenure from EPC
                surveys, plus the most recent sale price from the Land Registry.
              </p>
              {onNavigateToSource && (
                <button
                  type="button"
                  onClick={() => {
                    onNavigateToSource('epc');
                    setShowInfo(false);
                  }}
                  className="text-sm text-teal-600 dark:text-teal-400 hover:text-teal-800 dark:hover:text-teal-300 hover:underline"
                >
                  View data source
                </button>
              )}
            </div>
          </div>
        )}
      </div>
      {/* Search and sort controls */}
      <div className="p-2 border-b border-warm-200 dark:border-navy-700 space-y-2">
        <input
          type="text"
          value={search}
          onChange={(e) => setSearch(e.target.value)}
          placeholder="Search by address or postcode..."
          aria-label="Search by address or postcode"
          className="w-full p-2 border border-warm-300 dark:border-navy-700 rounded text-sm bg-white dark:bg-navy-800 dark:text-warm-200 placeholder-warm-400 dark:placeholder-warm-500"
        />
        <select
          value={sortBy}
          onChange={(e) => setSortBy(e.target.value as SortBy)}
          aria-label="Sort properties"
          className="w-full p-2 border border-warm-300 dark:border-navy-700 rounded text-sm bg-white dark:bg-navy-800 dark:text-warm-200"
        >
          <option value="price">Price (High to Low)</option>
          <option value="size">Size (Large to Small)</option>
          <option value="energy">Energy Rating (Best to Worst)</option>
        </select>
      </div>
      {/* Properties list */}
      <div className="flex-1 overflow-y-auto">
        {loading && properties.length === 0 ? (
          <div className="p-4 dark:text-warm-400">Loading...</div>
        ) : (
          <>
            {filteredAndSorted.map((property, idx) => (
              <PropertyCard key={idx} property={property} />
            ))}
            {properties.length < total && (
              <button
                type="button"
                onClick={onLoadMore}
                disabled={loading}
                className="w-full p-4 text-teal-600 dark:text-teal-400 hover:bg-teal-50 dark:hover:bg-teal-900/30 disabled:opacity-50"
              >
                {loading ? 'Loading...' : `Load More (${total - properties.length} remaining)`}
              </button>
            )}
          </>
        )}
      </div>
    </div>
  );
}
/**
 * Expand a single-letter tenure code into its display label.
 * 'F' becomes 'Freehold', 'L' becomes 'Leasehold'; any other value is returned unchanged.
 */
function formatDuration(d: string): string {
  switch (d) {
    case 'F':
      return 'Freehold';
    case 'L':
      return 'Leasehold';
    default:
      return d;
  }
}
/**
 * Format a construction-age value for display.
 * The value is rounded to the nearest integer; a leading "~" is added only
 * when `approximate` is true and the value is at least 1000
 * (presumably a calendar year; confirm against the data).
 */
function formatAge(value: number, approximate = true): string {
  const rounded = String(Math.round(value));
  const showTilde = approximate && value >= 1000;
  return showTilde ? `~${rounded}` : rounded;
}
/**
 * Look up the first of `keys` whose value on `property` is a number.
 * Keys are tried in the order given; values of any other type (string,
 * boolean, null, undefined) are skipped. Returns undefined when none match.
 */
function getNum(property: Property, ...keys: string[]): number | undefined {
  const match = keys.find((name) => typeof property[name] === 'number');
  return match === undefined ? undefined : (property[match] as number);
}
/**
 * Card for a single property: address, postcode, price (with price per m²
 * when available) and a two-column grid of whichever detail fields are present.
 */
function PropertyCard({ property }: { property: Property }) {
  // Number formatter: fixed decimals when requested, otherwise a rounded, locale-grouped integer.
  const fmt = (value: number | undefined, decimals = 0): string => {
    if (value === undefined) {
      return '';
    }
    if (decimals > 0) {
      return value.toFixed(decimals);
    }
    return Math.round(value).toLocaleString();
  };

  // Each numeric field is looked up under its display name first, then its snake_case name.
  const price = getNum(property, 'Last known price', 'latest_price');
  const pricePerSqm = getNum(property, 'Price per sqm', 'price_per_sqm');
  const floorArea = getNum(property, 'Total floor area (sqm)', 'total_floor_area');
  const rooms = getNum(property, 'Rooms (including bedrooms & bathrooms)', 'number_habitable_rooms');
  const age = getNum(property, 'Approximate construction age', 'construction_age_band');

  // Collect the detail rows in display order, skipping absent fields.
  const detailRows: Array<[string, React.ReactNode]> = [];
  if (property.property_type) {
    detailRows.push(['Type:', property.property_type]);
  }
  if (property.built_form) {
    detailRows.push(['Built form:', property.built_form]);
  }
  if (property.duration) {
    detailRows.push(['Tenure:', formatDuration(property.duration)]);
  }
  if (floorArea !== undefined) {
    detailRows.push(['Floor area:', `${fmt(floorArea)}m²`]);
  }
  if (rooms !== undefined) {
    detailRows.push(['Rooms:', fmt(rooms)]);
  }
  if (age !== undefined) {
    // Treated as approximate unless the record explicitly says otherwise.
    detailRows.push(['Built:', formatAge(age, property.is_construction_date_approximate ?? true)]);
  }
  if (property.current_energy_rating) {
    detailRows.push(['EPC rating:', property.current_energy_rating]);
  }
  if (property.potential_energy_rating) {
    detailRows.push(['EPC potential:', property.potential_energy_rating]);
  }

  return (
    <div className="p-4 border-b border-warm-100 dark:border-navy-800 hover:bg-warm-50 dark:hover:bg-navy-800">
      {/* Address & postcode */}
      <div className="font-semibold dark:text-warm-100">{property.address || 'Unknown Address'}</div>
      <div className="text-sm text-warm-600 dark:text-warm-400">{property.postcode}</div>
      {/* Price */}
      {price !== undefined && (
        <div className="mt-2 text-lg font-bold text-teal-700 dark:text-teal-400">
          £{fmt(price)}
          {pricePerSqm !== undefined && (
            <span className="text-sm font-normal text-warm-600 dark:text-warm-400"> (£{fmt(pricePerSqm)}/m²)</span>
          )}
        </div>
      )}
      {/* Property details grid */}
      <div className="mt-2 grid grid-cols-2 gap-x-4 gap-y-1 text-sm dark:text-warm-300">
        {detailRows.map(([label, value]) => (
          <div key={label}>
            <span className="text-warm-500 dark:text-warm-400">{label}</span> {value}
          </div>
        ))}
      </div>
    </div>
  );
}

View file

@ -7,6 +7,6 @@ interface LabelProps {
export function Label({ children, className }: LabelProps) {
return (
<label className={`text-sm font-medium text-slate-700 ${className || ''}`}>{children}</label>
<label className={`text-sm font-medium text-warm-700 dark:text-warm-300 ${className || ''}`}>{children}</label>
);
}

View file

@ -11,13 +11,13 @@ export function Slider({ className, ...props }: SliderProps) {
className={cn('relative flex w-full touch-none select-none items-center', className)}
{...props}
>
<SliderPrimitive.Track className="relative h-2 w-full grow overflow-hidden rounded-full bg-slate-200">
<SliderPrimitive.Range className="absolute h-full bg-slate-900" />
<SliderPrimitive.Track className="relative h-2 w-full grow overflow-hidden rounded-full bg-warm-200 dark:bg-navy-700">
<SliderPrimitive.Range className="absolute h-full bg-teal-600" />
</SliderPrimitive.Track>
{props.value?.map((_, i) => (
<SliderPrimitive.Thumb
key={i}
className="block h-5 w-5 rounded-full border-2 border-slate-900 bg-white ring-offset-white transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-slate-950 focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50"
className="block h-5 w-5 rounded-full border-2 border-teal-600 dark:border-teal-500 bg-white dark:bg-navy-800 ring-offset-white dark:ring-offset-navy-950 transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-teal-600 focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50"
/>
))}
</SliderPrimitive.Root>

View file

@ -9,3 +9,41 @@ body,
margin: 0;
padding: 0;
}
/* Dark theme root, active while <html> carries the `dark` class.
   `color-scheme: dark` asks the browser to draw its built-in UI
   (form controls, scrollbars) in dark variants as well. */
html.dark {
background-color: #0a0e1a;
color-scheme: dark;
}
/* Smooth theme transitions (scoped to avoid map performance issues).
   Only background, border and text colours are animated, and only on the
   element types listed here rather than on every element.
   NOTE(review): presumably this leaves the map's canvas/SVG layers untouched;
   confirm, since `div` is still a broad match. */
body,
div,
aside,
section,
header,
nav,
button,
input,
select,
label,
span,
p,
h1,
h2,
h3 {
transition: background-color 0.2s ease, border-color 0.2s ease, color 0.2s ease;
}
/* Fade-in animation for homepage sections.
   Elements start transparent and shifted 24px down; adding
   `.fade-in-visible` fades and slides them into place. */
.fade-in-section {
  opacity: 0;
  transform: translateY(24px);
  transition:
    opacity 0.6s ease-out,
    transform 0.6s ease-out;
}
.fade-in-visible {
  opacity: 1;
  transform: translateY(0);
}
/* Respect the user's reduced-motion preference: sections still appear when
   `.fade-in-visible` is added, but without the slide or fade animation. */
@media (prefers-reduced-motion: reduce) {
  .fade-in-section {
    transform: none;
    transition: none;
  }
}

View file

@ -3,7 +3,14 @@
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>UK Property Prices Map</title>
<title>Narrowit</title>
<script>
(function() {
  // Apply the saved dark theme before first paint so the page does not flash light.
  // Reading localStorage can throw (for example when the browser blocks storage
  // for this page); in that case keep the default light theme.
  try {
    if (localStorage.getItem('theme') === 'dark') {
      document.documentElement.classList.add('dark');
    }
  } catch (e) {
    // Storage unavailable: nothing to restore.
  }
})();
</script>
</head>
<body>
<div id="root"></div>

View file

@ -1,19 +0,0 @@
import type { Filters } from '../types';
// Filter configuration constants
// Should match backend pipeline/config.py
// Year range bounds and step.
export const YEAR_MIN = 1995;
export const YEAR_MAX = 2024;
export const YEAR_STEP = 1;
// Price range bounds and step, in pounds.
export const PRICE_MIN = 0;
export const PRICE_MAX = 5000000; // £5M max for slider, but no server-side cap
export const PRICE_STEP = 50000;
// Initial filter state: years 2020 through YEAR_MAX, and the full price range.
export const DEFAULT_FILTERS: Filters = {
minYear: 2020,
maxYear: YEAR_MAX,
minPrice: PRICE_MIN,
maxPrice: PRICE_MAX,
};

View file

@ -1,8 +1,31 @@
export interface Filters {
minYear: number;
maxYear: number;
minPrice: number;
maxPrice: number;
export interface FeatureMeta {
name: string;
type: 'numeric' | 'enum';
group?: string;
// Numeric-only fields
min?: number;
max?: number;
step?: number;
// Enum-only fields
values?: string[];
// Description fields
description?: string;
detail?: string;
source?: string;
}
export interface FeatureGroup {
name: string;
features: FeatureMeta[];
}
// Filters: feature name -> [selectedMin, selectedMax] for numeric, string[] for enum
export type FeatureFilters = Record<string, [number, number] | string[]>;
export interface HexagonData {
h3: string;
count: number;
[key: string]: string | number | null;
}
export interface Bounds {
@ -12,15 +35,6 @@ export interface Bounds {
east: number;
}
export interface HexagonData {
h3: string;
count: number;
avg_price: number;
median_price: number;
min_price: number;
max_price: number;
}
export interface ViewState {
longitude: number;
latitude: number;
@ -33,6 +47,8 @@ export interface ViewChangeParams {
resolution: number;
bounds: Bounds;
zoom: number;
latitude: number;
longitude: number;
}
export interface ApiResponse {
@ -43,21 +59,69 @@ export interface POI {
id: string;
name: string;
category: string;
group: string;
lat: number;
lng: number;
emoji: string;
}
export interface POIResponse {
features: POI[];
pois: POI[];
}
export const POI_CATEGORY_GROUPS = [
'schools',
'healthcare',
'transport',
'parks',
'emergency',
'supermarkets',
] as const;
export interface POICategoryGroup {
name: string;
categories: string[];
}
export type POICategoryGroup = (typeof POI_CATEGORY_GROUPS)[number];
export interface POICategoriesResponse {
groups: POICategoryGroup[];
}
export interface Property {
// String fields
address?: string;
postcode?: string;
property_type?: string;
built_form?: string;
duration?: string;
current_energy_rating?: string;
potential_energy_rating?: string;
// Numeric fields
lat: number;
lon: number;
is_construction_date_approximate?: boolean;
// All other numeric features (dynamic, including construction_age_band)
[key: string]: string | number | boolean | undefined;
}
export interface HexagonPropertiesResponse {
properties: Property[];
total: number;
limit: number;
offset: number;
truncated: boolean;
}
export interface NumericFeatureStats {
name: string;
count: number;
min: number;
max: number;
mean: number;
histogram: { min: number; max: number; bin_width: number; counts: number[] };
}
export interface EnumFeatureStats {
name: string;
counts: Record<string, number>;
}
export interface HexagonStatsResponse {
count: number;
numeric_features: NumericFeatureStats[];
enum_features: EnumFeatureStats[];
}

View file

@ -1,7 +1,54 @@
module.exports = {
darkMode: 'class',
content: ['./src/**/*.{js,jsx,ts,tsx,html}'],
theme: {
extend: {},
extend: {
colors: {
navy: {
50: '#eef1f8',
100: '#d9dff0',
200: '#b3bfe1',
300: '#8d9fd2',
400: '#677fc3',
500: '#4a63a8',
600: '#2a3f6b',
700: '#1e2d50',
800: '#141e38',
900: '#0f1528',
950: '#0a0e1a',
},
teal: {
50: '#effefb',
100: '#c7fff4',
200: '#90ffe9',
300: '#51f7d9',
400: '#1de4c3',
500: '#05c9aa',
600: '#00a28c',
700: '#058172',
800: '#0a665b',
900: '#0d544c',
950: '#003330',
},
coral: {
400: '#fb923c',
500: '#f97316',
600: '#ea580c',
},
warm: {
50: '#fafaf9',
100: '#f5f5f4',
200: '#e7e5e4',
300: '#d6d3d1',
400: '#a8a29e',
500: '#78716c',
600: '#57534e',
700: '#44403c',
800: '#292524',
900: '#1c1917',
},
},
},
},
plugins: [require('tailwindcss-animate')],
};

View file

@ -1,49 +0,0 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.12"
# dependencies = ["openapi-python-client"]
# ///
"""Regenerate the TfL Journey API client from the OpenAPI specification."""
# Run it with:
# uv run generate_tfl_client.py
import subprocess
from pathlib import Path
OPENAPI_SPEC = Path("Journey.yaml")
OUTPUT_PATH = Path("tfl_journey_client")
def main() -> None:
    """Generate the TfL Journey API client from the OpenAPI spec.

    Does nothing (apart from printing a message) when ``OUTPUT_PATH`` already
    exists.

    Raises:
        FileNotFoundError: If ``OPENAPI_SPEC`` does not exist.
        subprocess.CalledProcessError: If ``openapi-python-client`` exits with
            a non-zero status.
    """
    if not OPENAPI_SPEC.exists():
        raise FileNotFoundError(f"OpenAPI spec not found: {OPENAPI_SPEC}")

    # Skip if client already exists
    if OUTPUT_PATH.exists():
        print(f"TfL client already exists at {OUTPUT_PATH}, skipping")
        return

    # Generate the client
    print(f"Generating client from {OPENAPI_SPEC}")
    # check=True makes a non-zero exit raise CalledProcessError, so reaching the
    # line after this call already means success; no returncode test is needed.
    subprocess.run(
        [
            "openapi-python-client",
            "generate",
            "--path",
            str(OPENAPI_SPEC),
            "--output-path",
            str(OUTPUT_PATH),
        ],
        check=True,
    )
    print(f"Client generated successfully at {OUTPUT_PATH}")


if __name__ == "__main__":
    main()

View file

@ -1,6 +0,0 @@
def main():
    """Print the project greeting to stdout."""
    greeting = "Hello from property-map!"
    print(greeting)


if __name__ == "__main__":
    main()

View file

@ -1,22 +0,0 @@
from abc import ABC, abstractmethod
import polars as pl
class DataSource(ABC):
    """Abstract interface implemented by every pipeline data source."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Unique identifier for this data source."""
        ...

    @abstractmethod
    def load(self) -> pl.LazyFrame:
        """Load the raw data as a LazyFrame."""
        ...

    @abstractmethod
    def process(self, postcodes: pl.LazyFrame) -> pl.LazyFrame:
        """Process the data and join it with postcode coordinates."""
        ...

View file

@ -1,22 +0,0 @@
"""Shared configuration for the pipeline and server."""
from pathlib import Path
# Directory layout: `data_sources/` sits one level above this file's directory;
# processed outputs and their aggregates live in nested subdirectories.
DATA_DIR = Path(__file__).parent.parent / "data_sources"
PROCESSED_DIR = DATA_DIR / "processed"
AGGREGATES_DIR = PROCESSED_DIR / "aggregates"
# H3 resolutions to generate and serve
# https://h3geo.org/docs/core-library/restable/#average-area-in-m2
H3_RESOLUTIONS = [7, 8, 9, 10, 11]
# Must be one of H3_RESOLUTIONS.
DEFAULT_H3_RESOLUTION = 8
# Year filters
MIN_YEAR = 1995
MAX_YEAR = 2024
DEFAULT_MIN_YEAR = 2020
DEFAULT_MAX_YEAR = 2024
# Price filters
# NOTE(review): units are presumably pounds — confirm against the consumers.
DEFAULT_MIN_PRICE = 0
DEFAULT_MAX_PRICE = 100_000_000

View file

@ -0,0 +1,38 @@
import argparse
import tempfile
import polars as pl
from pathlib import Path
from pipeline.utils import download, extract_zip
URL = "https://www.arcgis.com/sharing/rest/content/items/077631e063eb4e1ab43575d01381ec33/data"
def convert_to_parquet(data_path: Path, parquet_path: Path) -> None:
    """Stream the extracted NSPL postcode CSV into a zstd-compressed parquet file.

    Args:
        data_path: Directory the downloaded archive was extracted into; the CSV
            is read from ``Data/NSPL_MAY_2025_UK.csv`` beneath it.
        parquet_path: Destination file; its parent directory is created if needed.
    """
    source_csv = data_path / "Data/NSPL_MAY_2025_UK.csv"
    lazy_frame = pl.scan_csv(source_csv, try_parse_dates=True)

    column_names = lazy_frame.collect_schema().names()
    print(f"Columns: {column_names}")

    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    # sink_parquet executes the lazy scan and writes the result without
    # collecting an in-memory DataFrame in this function.
    lazy_frame.sink_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")
def main() -> None:
    """CLI entry point: download the postcode archive, extract it, write parquet to --output."""
    cli = argparse.ArgumentParser(description="Download and convert ArcGIS postcode data")
    cli.add_argument("--output", type=Path, required=True, help="Output parquet file path")
    output_path = cli.parse_args().output

    # Download and extraction happen in a throwaway directory removed on exit.
    with tempfile.TemporaryDirectory() as cache_dir:
        workdir = Path(cache_dir)
        archive = workdir / "arcgis_data.zip"
        unpacked = workdir / "arcgis_extracted"

        download(URL, archive)
        extract_zip(archive, unpacked)
        convert_to_parquet(unpacked, output_path)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,62 @@
import argparse
import tempfile
import polars as pl
from pathlib import Path
from pipeline.utils import download, extract_zip
# Ofcom Connected Nations 2025 - Fixed broadband coverage (the nested 202507_fixed_pc_coverage_r01.zip archive is what gets converted)
# Source: https://www.ofcom.org.uk/phones-and-broadband/coverage-and-speeds/connected-nations-20252/data-downloads-2025
PERFORMANCE_URL = "https://www.ofcom.org.uk/siteassets/resources/documents/research-and-data/multi-sector/infrastructure-research/connected-nations-2025/202507_fixed_broadband_coverage_r01.zip?v=407830"
def convert_to_parquet(extract_dir: Path, parquet_path: Path) -> None:
    """Combine every CSV under ``extract_dir`` into one zstd-compressed parquet file.

    Files are read in sorted path order and concatenated with
    ``how="diagonal_relaxed"``.

    Args:
        extract_dir: Directory searched recursively for ``*.csv`` files.
        parquet_path: Destination file; its parent directory is created if needed.

    Raises:
        FileNotFoundError: If no CSV file is found under ``extract_dir``.
    """
    found = list(extract_dir.rglob("*.csv"))
    if len(found) == 0:
        raise FileNotFoundError(f"No CSV files found in {extract_dir}")
    print(f"Found {len(found)} CSV files: {[f.name for f in found]}")

    def _read(path: Path) -> pl.DataFrame:
        # Read one CSV, logging its name before and its shape after.
        print(f"Reading {path.name}...")
        frame = pl.read_csv(path, infer_schema_length=10000, encoding="utf8-lossy")
        print(f" Shape: {frame.shape}")
        return frame

    combined = pl.concat([_read(path) for path in sorted(found)], how="diagonal_relaxed")
    print(f"Combined shape: {combined.shape}")

    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    combined.write_parquet(parquet_path, compression="zstd")
    print(f"Saved {parquet_path} ({combined.shape[0]} rows)")
def main() -> None:
    """CLI entry point: download the Ofcom archive, unpack the nested zip, write parquet.

    The outer archive contains ``202507_fixed_coverage_r01/202507_fixed_pc_coverage_r01.zip``;
    that inner archive is extracted separately and its CSVs are converted to the
    file given by ``--output``.
    """
    parser = argparse.ArgumentParser(
        description="Download Ofcom broadband performance data"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    args = parser.parse_args()

    # Use the default self-cleaning temporary directory (as the sibling download
    # scripts do). The parquet is written to args.output, outside this directory,
    # so nothing needed survives in it and the downloads should not be left behind.
    with tempfile.TemporaryDirectory() as cache_dir:
        cache = Path(cache_dir)
        zip_path = cache / "broadband_performance.zip"
        extract_dir = cache / "extracted"
        extracted_again_dir = cache / "extracted-again"

        download(PERFORMANCE_URL, zip_path)
        extract_zip(zip_path, extract_dir)

        inner_dir = extract_dir / "202507_fixed_coverage_r01"
        # Log the outer archive's contents to make a renamed inner zip easy to spot.
        print(list(inner_dir.glob("*")))
        extract_zip(inner_dir / "202507_fixed_pc_coverage_r01.zip", extracted_again_dir)

        convert_to_parquet(extracted_again_dir, args.output)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,43 @@
import argparse
import tempfile
import polars as pl
from pathlib import Path
from pipeline.utils import download
URL = "https://assets.publishing.service.gov.uk/media/691ded34513046b952c500bd/File_5_IoD2025_Scores_for_the_Indices_of_Deprivation.xlsx"
def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
    """Convert the second worksheet of the IoD 2025 workbook to parquet.

    Args:
        xlsx_path: Path to the downloaded ``.xlsx`` workbook.
        parquet_path: Destination file; its parent directory is created if needed.
    """
    print("Reading Excel file (sheet 2)...")

    # Read the 2nd sheet (index 1) - IoD2025 Scores
    df = pl.read_excel(
        xlsx_path,
        sheet_id=2,  # 1-indexed, so 2 = second sheet
    )
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns}")

    # Create the output directory first, as the other download scripts do, so a
    # fresh checkout does not fail on a missing parent directory.
    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")
def main() -> None:
    """CLI entry point: download the IoD 2025 workbook and convert it to --output."""
    cli = argparse.ArgumentParser(description="Download and convert Index of Deprivation data")
    cli.add_argument("--output", type=Path, required=True, help="Output parquet file path")
    output_path = cli.parse_args().output

    # The workbook only needs to exist for the duration of the conversion.
    with tempfile.TemporaryDirectory() as cache_dir:
        workbook = Path(cache_dir) / "IoD2025_Scores.xlsx"
        download(URL, workbook, timeout=60)
        convert_to_parquet(workbook, output_path)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,58 @@
import argparse
from pathlib import Path
import httpx
import polars as pl
pl.Config.set_tbl_cols(-1)
URL = "https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/downloads/population-by-ethnicity-and-local-authority-2021.csv"
def download_and_convert(output_path: Path) -> None:
    """Download the ethnicity-by-local-authority CSV and write a wide parquet table.

    Rows are filtered to the "ONS 2021 5+1" ethnicity grouping (excluding the
    "All" totals) and pivoted so each ``Geography_code`` becomes one row with a
    ``% <ethnicity>`` column per category, populated from ``Value1``.

    Args:
        output_path: Destination parquet file; its parent directory is created
            if needed.

    Raises:
        httpx.HTTPStatusError: If the download returns an error status.
    """
    print("Downloading ethnicity data...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()

    df = pl.read_csv(response.content)
    # Log the shape itself; this line previously printed the first 100 rows
    # under the "Raw shape" label.
    print(f"Raw shape: {df.shape}")

    # Keep only broad ethnicity categories (5+1), exclude "All" totals
    df = df.filter(
        (pl.col("Ethnicity_type") == "ONS 2021 5+1") & (pl.col("Ethnicity") != "All")
    )

    # Pivot: one row per local authority, columns = ethnicity percentages
    wide = df.pivot(
        on="Ethnicity",
        index="Geography_code",
        values="Value1",
    )

    # Rename columns to be descriptive
    rename_map = {
        col: f"% {col}" for col in wide.columns if col != "Geography_code"
    }
    wide = wide.rename(rename_map)
    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")

    # Create the output directory first, as the other download scripts do.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
def main() -> None:
    """CLI entry point: parse --output and run the download and conversion."""
    cli = argparse.ArgumentParser(
        description="Download and convert ethnicity by local authority data"
    )
    cli.add_argument("--output", type=Path, required=True, help="Output parquet file path")
    output_path = cli.parse_args().output
    download_and_convert(output_path)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,69 @@
"""Download NaPTAN data and extract public-transport stop POIs (airports, ferries, rail, bus, taxi ranks, metro/tram)."""
import argparse
import io
import urllib.request
from pathlib import Path
import polars as pl
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
# NaPTAN StopType codes that are kept, mapped to the category label written
# to the output. Rows with any other StopType are dropped.
STOP_TYPES = {
    "AIR": "Airport",
    "FTD": "Ferry",
    "RSE": "Rail station",
    "BCT": "Bus stop",
    "BCE": "Bus station",
    "TXR": "Taxi rank",
    "TMU": "Metro or Tram stop",
}
def download_naptan(output: Path, timeout: float = 120.0) -> None:
    """Download the NaPTAN access-node CSV and write the selected stops as parquet.

    Keeps rows whose ``StopType`` is a key of ``STOP_TYPES`` and whose
    latitude/longitude parse as floats, and writes the columns
    ``id``, ``name``, ``category``, ``lat``, ``lng``.

    Args:
        output: Destination parquet file; its parent directory is created if needed.
        timeout: Seconds to wait on the network before giving up. Without a
            timeout ``urlopen`` can block indefinitely on a stalled connection.

    Raises:
        urllib.error.URLError: If the download fails or times out while connecting.
    """
    output.parent.mkdir(parents=True, exist_ok=True)

    print(f"Downloading NaPTAN data from {NAPTAN_CSV_URL}")
    with urllib.request.urlopen(NAPTAN_CSV_URL, timeout=timeout) as resp:
        raw = resp.read()
    print(f"Downloaded {len(raw) / (1024 * 1024):.1f} MB")

    df = (
        # infer_schema_length=0 reads every column as a string; coordinates are
        # then cast explicitly, with unparseable values becoming null.
        pl.read_csv(io.BytesIO(raw), infer_schema_length=0)
        .with_columns(
            pl.col("Latitude").cast(pl.Float64, strict=False),
            pl.col("Longitude").cast(pl.Float64, strict=False),
        )
        .drop_nulls(subset=["Latitude", "Longitude"])
        .filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
        .select(
            pl.col("ATCOCode").alias("id"),
            pl.col("CommonName").alias("name"),
            pl.col("StopType").replace(STOP_TYPES).alias("category"),
            pl.col("Latitude").alias("lat"),
            pl.col("Longitude").alias("lng"),
        )
    )

    df.write_parquet(output)
    size_mb = output.stat().st_size / (1024 * 1024)
    print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")

    # Per-category row counts, largest first.
    counts = df.group_by("category").len().sort("len", descending=True)
    for row in counts.iter_rows(named=True):
        print(f" {row['category']}: {row['len']:,}")
def main() -> None:
    """CLI entry point: parse --output and download the NaPTAN stops into it."""
    cli = argparse.ArgumentParser(description="Download NaPTAN station data")
    cli.add_argument("--output", type=Path, required=True, help="Output parquet file path")
    output_path = cli.parse_args().output
    download_naptan(output_path)


if __name__ == "__main__":
    main()

283
pipeline/download/noise.py Normal file
View file

@ -0,0 +1,283 @@
"""Download Defra Round 4 (2022) strategic noise data for England.
Downloads modelled noise levels (road, rail, airport) as GeoTIFF rasters via
WCS, then samples noise values at postcode centroids. Outputs a parquet file
with postcode-level noise in dB for each source.
Uses 100km tiles (~42 per source) to balance request size vs count. The server
times out on tiles larger than ~150km at 100m resolution.
Data source: Defra Strategic Noise Mapping Round 4 (2022)
- Lden = day-evening-night 24h weighted average (the EU standard metric)
- 10m grid, modelled at 4m above ground
License: Open Government Licence v3.0
Note: Road/rail use WCS 1.0.0; airport requires WCS 2.0.1 (Defra's 1.0.0
endpoint is broken for that coverage).
"""
import argparse
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import httpx
import numpy as np
import polars as pl
import rasterio
from pyproj import Transformer
from rasterio.merge import merge
from rasterio.transform import rowcol
# Noise sources: (label, column_name, WCS base URL, coverage ID, WCS version)
# Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1.
# main() unpacks each tuple in this order; the label is used in log prefixes
# and, lower-cased, as the name of the per-source tile directory, while
# column_name becomes the output parquet column for that source.
NOISE_SOURCES = [
(
"Road",
"road_noise_lden_db",
"https://environment.data.gov.uk/spatialdata/road-noise-all-metrics-england-round-4/wcs",
"Road_Noise_Lden_England_Round_4_All",
"1.0.0",
),
(
"Rail",
"rail_noise_lden_db",
"https://environment.data.gov.uk/spatialdata/noise-data/wcs",
"Rail_Noise_Lden_England_Round_4_All",
"1.0.0",
),
(
"Airport",
"airport_noise_lden_db",
"https://environment.data.gov.uk/spatialdata/airport-noise-all-metrics-england-round-4/wcs",
"dac9cba4-abe7-43bd-b8e9-8a83da52edd8__Airport_Noise_ALL_Lden",
"2.0.1",
),
]
# England extent in EPSG:27700 (British National Grid), rounded outward
# Values are in metres. Stepping this extent by TILE_SIZE yields 6 easting
# columns x 7 northing rows = 42 tiles per source; the last column and row are
# clipped to the extent, so they are narrower than TILE_SIZE.
BNG_MIN_E = 80_000
BNG_MAX_E = 660_000
BNG_MIN_N = 0
BNG_MAX_N = 660_000
# Tile size in metres (100km balances request size vs count; 300km causes 504s)
TILE_SIZE = 100_000
# Max concurrent tile downloads
MAX_WORKERS = 4
# Native raster resolution (10m grid)
# Only used to derive the WCS 2.0.1 scaleFactor (NATIVE_RESOLUTION / RESOLUTION).
NATIVE_RESOLUTION = 10
# Request pixel resolution in metres (100m is sufficient for postcode-level data
# and keeps download size ~100x smaller than native 10m)
RESOLUTION = 100
def _wcs_get_coverage_url(
    wcs_base: str,
    coverage_id: str,
    min_e: int,
    min_n: int,
    max_e: int,
    max_n: int,
    wcs_version: str = "1.0.0",
) -> str:
    """Build a WCS GetCoverage URL for a British National Grid bounding box.

    Args:
        wcs_base: WCS endpoint URL without a query string.
        coverage_id: Coverage identifier to request.
        min_e, min_n, max_e, max_n: Bounding box in EPSG:27700 metres.
        wcs_version: ``"2.0.1"`` selects the 2.0.1 request form (subset axes
            plus a scale factor); any other value produces a 1.0.0 request
            with an explicit pixel width and height.

    Returns:
        The full request URL. Parameter values are inserted verbatim, without
        URL-encoding.
    """
    if wcs_version == "2.0.1":
        # 2.0.1 has no width/height; downsample via the native-to-requested ratio.
        scale = NATIVE_RESOLUTION / RESOLUTION
        query = [
            "service=WCS",
            "version=2.0.1",
            "request=GetCoverage",
            f"coverageId={coverage_id}",
            "format=image/tiff",
            "subsettingCRS=EPSG:27700",
            f"subset=E({min_e},{max_e})",
            f"subset=N({min_n},{max_n})",
            f"scaleFactor={scale}",
        ]
    else:
        # 1.0.0 takes the output raster size in pixels.
        width = (max_e - min_e) // RESOLUTION
        height = (max_n - min_n) // RESOLUTION
        query = [
            "service=WCS",
            "version=1.0.0",
            "request=GetCoverage",
            f"coverage={coverage_id}",
            "CRS=EPSG:27700",
            f"BBOX={min_e},{min_n},{max_e},{max_n}",
            f"width={width}",
            f"height={height}",
            "format=GeoTIFF",
        ]
    return f"{wcs_base}?" + "&".join(query)
# Shared WGS84 -> British National Grid transformer. always_xy=True fixes the
# argument order to (x, y) = (lon, lat) regardless of the CRS axis definition.
_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)


def _bng_from_latlon(lat: np.ndarray, lon: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Convert WGS84 latitude/longitude to BNG ``(easting, northing)``."""
    # Note the swapped order: the transformer expects x (lon) before y (lat).
    easting, northing = _TO_BNG.transform(lon, lat)
    return easting, northing
def _download_tile(
    wcs_base: str,
    coverage_id: str,
    min_e: int,
    min_n: int,
    max_e: int,
    max_n: int,
    tile_path: Path,
    wcs_version: str = "1.0.0",
) -> Path | None:
    """Download a single WCS tile. Returns path if successful, None otherwise.

    Args:
        wcs_base: WCS endpoint URL.
        coverage_id: Coverage identifier to request.
        min_e, min_n, max_e, max_n: Tile bounding box in EPSG:27700 metres.
        tile_path: File the GeoTIFF bytes are written to.
        wcs_version: WCS protocol version used to build the request.

    Returns:
        ``tile_path`` when a TIFF was received and saved. ``None`` when the
        response is not a TIFF (skipped silently) or when the request fails
        (logged); a single bad tile never raises.
    """
    url = _wcs_get_coverage_url(wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version)
    try:
        with httpx.Client(timeout=300, follow_redirects=True) as client:
            resp = client.get(url)
            resp.raise_for_status()

            # Accept the response if either the Content-Type mentions TIFF or
            # the body starts with a TIFF byte-order marker (little/big endian).
            content_type = resp.headers.get("content-type", "")
            if "tiff" not in content_type and resp.content[:4] not in (b"II*\x00", b"MM\x00*"):
                return None

            tile_path.write_bytes(resp.content)
            return tile_path
    except httpx.HTTPError as e:
        # httpx.HTTPError is the common base of HTTPStatusError and every
        # transport error (timeouts, connect/read failures, protocol errors),
        # so a dropped connection mid-read skips this tile instead of
        # propagating out of the worker and aborting the whole download.
        print(f" Failed to download tile ({min_e},{min_n})-({max_e},{max_n}): {e}")
        return None
def download_raster(
    tile_dir: Path, wcs_base: str, coverage_id: str, label: str, wcs_version: str = "1.0.0"
) -> list[Path]:
    """Download the GeoTIFF tiles covering the England extent for one noise source.

    The extent is split into ``TILE_SIZE`` squares (clipped at the extent's far
    edges) and fetched with up to ``MAX_WORKERS`` concurrent requests.

    Args:
        tile_dir: Existing directory the tiles are saved into.
        wcs_base: WCS endpoint URL.
        coverage_id: Coverage identifier to request.
        label: Source name used as the log prefix.
        wcs_version: WCS protocol version for the requests.

    Returns:
        Paths of the tiles that downloaded successfully, in completion order.
    """
    # (west, south, east, north) per tile, eastings outermost.
    tiles = [
        (west, south, min(west + TILE_SIZE, BNG_MAX_E), min(south + TILE_SIZE, BNG_MAX_N))
        for west in range(BNG_MIN_E, BNG_MAX_E, TILE_SIZE)
        for south in range(BNG_MIN_N, BNG_MAX_N, TILE_SIZE)
    ]
    print(f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)...")

    paths = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = [
            pool.submit(
                _download_tile,
                wcs_base,
                coverage_id,
                west,
                south,
                east,
                north,
                tile_dir / f"tile_{west}_{south}.tif",
                wcs_version,
            )
            for west, south, east, north in tiles
        ]
        for done, future in enumerate(as_completed(pending), start=1):
            saved = future.result()
            if saved is not None:
                paths.append(saved)
            # Single-line progress indicator, rewritten in place via "\r".
            print(
                f"\r [{done}/{len(tiles)}] Downloaded {len(paths)} valid tiles",
                end="",
                flush=True,
            )

    print(f"\n[{label}] Downloaded {len(paths)}/{len(tiles)} tiles")
    return paths
def sample_noise_at_postcodes(
    tile_paths: list[Path],
    easting: np.ndarray,
    northing: np.ndarray,
    label: str,
    col_name: str,
) -> pl.Series:
    """Sample noise values from merged tiles at given BNG coordinates.

    Args:
        tile_paths: GeoTIFF tiles to merge into one mosaic; must not be empty.
        easting: BNG eastings of the sample points.
        northing: BNG northings of the sample points (same length as ``easting``).
        label: Source name used as the log prefix.
        col_name: Name given to the returned Series.

    Returns:
        A Series with one value per input point. Points outside the mosaic,
        on nodata cells, or on cells equal to 0 are null.

    Raises:
        ValueError: If ``tile_paths`` is empty.
    """
    if not tile_paths:
        raise ValueError(f"[{label}] No tiles to sample from")

    print(f"[{label}] Merging {len(tile_paths)} tiles...")
    datasets = []
    try:
        for path in tile_paths:
            datasets.append(rasterio.open(path))
        # The nodata marker is taken from the first tile and applied to the mosaic.
        raster_nodata = datasets[0].nodata
        mosaic, mosaic_transform = merge(datasets)
    finally:
        # Close every dataset that was opened, even if opening a later tile or
        # the merge itself fails.
        for ds in datasets:
            ds.close()
    noise_grid = mosaic[0]

    print(f"[{label}] Sampling noise values at postcode centroids...")
    rows, cols = rowcol(mosaic_transform, easting, northing)
    rows = np.asarray(rows)
    cols = np.asarray(cols)

    h, w = noise_grid.shape
    in_bounds = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w)

    # Start from all-NaN so points outside the mosaic stay missing.
    noise_db = np.full(len(easting), np.nan, dtype=np.float32)
    valid_rows = rows[in_bounds]
    valid_cols = cols[in_bounds]
    sampled = noise_grid[valid_rows, valid_cols].astype(np.float32)

    # Mark nodata and zero (unmapped areas) as NaN.
    # Road/rail use nodata=-96, airport uses nodata=3.4e38.
    if raster_nodata is not None:
        sampled[np.isclose(sampled, np.float32(raster_nodata), rtol=1e-5)] = np.nan
    sampled[sampled == 0] = np.nan
    noise_db[in_bounds] = sampled

    valid_count = int(np.sum(~np.isnan(noise_db)))
    print(f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data")

    # Return as masked Series: use null (not NaN) so that Polars max_horizontal
    # correctly ignores missing values instead of propagating NaN.
    return pl.Series(col_name, noise_db).fill_nan(None)
def main() -> None:
    """CLI entry point: sample every configured noise source at each postcode.

    Reads postcode coordinates from the ``--arcgis`` parquet, converts them to
    BNG, downloads each source in ``NOISE_SOURCES`` into a scratch directory,
    samples it, and writes one column per source to ``--output``.
    """
    cli = argparse.ArgumentParser(
        description="Download Defra noise data (road, rail, airport) and sample at postcode centroids"
    )
    cli.add_argument(
        "--arcgis",
        type=Path,
        required=True,
        help="ArcGIS postcode data parquet (for lat/lon coordinates)",
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    opts = cli.parse_args()
    opts.output.parent.mkdir(parents=True, exist_ok=True)

    print("Loading postcode coordinates...")
    raw = pl.read_parquet(opts.arcgis, columns=["pcds", "lat", "long"])
    postcodes = raw.rename({"pcds": "postcode", "long": "lon"})
    lat = postcodes["lat"].to_numpy()
    lon = postcodes["lon"].to_numpy()

    print("Converting lat/lon to BNG...")
    easting, northing = _bng_from_latlon(lat, lon)

    result = postcodes.select("postcode")
    with tempfile.TemporaryDirectory() as scratch:
        scratch_root = Path(scratch)
        for label, col_name, wcs_base, coverage_id, wcs_version in NOISE_SOURCES:
            tile_dir = scratch_root / label.lower()
            tile_dir.mkdir()
            tile_paths = download_raster(
                tile_dir, wcs_base, coverage_id, label, wcs_version
            )
            if tile_paths:
                series = sample_noise_at_postcodes(
                    tile_paths, easting, northing, label, col_name
                )
            else:
                # Keep the column so the output schema is stable.
                print(f"[{label}] WARNING: No tiles downloaded — column will be all null")
                series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32)
            result = result.with_columns(series)

    result.write_parquet(opts.output, compression="zstd")
    size_mb = opts.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {opts.output} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,46 @@
import argparse
import tempfile
import polars as pl
from pathlib import Path
from pipeline.utils import download
# Management information - state-funded schools - latest inspections (as at 30 Apr 2025)
# Source: https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes
URL = "https://assets.publishing.service.gov.uk/media/681cd390275cb67b18d870fc/Management_information_-_state-funded_schools_-_latest_inspections_as_at_30_Apr_2025.csv"
def convert_to_parquet(csv_path: Path, parquet_path: Path) -> None:
    """Read the Ofsted inspections CSV and write it as zstd-compressed Parquet.

    Args:
        csv_path: Downloaded CSV file.
        parquet_path: Destination Parquet file; its parent directory is
            created if it does not exist yet.
    """
    print("Reading CSV...")
    df = pl.read_csv(
        csv_path,
        infer_schema_length=10000,
        # Replace undecodable bytes instead of failing on them.
        encoding="utf8-lossy",
        # Both markers are read as null.
        null_values=["NULL", "Not applicable"],
    )
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns}")

    # Match the other download scripts: do not fail on a missing output dir.
    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")
def main() -> None:
    """CLI entry point: fetch the Ofsted CSV into a scratch dir and convert it."""
    cli = argparse.ArgumentParser(
        description="Download Ofsted school inspection outcomes data"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    opts = cli.parse_args()

    with tempfile.TemporaryDirectory() as scratch:
        # The CSV is only an intermediate; it disappears with the scratch dir.
        downloaded = Path(scratch) / "ofsted_latest_inspections.csv"
        download(URL, downloaded, timeout=60)
        convert_to_parquet(downloaded, opts.output)


if __name__ == "__main__":
    main()

175
pipeline/download/pois.py Normal file
View file

@ -0,0 +1,175 @@
import argparse
import tempfile
import urllib.request
from pathlib import Path
from tempfile import mkdtemp
import osmium
import polars as pl
from tqdm import tqdm
BATCH_SIZE = 50_000
MIN_OCCURENCE_COUNT = 20
GEOFABRIK_GB_URL = "https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
UK_BBOX_WEST = -7.57
UK_BBOX_SOUTH = 49.96
UK_BBOX_EAST = 1.68
UK_BBOX_NORTH = 58.64
POI_TAG_KEYS: list[str] = [
"amenity",
"building",
"craft",
"emergency",
"healthcare",
"leisure",
"office",
"shop",
"tourism",
"public_transport",
]
def download_pbf(pbf_file: Path) -> None:
    """Stream the Geofabrik Great Britain extract to ``pbf_file``.

    The body is written to a sibling temporary file first and moved into place
    only after the whole response has been read, so ``pbf_file`` never holds a
    partial download. If the transfer fails or is interrupted, the temporary
    file is removed before the exception propagates.

    Args:
        pbf_file: Destination path; parent directories are created as needed.
    """
    pbf_file.parent.mkdir(parents=True, exist_ok=True)
    tmp = pbf_file.with_suffix(".pbf.tmp")
    print(f"Downloading {GEOFABRIK_GB_URL}")
    try:
        with (
            tqdm(unit="B", unit_scale=True, desc="Downloading") as bar,
            urllib.request.urlopen(GEOFABRIK_GB_URL) as resp,
            open(tmp, "wb") as f,
        ):
            # The total is only known when the server sends Content-Length.
            length = resp.headers.get("Content-Length")
            if length:
                bar.total = int(length)
            while chunk := resp.read(1 << 20):  # 1 MiB at a time
                f.write(chunk)
                bar.update(len(chunk))
        # replace() overwrites an existing destination on every platform.
        tmp.replace(pbf_file)
    except BaseException:
        # Do not leave a truncated temp file behind (also on Ctrl-C).
        tmp.unlink(missing_ok=True)
        raise
    print(f"Saved to {pbf_file}")
class POIHandler(osmium.SimpleHandler):
    """Osmium handler that extracts tagged nodes as POIs and spills them to disk.

    Only the ``node`` callback is implemented. Each matching node yields one
    row per matching tag key in ``POI_TAG_KEYS``. Rows are buffered in memory
    and written to ``tmp_dir`` as numbered parquet batches of ``BATCH_SIZE``
    rows. Call ``_flush_batch()`` once after processing to write the final,
    partially filled batch.
    """

    def __init__(self, progress: tqdm, tmp_dir: Path) -> None:
        super().__init__()
        self._batch: list[dict] = []  # rows not yet written to disk
        self._tmp_dir = tmp_dir
        self._batch_num = 0  # index of the next batch file
        self.poi_count = 0  # total rows emitted so far
        self._progress = progress

    def _in_uk(self, lat: float, lon: float) -> bool:
        """Return True when the point lies inside the UK bounding box."""
        return (
            UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
            and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
        )

    def _match_tags(self, tags: osmium.osm.TagList) -> list[str]:
        """Return a ``"key/value"`` category for every POI tag key present."""
        return [f"{key}/{tags[key]}" for key in POI_TAG_KEYS if key in tags]

    def _get_name(self, tags: osmium.osm.TagList) -> str:
        """Prefer ``name:en``, then ``name``, else an empty string."""
        return tags.get("name:en", tags.get("name", ""))

    def _flush_batch(self) -> None:
        """Write the buffered rows to the next batch file; no-op when empty."""
        if not self._batch:
            return
        df = pl.DataFrame(self._batch)
        out = self._tmp_dir / f"batch_{self._batch_num:05d}.parquet"
        df.write_parquet(out)
        self._batch_num += 1
        self._batch.clear()

    def _add_poi(
        self,
        osm_id: str,
        tags: osmium.osm.TagList,
        category: str,
        lat: float,
        lng: float,
    ) -> None:
        """Buffer one POI row and flush once the batch is full."""
        self._batch.append(
            {
                "id": osm_id,
                "name": self._get_name(tags),
                "category": category,
                "lat": lat,
                "lng": lng,
            }
        )
        self.poi_count += 1
        self._progress.set_postfix(pois=f"{self.poi_count:,}", refresh=False)
        if len(self._batch) >= BATCH_SIZE:
            self._flush_batch()

    def _tick(self) -> None:
        """Advance the progress bar by one processed element."""
        self._progress.update(1)

    def node(self, n: osmium.osm.Node) -> None:
        """Handle one OSM node: emit a POI row per matching tag key."""
        self._tick()
        # Location.valid is a method; the bare attribute is always truthy, so
        # it must be called for this guard to skip invalid coordinates.
        if not n.location.valid():
            return
        lat, lon = n.location.lat, n.location.lon
        if not self._in_uk(lat, lon):
            return
        categories = self._match_tags(n.tags)
        for category in categories:
            # Node ids are prefixed with "n".
            self._add_poi(f"n{n.id}", n.tags, category, lat, lon)
def main() -> None:
    """CLI entry point: download the GB extract, extract POIs, write parquet.

    The PBF download and the intermediate batch files each live in their own
    temporary directory, and both are removed when the function returns or
    fails. Categories occurring fewer than ``MIN_OCCURENCE_COUNT`` times are
    dropped from the output.
    """
    parser = argparse.ArgumentParser(
        description="Download and extract POIs from OpenStreetMap"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = parser.parse_args()
    args.output.parent.mkdir(parents=True, exist_ok=True)

    with (
        tempfile.TemporaryDirectory() as cache_dir,
        # Previously an mkdtemp() directory that was never cleaned up.
        tempfile.TemporaryDirectory(prefix="pois_") as batch_dir,
    ):
        # cache_dir is freshly created, so the PBF can never already exist;
        # always download.
        pbf_file = Path(cache_dir) / "great-britain-latest.osm.pbf"
        download_pbf(pbf_file)

        print(f"Tag keys: {POI_TAG_KEYS}")
        tmp_dir = Path(batch_dir)
        with tqdm(
            unit=" elements",
            unit_scale=True,
            desc="Streaming",
            smoothing=0.05,
            mininterval=1.0,
        ) as progress:
            handler = POIHandler(progress, tmp_dir)
            handler.apply_file(str(pbf_file), locations=True)
        handler._flush_batch()  # write any remaining POIs
        print(f"Extracted {handler.poi_count:,} POIs")

        batch_files = sorted(tmp_dir.glob("batch_*.parquet"))
        df = pl.concat([pl.scan_parquet(f) for f in batch_files])

        # Only keep categories with enough occurrences
        valid_categories = (
            df.group_by("category")
            .agg(pl.len().alias("count"))
            .filter(pl.col("count") >= MIN_OCCURENCE_COUNT)
        )
        df = df.join(valid_categories.select("category"), on="category", how="semi")
        print(f"Total POIs: {handler.poi_count:,}")

        # The lazy scan reads the batch files, so the sink must finish before
        # the batch directory is removed.
        df.sink_parquet(args.output)
    print(f"Saved to {args.output}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,66 @@
import argparse
import tempfile
import polars as pl
from pathlib import Path
from pipeline.utils import download
URL = "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv"
def convert_to_parquet(csv_path: Path, parquet_path: Path) -> None:
    """Load the headerless price-paid CSV and store it as zstd Parquet.

    The CSV has no header row, so the column names are supplied here in file
    order (see https://www.gov.uk/guidance/about-the-price-paid-data).
    """
    print("Converting to Parquet...")
    # Land Registry CSV columns, in file order.
    column_names = (
        "transaction_id price date_of_transfer postcode "
        "property_type old_new duration "
        "paon saon street locality town_city district county "
        "ppd_category record_status"
    ).split()
    frame = pl.read_csv(
        csv_path,
        has_header=False,
        new_columns=column_names,
        try_parse_dates=True,
    )
    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    schema_names = frame.collect_schema().names()
    print(f"Columns: {schema_names}")
    frame.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")
def main() -> None:
    """CLI entry point: download the price-paid CSV and convert it to Parquet."""
    cli = argparse.ArgumentParser(
        description="Download and convert Land Registry price-paid data"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    opts = cli.parse_args()

    with tempfile.TemporaryDirectory() as scratch:
        # The CSV is only needed until the conversion finishes.
        downloaded = Path(scratch) / "price-paid-complete.csv"
        download(URL, downloaded)
        convert_to_parquet(downloaded, opts.output)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,29 @@
"""Journey times calculation module for TfL transit data."""
from .config import (
DATA_DIR,
DESTINATIONS,
MAX_CONCURRENT,
MAX_DELAY,
MAX_POSTCODES,
OUTPUT_DIR,
REQUESTS_PER_MIN,
)
from .models import Destination, JourneyResult
from .results import results_to_dataframe, save_results
from .tfl_client import fetch_journey_times
__all__ = [
"DATA_DIR",
"OUTPUT_DIR",
"MAX_DELAY",
"REQUESTS_PER_MIN",
"MAX_POSTCODES",
"MAX_CONCURRENT",
"DESTINATIONS",
"Destination",
"JourneyResult",
"fetch_journey_times",
"results_to_dataframe",
"save_results",
]

View file

@ -0,0 +1,142 @@
import asyncio
import random
from datetime import date, timedelta
import polars as pl
from tqdm import tqdm
from .config import (
DESTINATIONS,
MAX_CONCURRENT,
MAX_POSTCODES,
OUTPUT_DIR,
MAX_DISTANCE_KM,
)
from .models import JourneyResult
from .results import CheckpointSaver, results_to_dataframe, save_results
from .tfl_client import fetch_journey_times
from pipeline.utils import haversine_km_expr
def main():
    """Fetch journey times from nearby postcodes to Bank and save them.

    Steps: choose the journey date/time, load postcodes and keep those within
    MAX_DISTANCE_KM of the destination, optionally sample them, resume from a
    checkpoint file if one exists, fetch the remaining journeys, then join the
    results back onto postcode coordinates and write the final parquet.
    """
    destination = DESTINATIONS["bank"]
    # Use the next Monday strictly after today ("or 7" turns a Monday run into
    # the following Monday) with a journey time of 08:45.
    today = date.today()
    days_until_monday = (7 - today.weekday()) % 7 or 7
    journey_date = today + timedelta(days=days_until_monday)
    journey_time = "0845"
    print(f"Destination: {destination.name}")
    print(
        f"Journey: {journey_date.strftime('%A %Y-%m-%d')} "
        f"at {journey_time[:2]}:{journey_time[2:]}"
    )
    postcodes_df = pl.read_parquet(OUTPUT_DIR / "postcodes_h3.parquet")
    print(f"Loaded {postcodes_df.height:,} postcodes")
    # Filter to postcodes within range of destination
    postcodes_df = postcodes_df.with_columns(
        haversine_km_expr("lat", "long", destination.lat, destination.lon).alias(
            "distance_km"
        )
    ).filter(pl.col("distance_km") <= MAX_DISTANCE_KM)
    print(f"Filtered to {postcodes_df.height:,} postcodes within {MAX_DISTANCE_KM}km")
    # (postcode, lat, lon) tuples in the shape fetch_journey_times expects.
    postcode_data = list(
        zip(
            postcodes_df["postcode"].to_list(),
            postcodes_df["lat"].to_list(),
            postcodes_df["long"].to_list(),
        )
    )
    # Optional cap: MAX_POSTCODES=None disables sampling.
    if MAX_POSTCODES is not None and len(postcode_data) > MAX_POSTCODES:
        postcode_data = random.sample(postcode_data, MAX_POSTCODES)
        print(f"Randomly sampled {MAX_POSTCODES} postcodes")
    checkpoint_saver = CheckpointSaver(
        destination_name=destination.name,
        on_save=lambda path, count: print(
            f"Checkpoint saved: {count:,} results to {path}"
        ),
    )
    # NOTE(review): a bare "25556/76273" note sat here — presumably a progress
    # count from an earlier run; confirm and remove.
    # Resume from checkpoint if one exists
    checkpoint_path = checkpoint_saver._checkpoint_path()
    prior_results: list[JourneyResult] = []
    if checkpoint_path.exists():
        checkpoint_df = pl.read_parquet(checkpoint_path)
        # Deduplicate checkpoint rows per postcode, preferring rows with data
        checkpoint_df = (
            checkpoint_df.sort("public_transport_quick_minutes", nulls_last=True)
            .unique(subset=["postcode"], keep="first")
        )
        completed_postcodes = set(checkpoint_df["postcode"].to_list())
        prior_results = [
            JourneyResult(
                postcode=row["postcode"],
                public_transport_easy_minutes=row["public_transport_easy_minutes"],
                public_transport_quick_minutes=row["public_transport_quick_minutes"],
                cycling_minutes=row["cycling_minutes"],
                error=row["error"],
            )
            for row in checkpoint_df.iter_rows(named=True)
        ]
        # Seed the saver so later checkpoints contain old and new results and
        # the save interval counts only newly fetched ones.
        checkpoint_saver.results = prior_results
        checkpoint_saver._last_save_count = len(prior_results)
        # Skip postcodes already present in the checkpoint.
        postcode_data = [
            (pc, lat, lon)
            for pc, lat, lon in postcode_data
            if pc not in completed_postcodes
        ]
        print(
            f"Resumed from checkpoint: {len(prior_results):,} already done, "
            f"{len(postcode_data):,} remaining"
        )

    def on_result(result):
        # pbar is bound by the "with tqdm(...)" block below before any
        # callback can run.
        pbar.update(1)
        checkpoint_saver.add_result(result)

    with tqdm(total=len(postcode_data), desc="Fetching journeys") as pbar:
        new_results = asyncio.run(
            fetch_journey_times(
                postcode_data,
                destination,
                journey_date.strftime("%Y%m%d"),
                journey_time,
                MAX_CONCURRENT,
                progress_callback=on_result,
            )
        )
    all_results = prior_results + new_results
    results_df = results_to_dataframe(all_results)
    # Attach coordinates: start from the postcode frame so every result row
    # carries lat/long.
    all_postcodes = {r.postcode for r in all_results}
    coords_df = postcodes_df.filter(
        pl.col("postcode").is_in(all_postcodes)
    ).select(["postcode", "lat", "long"])
    results_df = coords_df.join(results_df, on="postcode", how="left")
    results_df = results_df.with_columns(
        pl.lit(destination.name).alias("destination"),
        pl.lit(journey_date.strftime("%Y-%m-%d")).alias("journey_date"),
        pl.lit(f"{journey_time[:2]}:{journey_time[2:]}").alias("journey_time"),
    )
    # "Successful" here means a cycling duration was obtained.
    successful = results_df.filter(pl.col("cycling_minutes").is_not_null()).height
    print(f"Completed: {successful}/{len(all_results)} successful")
    parquet_path = save_results(results_df, destination.name)
    # The checkpoint is only removed after the final file was written.
    checkpoint_saver.cleanup_checkpoint()
    print(f"Saved to {parquet_path}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,23 @@
"""Configuration constants for journey times processing."""
from .models import Destination
# NOTE(review): sibling modules import DATA_DIR and OUTPUT_DIR from this
# module (package __init__, __main__, results), but neither name is defined
# in the part of this file visible here — confirm they exist.

# Upper bound, in seconds, for the exponential retry backoff in the TfL client.
MAX_DELAY = 10
# Request budget for the rate limiter; it enforces REQUESTS_PER_MIN // 6
# requests per 10-second window.
REQUESTS_PER_MIN = 500
# Optional cap on the number of postcodes processed (random sample);
# None means no cap.
MAX_POSTCODES = None
# Size of the semaphore limiting how many postcodes are fetched at once.
MAX_CONCURRENT = 80
# Postcodes farther than this from the destination are skipped.
MAX_DISTANCE_KM = 110
# Number of newly collected results between checkpoint writes.
CHECKPOINT_INTERVAL = 10000

# Known destinations, keyed by slug: Destination(lat, lon, name, naptan_id).
DESTINATIONS = {
    "bank": Destination(51.5133, -0.0886, "Bank", "940GZZLUBNK"),
    "waterloo": Destination(51.5031, -0.1132, "Waterloo", "940GZZLUWLO"),
    "kings-cross": Destination(51.5308, -0.1238, "King's Cross", "940GZZLUKSX"),
    "liverpool-street": Destination(
        51.5178, -0.0823, "Liverpool Street", "940GZZLULVS"
    ),
    "paddington": Destination(51.5154, -0.1755, "Paddington", "940GZZLUPAC"),
    "victoria": Destination(51.4965, -0.1447, "Victoria", "940GZZLUVIC"),
}

View file

@ -0,0 +1,30 @@
"""Data models for journey times processing."""
from dataclasses import dataclass
@dataclass
class Destination:
"""A destination point for journey planning."""
lat: float
lon: float
name: str
naptan_id: str | None = None
def to_tfl_location(self) -> str:
"""Convert to TfL API location string."""
if self.naptan_id:
return self.naptan_id
return f"{self.lat},{self.lon}"
@dataclass
class JourneyResult:
    """Result of a journey time calculation for a postcode.

    Every field except ``postcode`` defaults to ``None``. A duration is
    ``None`` when no value was obtained for that mode. Field order is part of
    the positional constructor, so it must not be rearranged.
    """

    # Postcode the journey starts from; used as the key when results are
    # matched back to their inputs.
    postcode: str
    # Shortest duration from the "easy" (LeastInterchange) search.
    public_transport_easy_minutes: int | None = None
    # Shortest duration from the cycle-only search.
    cycling_minutes: int | None = None
    # Shortest duration from the "quick" (LeastTime) search.
    public_transport_quick_minutes: int | None = None
    # Text of the exception when fetching for this postcode failed.
    error: str | None = None

View file

@ -0,0 +1,35 @@
"""Rate limiting for TfL API requests."""
import asyncio
import warnings
from .config import REQUESTS_PER_MIN
class RateLimiter:
    """Rate limiter enforcing max requests per minute.

    The per-minute budget is applied over a sliding 10-second window: at most
    ``REQUESTS_PER_MIN // 6`` requests (but never fewer than one) may start
    within any 10 seconds. Callers are serialised by a lock, so a caller that
    has to wait also holds back every caller queued behind it.
    """

    def __init__(self):
        # Event-loop timestamps of recent requests, oldest first.
        self.request_times: list[float] = []
        self._lock = asyncio.Lock()

    async def acquire(self):
        """Wait until we can make a request within rate limits."""
        # Clamp to 1 so a budget below 6/min cannot produce a zero allowance,
        # which would index into an empty list below.
        window_budget = max(1, REQUESTS_PER_MIN // 6)
        async with self._lock:
            # get_running_loop() is the supported way to reach the loop from
            # inside a coroutine.
            loop = asyncio.get_running_loop()
            now = loop.time()
            cutoff = now - 10.0  # 10 seconds
            self.request_times = [t for t in self.request_times if t > cutoff]
            if (
                len(self.request_times) >= window_budget
            ):  # we look at it every 10 seconds instead of minutes
                # Time until the oldest request leaves the window.
                wait_time = self.request_times[0] - cutoff
                if wait_time > 0:
                    warnings.warn(
                        f"Rate limit reached ({REQUESTS_PER_MIN}/min), "
                        f"waiting {wait_time:.1f}s",
                        stacklevel=1,
                    )
                    await asyncio.sleep(wait_time)
            # Stamp with the time after any wait, not the time of entry.
            self.request_times.append(loop.time())

View file

@ -0,0 +1,85 @@
from pathlib import Path
from typing import Callable
import polars as pl
from .config import CHECKPOINT_INTERVAL, OUTPUT_DIR
from .models import JourneyResult
def results_to_dataframe(results: list[JourneyResult]) -> pl.DataFrame:
    """Convert journey results into a DataFrame with a fixed schema.

    The schema is given explicitly so the frame has the same columns and
    dtypes whatever the data: an empty ``results`` list still yields all five
    columns (so later joins on ``postcode`` work), and a column containing
    only ``None`` keeps its integer/string dtype instead of being inferred as
    a Null column.

    Args:
        results: Journey results to tabulate; may be empty.

    Returns:
        One row per result with columns ``postcode``, the three ``*_minutes``
        columns, and ``error``.
    """
    schema = {
        "postcode": pl.Utf8,
        "public_transport_easy_minutes": pl.Int64,
        "public_transport_quick_minutes": pl.Int64,
        "cycling_minutes": pl.Int64,
        "error": pl.Utf8,
    }
    return pl.DataFrame(
        [
            {
                "postcode": r.postcode,
                "public_transport_easy_minutes": r.public_transport_easy_minutes,
                "public_transport_quick_minutes": r.public_transport_quick_minutes,
                "cycling_minutes": r.cycling_minutes,
                "error": r.error,
            }
            for r in results
        ],
        schema=schema,
    )
class CheckpointSaver:
    """Collects results and saves checkpoints at regular intervals.

    Every checkpoint contains all results collected so far, not only the ones
    added since the previous save. Checkpoints are written to a temporary
    sibling file and then moved over the real path, so an interruption during
    a save cannot corrupt the previous checkpoint.
    """

    def __init__(
        self,
        destination_name: str,
        output_dir: Path | None = None,
        interval: int = CHECKPOINT_INTERVAL,
        on_save: Callable[[Path, int], None] | None = None,
    ):
        """
        Args:
            destination_name: Used to derive the checkpoint file name.
            output_dir: Directory for the checkpoint; defaults to OUTPUT_DIR.
            interval: Number of new results between automatic saves.
            on_save: Optional callback invoked after each save with the
                checkpoint path and the total number of results saved.
        """
        self.destination_name = destination_name
        self.output_dir = output_dir or OUTPUT_DIR
        self.interval = interval
        self.on_save = on_save
        self.results: list[JourneyResult] = []
        # len(self.results) at the time of the last save.
        self._last_save_count = 0

    def add_result(self, result: JourneyResult) -> None:
        """Add a result and save checkpoint if interval is reached."""
        self.results.append(result)
        if len(self.results) - self._last_save_count >= self.interval:
            self.save_checkpoint()

    def save_checkpoint(self) -> Path:
        """Save current results to checkpoint file."""
        df = results_to_dataframe(self.results)
        path = self._checkpoint_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        # Write next to the target, then swap: the existing checkpoint stays
        # intact until the new one is completely on disk.
        tmp_path = path.with_name(path.name + ".tmp")
        df.write_parquet(tmp_path)
        tmp_path.replace(path)
        self._last_save_count = len(self.results)
        if self.on_save:
            self.on_save(path, len(self.results))
        return path

    def _checkpoint_path(self) -> Path:
        """Return the checkpoint path derived from the destination name."""
        safe_name = self.destination_name.lower().replace(" ", "-")
        return self.output_dir / f"journey_times_{safe_name}_checkpoint.parquet"

    def get_results(self) -> list[JourneyResult]:
        """Return all collected results."""
        return self.results

    def cleanup_checkpoint(self) -> None:
        """Remove the checkpoint file after successful completion."""
        path = self._checkpoint_path()
        if path.exists():
            path.unlink()
def save_results(
    results: pl.DataFrame,
    destination_name: str,
    output_dir: Path | None = None,
) -> Path:
    """Write the final results frame to parquet and return the file path.

    The file is named ``journey_times_<destination>.parquet`` (destination
    lower-cased, spaces replaced by hyphens) and placed in ``output_dir``, or
    in OUTPUT_DIR when no directory is given.
    """
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    slug = destination_name.lower().replace(" ", "-")
    destination_file = target_dir / f"journey_times_{slug}.parquet"
    results.write_parquet(destination_file)
    return destination_file

View file

@ -0,0 +1,254 @@
import asyncio
import os
from typing import Literal
import warnings
from collections.abc import Callable
from http import HTTPStatus
import httpx
from .config import MAX_DELAY
from .models import Destination, JourneyResult
from .rate_limiter import RateLimiter
BASE_URL = "https://api.tfl.gov.uk"
async def fetch_journey_for_mode(
    client: httpx.AsyncClient,
    rate_limiter: RateLimiter,
    from_location: str,
    to_location: str,
    journey_date: str,
    journey_time: str,
    journey_type: Literal["quick"] | Literal["easy"] | Literal["cycle"],
    retry_count: int = 5,
) -> int | None:
    """Fetch journey time for a specific mode with rate limiting.

    Args:
        client: HTTP client used for the request.
        rate_limiter: Awaited before every attempt.
        from_location: Origin, inserted into the URL path.
        to_location: Destination, inserted into the URL path.
        journey_date: Sent as the ``date`` query parameter.
        journey_time: Sent as the ``time`` query parameter (arrive-by time,
            since ``timeIs`` is ``Arriving``).
        journey_type: ``"quick"`` (LeastTime), ``"easy"`` (LeastInterchange)
            or ``"cycle"`` (cycle all the way).
        retry_count: Maximum number of attempts.

    Returns:
        The smallest ``duration`` among the returned journeys, or ``None``
        when there are no journeys with a duration, the response status is
        not retryable, or every attempt failed.
    """
    # Everything below is identical for every attempt, so build it once
    # instead of on each retry.
    journey_preference = {
        "quick": "LeastTime",
        "easy": "LeastInterchange",
        "cycle": None,
    }[journey_type]
    cycle_preference = {
        "quick": None,
        "easy": None,
        "cycle": "AllTheWay",
    }[journey_type]
    # curl -s "https://api.tfl.gov.uk/Journey/Meta/Modes" | jq '.[].modeName'
    mode = {
        "quick": [
            "bus",
            "overground",
            "national-rail",
            "international-rail",
            "elizabeth-line",
            "tube",
            "coach",
            "dlr",
            "cable-car",
            "replacement-bus",
            "tram",
            "river-bus",
            "walking",
            "cycle",
        ],
        "easy": [
            "bus",
            "overground",
            "national-rail",
            "international-rail",
            "elizabeth-line",
            "replacement-bus",
            "tube",
            "coach",
            "dlr",
            "cable-car",
            "tram",
            "river-bus",
        ],
        "cycle": ["cycle"],
    }[journey_type]
    params: dict = {
        "date": journey_date,
        "time": journey_time,
        "nationalSearch": "true",
        "timeIs": "Arriving",
        "bikeProficiency": "Fast",
        "walkingOptimization": str(journey_type == "quick").lower(),
        "mode": ",".join(mode),
    }
    # Only send preferences that apply: a None value would otherwise go out
    # as an empty "cyclePreference=" query parameter.
    if cycle_preference:
        params["cyclePreference"] = cycle_preference
    if journey_preference:
        params["journeyPreference"] = journey_preference
    url = f"/Journey/JourneyResults/{from_location}/to/{to_location}"

    backoff = 1.0
    for attempt in range(retry_count):
        try:
            await rate_limiter.acquire()
            response = await client.get(url, params=params)
            if response.status_code == HTTPStatus.OK:
                data = response.json()
                journeys = data.get("journeys", [])
                if journeys:
                    durations = [
                        j["duration"] for j in journeys if j.get("duration") is not None
                    ]
                    if durations:
                        return min(durations)
                return None
            elif response.status_code in (
                HTTPStatus.TOO_MANY_REQUESTS,
                HTTPStatus.INTERNAL_SERVER_ERROR,
                HTTPStatus.BAD_GATEWAY,
                HTTPStatus.SERVICE_UNAVAILABLE,
                HTTPStatus.GATEWAY_TIMEOUT,
            ):
                # Transient server-side condition: back off and try again.
                warnings.warn(
                    f"HTTP {response.status_code} for {journey_type} from {from_location}, "
                    f"retrying in {backoff:.1f}s (attempt {attempt + 1}/{retry_count})",
                    stacklevel=2,
                )
                await asyncio.sleep(backoff)
                backoff = min(backoff * 2, MAX_DELAY)
                continue
            else:
                # Any other status is treated as "no result" without retrying.
                return None
        except Exception as e:
            # Covers transport errors as well as failures while decoding or
            # reading the response body.
            warnings.warn(
                f"Network error for {journey_type} from {from_location}: {e}, "
                f"retrying in {backoff:.1f}s (attempt {attempt + 1}/{retry_count})",
                stacklevel=2,
            )
            await asyncio.sleep(backoff)
            backoff = min(backoff * 2, MAX_DELAY)
            continue
    warnings.warn(
        f"Failed to fetch {journey_type} from {from_location} after {retry_count} attempts",
        stacklevel=2,
    )
    return None
async def fetch_all_modes(
    client: httpx.AsyncClient,
    rate_limiter: RateLimiter,
    postcode: str,
    lat: float,
    lon: float,
    to_location: str,
    journey_date: str,
    journey_time: str,
    semaphore: asyncio.Semaphore,
) -> JourneyResult:
    """Look up easy, quick and cycling journey times for one postcode.

    The origin is sent as ``"<lat>,<lon>"``. The three lookups run one after
    another while holding ``semaphore``. Any exception is printed and turned
    into a ``JourneyResult`` that carries only the postcode and the error
    text.
    """
    async with semaphore:
        try:
            origin = f"{lat},{lon}"
            minutes: dict = {}
            # Same order as before: easy, then quick, then cycle.
            for kind in ("easy", "quick", "cycle"):
                minutes[kind] = await fetch_journey_for_mode(
                    client,
                    rate_limiter,
                    origin,
                    to_location,
                    journey_date,
                    journey_time,
                    journey_type=kind,
                )
            return JourneyResult(
                postcode=postcode,
                public_transport_easy_minutes=minutes["easy"],
                public_transport_quick_minutes=minutes["quick"],
                cycling_minutes=minutes["cycle"],
            )
        except Exception as exc:
            print(f"Error: {exc}")
            return JourneyResult(postcode=postcode, error=str(exc))
async def fetch_journey_times(
    postcode_data: list[tuple[str, float, float]],
    dest: Destination,
    journey_date: str,
    journey_time: str,
    max_concurrent: int = 2,
    progress_callback: Callable[[JourneyResult], None] | None = None,
) -> list[JourneyResult]:
    """Look up journey times from every postcode to ``dest``.

    Args:
        postcode_data: ``(postcode, lat, lon)`` tuples to process.
        dest: Journey destination.
        journey_date: Date in YYYYMMDD format.
        journey_time: Time in HHMM format.
        max_concurrent: How many postcodes may be in flight at once.
        progress_callback: Called with each result as soon as it completes.

    Returns:
        One ``JourneyResult`` per input tuple, in input order.

    Raises:
        RuntimeError: If the ``TFL_TOKEN`` environment variable is not set.
    """
    limit = asyncio.Semaphore(max_concurrent)
    target = dest.to_tfl_location()
    limiter = RateLimiter()

    # TFL API authentication via app_key query parameter
    token = os.environ.get("TFL_TOKEN")
    if not token:
        raise RuntimeError("TFL_TOKEN environment variable not set")

    by_postcode: dict = {}
    async with httpx.AsyncClient(
        base_url=BASE_URL,
        params={"app_key": token},
        timeout=httpx.Timeout(30),
    ) as client:
        pending = []
        for pc, lat, lon in postcode_data:
            pending.append(
                fetch_all_modes(
                    client,
                    limiter,
                    pc,
                    lat,
                    lon,
                    target,
                    journey_date,
                    journey_time,
                    limit,
                )
            )
        # Results arrive in completion order; index them by postcode so the
        # input order can be restored afterwards.
        for finished in asyncio.as_completed(pending):
            outcome = await finished
            by_postcode[outcome.postcode] = outcome
            if progress_callback:
                progress_callback(outcome)
    return [by_postcode[pc] for pc, _, _ in postcode_data]

View file

@ -1,42 +0,0 @@
from pathlib import Path
import polars as pl
from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS
def aggregate(df: pl.LazyFrame, resolution: int) -> pl.LazyFrame:
    """Summarise prices per H3 cell (at ``resolution``) and year.

    The result has the cell column renamed to ``h3`` plus ``year``, ``count``
    and the mean, median, minimum and maximum price.
    """
    cell_col = f"h3_res{resolution}"
    price = pl.col("price")
    stats = [
        pl.len().alias("count"),
        price.mean().alias("avg_price"),
        price.median().alias("median_price"),
        price.min().alias("min_price"),
        price.max().alias("max_price"),
    ]
    grouped = df.group_by(cell_col, "year").agg(*stats)
    return grouped.rename({cell_col: "h3"})
def aggregate_all(df: pl.LazyFrame) -> dict[int, pl.LazyFrame]:
    """Build one aggregate per entry of ``H3_RESOLUTIONS``, keyed by resolution."""
    per_resolution: dict[int, pl.LazyFrame] = {}
    for resolution in H3_RESOLUTIONS:
        per_resolution[resolution] = aggregate(df, resolution)
    return per_resolution
def save_aggregates(df: pl.LazyFrame, output_dir: Path | None = None) -> list[Path]:
    """Write ``res<N>.parquet`` for every H3 resolution and return the paths.

    Files go to ``output_dir``, or to ``AGGREGATES_DIR`` when none is given;
    the directory is created if necessary.
    """
    target_dir = output_dir or AGGREGATES_DIR
    target_dir.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []
    for resolution, lazy_agg in aggregate_all(df).items():
        destination = target_dir / f"res{resolution}.parquet"
        lazy_agg.collect().write_parquet(destination)
        written.append(destination)
    return written

View file

@ -1,35 +0,0 @@
"""Pipeline CLI to process property data with H3 spatial indexing."""
import polars as pl
from pipeline.sources.postcodes import save_postcodes
from pipeline.sources.property_prices import PropertyPricesSource
from pipeline.processors.h3_aggregator import save_aggregates
def run_pipeline():
    """Run all three pipeline stages and report what was written."""
    banner = "=" * 60
    print(banner)
    print("Property Map Data Pipeline")
    print(banner)

    # Step 1: Process postcodes with H3 indices
    print("\n[1/3] Processing postcodes with H3 indices...")
    postcodes_path = save_postcodes()
    print(f" Saved: {postcodes_path}")

    # Step 2: join price data onto the processed postcodes
    print("\n[2/3] Processing property prices...")
    postcode_frame = pl.scan_parquet(postcodes_path)
    properties = PropertyPricesSource().process(postcode_frame)
    print(" Joined property prices with postcodes")

    # Step 3: write one aggregate file per resolution
    print("\n[3/3] Aggregating at H3 resolutions...")
    for written in save_aggregates(properties):
        megabytes = written.stat().st_size / (1024 * 1024)
        print(f" Saved: {written.name} ({megabytes:.1f} MB)")


if __name__ == "__main__":
    run_pipeline()

View file

@ -1,49 +0,0 @@
from pathlib import Path
import polars as pl
import h3
from pipeline.config import DATA_DIR, H3_RESOLUTIONS, PROCESSED_DIR
def lat_long_to_h3(lat: float, long: float, resolution: int) -> str:
    """Return the H3 cell index containing (lat, long) at ``resolution``."""
    cell = h3.latlng_to_cell(lat, long, resolution)
    return cell
def load_postcodes() -> pl.LazyFrame:
    """Lazily read postcode, lat and long from the arcgis parquet file.

    The source column ``pcds`` is exposed as ``postcode``.
    """
    source = DATA_DIR / "arcgis_data.parquet"
    wanted = [
        pl.col("pcds").alias("postcode"),
        pl.col("lat"),
        pl.col("long"),
    ]
    return pl.scan_parquet(source).select(*wanted)
def process_postcodes() -> pl.LazyFrame:
    """Add one ``h3_res<N>`` column per configured resolution to the postcodes.

    The postcode frame is collected first, the H3 index is computed row by
    row, and the result is handed back as a LazyFrame.
    """
    frame = load_postcodes().collect()
    coords = pl.struct(["lat", "long"])
    for res in H3_RESOLUTIONS:

        # Bind res through a default argument so each converter keeps its own
        # resolution rather than the loop variable's final value.
        def to_cell(point, res=res):
            return lat_long_to_h3(point["lat"], point["long"], res)

        h3_column = coords.map_elements(to_cell, return_dtype=pl.Utf8)
        frame = frame.with_columns(h3_column.alias(f"h3_res{res}"))
    return frame.lazy()
def save_postcodes(output_path: Path | None = None) -> Path:
    """Write the H3-indexed postcodes to parquet and return the path used.

    Defaults to ``postcodes_h3.parquet`` inside ``PROCESSED_DIR``; the parent
    directory is created if needed.
    """
    destination = output_path or PROCESSED_DIR / "postcodes_h3.parquet"
    destination.parent.mkdir(parents=True, exist_ok=True)
    process_postcodes().collect().write_parquet(destination)
    return destination

View file

@ -1,41 +0,0 @@
import polars as pl
from pipeline.base import DataSource
from pipeline.config import DATA_DIR, H3_RESOLUTIONS
class PropertyPricesSource(DataSource):
    """Data source for the Land Registry price-paid parquet file."""

    @property
    def name(self) -> str:
        return "property_prices"

    def load(self) -> pl.LazyFrame:
        """Lazily scan the raw price-paid parquet file."""
        source_file = DATA_DIR / "pp-complete.parquet"
        return pl.scan_parquet(source_file)

    def process(self, postcodes: pl.LazyFrame) -> pl.LazyFrame:
        """Attach coordinates and H3 indices to each sale via its postcode.

        Sales whose postcode has no match in ``postcodes`` are dropped
        (inner join).
        """
        sale_year = pl.col("date_of_transfer").dt.year().alias("year")
        prices = self.load().select(
            pl.col("price"),
            sale_year,
            pl.col("property_type"),
            pl.col("postcode"),
        )
        joined = prices.join(postcodes, on="postcode", how="inner")
        keep = ["price", "year", "property_type", "lat", "long"]
        keep.extend(f"h3_res{res}" for res in H3_RESOLUTIONS)
        return joined.select([pl.col(column) for column in keep])

View file

@ -0,0 +1,63 @@
import argparse
from pathlib import Path
import polars as pl
def transform_crime(crime_dir: Path, output_path: Path) -> None:
    """Turn the crime CSVs into one row per LSOA with yearly averages.

    Crimes are counted per LSOA, year and crime type; the yearly counts are
    averaged per LSOA and crime type, and the crime types are pivoted into
    columns named ``"<crime type> (avg/yr)"``. Missing combinations are
    filled with 0.
    """
    csv_files = sorted(crime_dir.rglob("*.csv"))
    top_level_entries = list(crime_dir.iterdir())
    print(f"Found {len(csv_files)} CSV files across {len(top_level_entries)} months")

    lsoa = pl.col("LSOA code")
    raw = pl.scan_csv(
        csv_files,
        schema_overrides={"LSOA code": pl.Utf8, "Crime type": pl.Utf8, "Month": pl.Utf8},
    ).select("LSOA code", "Crime type", "Month")

    # Extract year, count crimes per LSOA / year / crime type
    with_lsoa = raw.filter(lsoa.is_not_null() & (lsoa != ""))
    per_year = (
        with_lsoa.with_columns(pl.col("Month").str.slice(0, 4).alias("year"))
        .group_by("LSOA code", "year", "Crime type")
        .agg(pl.len().alias("count"))
    )
    yearly_counts = (
        per_year.group_by("LSOA code", "Crime type")
        .agg(pl.col("count").mean().round(1).alias("yearly_avg"))
        .collect(engine="streaming")
    )
    crime_types = sorted(yearly_counts["Crime type"].unique().to_list())
    print(f"Crime types: {crime_types}")

    # Pivot crime types into columns
    wide = yearly_counts.pivot(
        on="Crime type",
        index="LSOA code",
        values="yearly_avg",
    )

    # Fill nulls with 0 and rename columns to be descriptive
    value_cols = [name for name in wide.columns if name != "LSOA code"]
    wide = wide.with_columns([pl.col(name).fill_null(0) for name in value_cols])
    renames = {}
    for name in value_cols:
        renames[name] = f"{name} (avg/yr)"
    wide = wide.rename(renames)

    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
def main() -> None:
    """CLI entry point: parse ``--input`` / ``--output`` and run the transform."""
    cli = argparse.ArgumentParser(
        description="Transform crime CSVs into yearly average by LSOA and crime type"
    )
    cli.add_argument(
        "--input", type=Path, required=True, help="Directory containing crime data"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    opts = cli.parse_args()
    transform_crime(opts.input, opts.output)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,161 @@
import argparse
import polars as pl
from pathlib import Path
from ..utils import fuzzy_join_on_postcode
MIN_FLOOR_AREA_M2 = 10
pl.Config.set_tbl_cols(-1)
def main():
    """Fuzzy-join EPC certificates onto price-paid properties and save matches.

    Steps:
      1. Load EPC rows, sorted newest inspection first, and take the first
         row per (address, postcode).
      2. Load price-paid rows, build an address string, and collapse them to
         one row per (address, postcode) with the price history.
      3. Fuzzy-join the two on address within postcode.
      4. Keep matched rows with a plausible floor area, derive a construction
         year, lower-case the column names, and write the parquet output.
    """
    parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
    parser.add_argument(
        "--epc", type=Path, required=True, help="EPC certificates CSV file"
    )
    parser.add_argument(
        "--price-paid", type=Path, required=True, help="Price paid parquet file"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = parser.parse_args()

    epc = (
        pl.scan_csv(args.epc)
        .select(
            pl.col("ADDRESS").alias("epc_address"),
            "POSTCODE",
            "CURRENT_ENERGY_RATING",
            "POTENTIAL_ENERGY_RATING",
            pl.col("PROPERTY_TYPE").alias("epc_property_type"),
            "BUILT_FORM",
            "INSPECTION_DATE",
            "TOTAL_FLOOR_AREA",
            "NUMBER_HABITABLE_ROOMS",
            "FLOOR_HEIGHT",
            "CONSTRUCTION_AGE_BAND",
        )
        .filter(pl.col("epc_address").is_not_null())
        # Newest inspection first, then one row per address.
        .sort("INSPECTION_DATE", descending=True)
        .group_by("epc_address", "POSTCODE")
        .first()
    )
    print("EPC dataset")
    print(epc.head().collect())

    # https://www.gov.uk/guidance/about-the-price-paid-data
    property_type_map = {
        "D": "Detached",
        "S": "Semi-Detached",
        "T": "Terraced",
        "F": "Flats/Maisonettes",
        "O": "Other",
    }
    duration_map = {"F": "Freehold", "L": "Leasehold"}
    price_paid = (
        pl.scan_parquet(args.price_paid)
        .select(
            "price",
            "date_of_transfer",
            pl.col("property_type")
            .alias("pp_property_type")
            .replace(property_type_map),
            "postcode",
            "paon",
            "saon",
            "street",
            "locality",
            "town_city",
            pl.col("duration").replace(duration_map),
            "old_new",
        )
        .filter(pl.col("pp_property_type") != "Other")
        .with_columns(
            pl.concat_str(
                [pl.col("saon"), pl.col("paon"), pl.col("street")],
                separator=" ",
                ignore_nulls=True,
            ).alias("pp_address"),
        )
        # Oldest first so first()/last() below mean earliest/latest sale.
        .sort("date_of_transfer")
        .group_by("pp_address", "postcode", maintain_order=True)
        .agg(
            pl.struct(
                pl.col("date_of_transfer").dt.year().alias("year"),
                "price",
            ).alias("historical_prices"),
            pl.col("pp_property_type").last(),
            pl.col("duration").last(),
            pl.col("price").last().alias("latest_price"),
            pl.col("date_of_transfer").last(),
            pl.col("date_of_transfer").first().alias("first_transfer_date"),
            pl.col("old_new").first(),
        )
    ).filter(pl.col("pp_address").is_not_null())
    print("Price paid dataset")
    print(price_paid.head().collect())

    joined = (
        fuzzy_join_on_postcode(
            left=price_paid,
            right=epc,
            left_address_col="pp_address",
            right_address_col="epc_address",
            left_postcode_col="postcode",
            right_postcode_col="POSTCODE",
        )
        .drop("POSTCODE")
        .collect(engine="streaming")
    )
    matched = joined.filter(
        pl.col("epc_address").is_not_null() & pl.col("pp_address").is_not_null()
    )
    total = joined.height
    # Avoid ZeroDivisionError when the join produced no rows at all.
    matched_pct = 100 * matched.height / total if total else 0.0
    print(f"Unique properties: {total}")
    print(f"Matched: {matched.height} ({matched_pct:.1f}%)")
    print(f"Unmatched: {total - matched.height}")

    matched = matched.filter(pl.col("TOTAL_FLOOR_AREA") >= MIN_FLOOR_AREA_M2)

    # For new-builds (old_new == "Y"), use the first transaction date year as
    # the exact construction date; otherwise fall back to the EPC age band.
    epc_band_year = (
        pl.col("CONSTRUCTION_AGE_BAND")
        .str.replace("England and Wales: ", "")
        .str.replace(" onwards", "")
        # First four-digit number in the band text.
        .str.extract(r"(\d{4})", 1)
        .cast(pl.UInt16, strict=False)
    )
    transfer_year = (
        pl.col("first_transfer_date").dt.year().cast(pl.UInt16, strict=False)
    )
    is_new_build = pl.col("old_new") == "Y"
    matched = matched.with_columns(
        pl.when(is_new_build & transfer_year.is_not_null())
        .then(transfer_year)
        .otherwise(epc_band_year)
        .alias("CONSTRUCTION_AGE_BAND"),
        # 0 = exact (new-build sale year), 1 = approximate (EPC band),
        # null = no construction year available.
        pl.when(is_new_build & transfer_year.is_not_null())
        .then(pl.lit(0, dtype=pl.UInt8))
        .when(epc_band_year.is_not_null())
        .then(pl.lit(1, dtype=pl.UInt8))
        .otherwise(pl.lit(None, dtype=pl.UInt8))
        .alias("is_construction_date_approximate"),
    ).drop("old_new", "first_transfer_date")

    # Build the mapping from the frame being renamed: joined.columns still
    # lists the two columns dropped above, which are no longer in matched.
    matched = matched.rename({col: col.lower() for col in matched.columns})
    print(matched.head())
    matched.write_parquet(args.output)
    print(f"Wrote {args.output}")


if __name__ == "__main__":
    main()

290
pipeline/transform/merge.py Normal file
View file

@ -0,0 +1,290 @@
import argparse
import polars as pl
from pathlib import Path
MIN_PRICE = 10_000
MIN_FLOOR_AREA_M2 = 10
def _build_wide(
epc_pp_path: Path,
arcgis_path: Path,
iod_path: Path,
poi_proximity_path: Path,
journey_times_path: Path,
ethnicity_path: Path,
crime_path: Path,
noise_path: Path,
school_proximity_path: Path,
broadband_path: Path,
) -> pl.DataFrame:
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
wide = pl.scan_parquet(epc_pp_path)
arcgis = pl.scan_parquet(arcgis_path).select(
pl.col("pcds").alias("postcode"),
"lat",
pl.col("long").alias("lon"),
"lsoa21",
"oa21",
)
wide = wide.join(arcgis, on="postcode", how="inner")
journey_times = (
pl.scan_parquet(journey_times_path)
.select(
"postcode",
"public_transport_easy_minutes",
"public_transport_quick_minutes",
"cycling_minutes",
)
.sort("public_transport_quick_minutes", nulls_last=True)
.group_by("postcode")
.first()
)
wide = wide.join(journey_times, on="postcode", how="left")
iod = pl.scan_parquet(iod_path)
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
ethnicity = pl.scan_parquet(ethnicity_path)
wide = wide.join(
ethnicity,
left_on="Local Authority District code (2024)",
right_on="Geography_code",
how="left",
)
crime = pl.scan_parquet(crime_path)
wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")
wide = wide.with_columns(
pl.sum_horizontal(
"Violence and sexual offences (avg/yr)",
"Robbery (avg/yr)",
"Burglary (avg/yr)",
"Possession of weapons (avg/yr)",
).alias("serious_crime_avg_yr"),
pl.sum_horizontal(
"Anti-social behaviour (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other theft (avg/yr)",
"Vehicle crime (avg/yr)",
"Public order (avg/yr)",
"Drugs (avg/yr)",
"Other crime (avg/yr)",
).alias("minor_crime_avg_yr"),
)
poi_counts = pl.scan_parquet(poi_proximity_path)
wide = wide.join(poi_counts, on="postcode", how="left")
noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
noise = (
pl.scan_parquet(noise_path)
.with_columns(
# NaN → null so max_horizontal ignores missing instead of propagating NaN
*[pl.col(c).fill_nan(None) for c in noise_cols],
)
.with_columns(
pl.max_horizontal(*noise_cols).fill_null(0).alias("noise_lden_db"),
)
.select("postcode", "noise_lden_db")
)
wide = wide.join(noise, on="postcode", how="left")
school_proximity = pl.scan_parquet(school_proximity_path)
wide = wide.join(school_proximity, on="postcode", how="left")
# Broadband: derive max available download speed tier per postcode from
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
# UFBB(100) ≥100, SFBB ≥30 Mbps.
broadband = (
pl.scan_parquet(broadband_path)
.select(
pl.col("postcode_space").alias("bb_postcode"),
pl.when(pl.col("Gigabit availability (% premises)") > 0)
.then(1000)
.when(pl.col("UFBB availability (% premises)") > 0)
.then(300)
.when(pl.col("UFBB (100Mbit/s) availability (% premises)") > 0)
.then(100)
.when(pl.col("SFBB availability (% premises)") > 0)
.then(30)
.otherwise(10)
.cast(pl.UInt16)
.alias("max_download_speed"),
)
.group_by("bb_postcode")
.agg(pl.col("max_download_speed").max())
)
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
wide = wide.with_columns(
pl.when(pl.col("pp_property_type") == pl.col("built_form"))
.then(pl.col("pp_property_type"))
.otherwise(
pl.concat_str(
[pl.col("pp_property_type"), pl.lit("/"), pl.col("built_form")]
)
)
.alias("property_type_built_form")
)
wide = (
wide.filter(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
.filter(pl.col("latest_price") >= MIN_PRICE)
.with_columns(
pl.when(pl.col("duration") == "U")
.then(None)
.otherwise(pl.col("duration"))
.alias("duration"),
pl.when(pl.col("current_energy_rating") == "INVALID!")
.then(None)
.otherwise(pl.col("current_energy_rating"))
.alias("current_energy_rating"),
)
.with_columns(
(pl.col("latest_price") / pl.col("total_floor_area"))
.round(0)
.cast(pl.Int32)
.alias("Price per sqm"),
)
.drop(
"date_of_transfer",
"inspection_date",
"floor_height",
"LSOA name (2021)",
"Local Authority District code (2024)",
"Local Authority District name (2024)",
"Wider Barriers Sub-domain Score",
"Geographical Barriers Sub-domain Score",
"Adult Skills Sub-domain Score",
"Children and Young People Sub-domain Score",
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
"Income Deprivation Affecting Children Index (IDACI) Score (rate)",
"Barriers to Housing and Services Score",
"lsoa21",
"oa21",
"pp_property_type",
"built_form",
)
.rename(
{
"construction_age_band": "Approximate construction age",
"is_construction_date_approximate": "Is construction date approximate",
"pp_address": "Address per Property Register",
"epc_address": "Address per EPC",
"postcode": "Postcode",
"duration": "Leashold/Freehold",
"current_energy_rating": "Current energy rating",
"potential_energy_rating": "Potential energy rating",
"total_floor_area": "Total floor area (sqm)",
"epc_property_type": "Property type",
"property_type_built_form": "Property type/built form",
"restaurants_2km": "Restaurants within 2km",
"groceries_2km": "Groceries within 2km",
"parks_2km": "Parks within 2km",
"public_transport_2km": "Public transport within 2km",
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
"good_primary_5km": "Good+ primary schools within 5km",
"good_secondary_5km": "Good+ secondary schools within 5km",
"max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)",
}
)
)
print("Collecting with streaming engine...")
return wide.collect(engine="streaming")
def main():
    """Parse CLI arguments, build the wide property table and write it to parquet.

    Every input path is mandatory: ``_build_wide`` scans all of them
    unconditionally, so a missing path is rejected by argparse up front
    instead of failing later inside the pipeline.
    """
    parser = argparse.ArgumentParser(
        description="Build wide property dataframe with all joins"
    )
    parser.add_argument(
        "--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file"
    )
    parser.add_argument(
        "--iod",
        type=Path,
        required=True,
        help="Index of Deprivation parquet file",
    )
    parser.add_argument(
        "--poi-proximity",
        type=Path,
        # Previously not required, which let None reach pl.scan_parquet().
        required=True,
        help="POI proximity counts parquet file",
    )
    parser.add_argument(
        "--journey-times",
        required=True,
        type=Path,
        help="Journey times parquet file",
    )
    parser.add_argument(
        "--ethnicity",
        type=Path,
        required=True,
        help="Ethnicity by local authority parquet file",
    )
    parser.add_argument(
        "--crime",
        type=Path,
        required=True,
        help="Crime by LSOA parquet file",
    )
    parser.add_argument(
        "--noise", type=Path, required=True, help="Road noise by postcode parquet file"
    )
    parser.add_argument(
        "--school-proximity",
        type=Path,
        required=True,
        help="School proximity counts parquet file",
    )
    parser.add_argument(
        "--broadband",
        type=Path,
        required=True,
        help="Broadband performance by output area parquet file",
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = parser.parse_args()

    wide = _build_wide(
        epc_pp_path=args.epc_pp,
        arcgis_path=args.arcgis,
        iod_path=args.iod,
        poi_proximity_path=args.poi_proximity,
        journey_times_path=args.journey_times,
        ethnicity_path=args.ethnicity,
        crime_path=args.crime,
        noise_path=args.noise,
        school_proximity_path=args.school_proximity,
        broadband_path=args.broadband,
    )
    print(f"Columns: {wide.columns}")
    print(f"Rows: {wide.height}")

    wide.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
# Script entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,52 @@
"""Compute POI proximity counts per postcode from ArcGIS + filtered POIs."""
import argparse
from pathlib import Path
import polars as pl
from pipeline.utils.poi_counts import _count_pois_per_postcode
# Proximity-count groups: output group name -> POI category names counted in it.
POI_GROUPS = {
    "restaurants": [
        "Restaurant",
        "Fast Food",
    ],
    "groceries": [
        "Greengrocer",
        "Grocery Shop",
        "Supermarket",
        "Convenience Store",
    ],
    "parks": [
        "Park",
        "Garden",
        "Nature Reserve",
    ],
    # comes from naptan.py
    "public_transport": [
        "Metro or Tram stop",
        "Rail station",
        "Bus stop",
        "Bus station",
    ],
}
def main():
    """CLI entry point: count POIs per postcode for each POI_GROUPS group.

    Reads the ArcGIS postcode parquet and the filtered POI parquet, passes
    both to ``_count_pois_per_postcode`` with a 2 km radius, writes the result
    to ``--output`` and prints the size of the written file.
    """
    cli = argparse.ArgumentParser(description="Count POIs within radius per postcode")
    # All three options are mandatory paths; register them in a fixed order.
    for flag, description in (
        ("--arcgis", "ArcGIS postcode parquet"),
        ("--pois", "Filtered POIs parquet"),
        ("--output", "Output parquet path"),
    ):
        cli.add_argument(flag, type=Path, required=True, help=description)
    args = cli.parse_args()

    # Postcode coordinates, renamed to the column names used for counting.
    postcode_columns = [
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lon"),
    ]
    postcodes = pl.read_parquet(args.arcgis).select(*postcode_columns)
    pois = pl.read_parquet(args.pois)

    _count_pois_per_postcode(
        postcodes, pois, groups=POI_GROUPS, radius_km=2
    ).write_parquet(args.output)

    bytes_per_mb = 1024 * 1024
    size_mb = args.output.stat().st_size / bytes_per_mb
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
# Script entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,73 @@
"""Compute good-rated school proximity counts per postcode."""
import argparse
from pathlib import Path
import polars as pl
from pipeline.utils.poi_counts import _count_pois_per_postcode
# Each school category is counted as its own single-member group, so the group
# name and the category label are the same string.
SCHOOL_GROUPS = {label: [label] for label in ("good_primary", "good_secondary")}
def main():
    """CLI entry point: count good+ primary/secondary schools near each postcode.

    Filters the Ofsted data to Primary/Secondary schools whose overall
    effectiveness is "1" or "2", locates each school via its postcode in the
    ArcGIS data, counts schools per postcode with
    ``_count_pois_per_postcode`` and writes the result to ``--output``.
    """
    # Single source of truth for the search radius so the CLI description
    # cannot drift from the radius that is actually used (it used to say 2km).
    radius_km = 5

    parser = argparse.ArgumentParser(
        description=(
            f"Count good+ primary/secondary schools within {radius_km}km per postcode"
        )
    )
    parser.add_argument(
        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet path"
    )
    args = parser.parse_args()

    # Load Ofsted data: filter to good+ (1, 2) primary/secondary schools
    ofsted = pl.read_parquet(args.ofsted).filter(
        pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
        & pl.col("Overall effectiveness").is_in(["1", "2"])
    )
    print(f"Good+ schools: {len(ofsted):,}")

    # Assign category based on phase
    ofsted = ofsted.with_columns(
        pl.when(pl.col("Ofsted phase") == "Primary")
        .then(pl.lit("good_primary"))
        .otherwise(pl.lit("good_secondary"))
        .alias("category")
    ).select(
        pl.col("Postcode").alias("postcode"),
        "category",
    )

    # Join with arcgis to get lat/lng for each school's postcode
    arcgis = pl.read_parquet(args.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
    )
    # Inner join: schools whose postcode is not in the ArcGIS data are dropped.
    schools = ofsted.join(arcgis, on="postcode", how="inner")
    print(f"Schools with coordinates: {len(schools):,}")

    # Load all postcodes for proximity counting
    postcodes = arcgis.rename({"lng": "lon"})

    result = _count_pois_per_postcode(
        postcodes, schools, radius_km=radius_km, groups=SCHOOL_GROUPS
    )

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
# Script entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,673 @@
import argparse
import warnings
from pathlib import Path
import polars as pl
# Raw "<key>/<value>" categories that transform() filters out before mapping.
# Written as key -> values and expanded into the flat set of "key/value" strings.
DROP_CATEGORIES = {
    f"{key}/{value}"
    for key, values in {
        "amenity": (
            "advice",
            "atm",
            "bbq",
            "bench",
            "bicycle_parking",
            "clock",
            "fixme",
            "grit_bin",
            "hunting_stand",
            "motorcycle_parking",
            "notice_board",
            "parking",
            "parking_entrance",
            "parking_space",
            "post_box",
            "telephone",
            "toilets",
            "vacuum_cleaner",
            "waste_basket",
        ),
        "building": (
            "air_shaft",
            "apartments",
            "detached",
            "entrance",
            "entry",
            "garage",
            "garages",
            "house",
            "hut",
            "no",
            "office",
            "public",
            "residential",
            "roof",
            "shed",
            "terrace",
            "yes",
        ),
        "emergency": (
            "access_point",
            "ambulance_station",
            "assembly_point",
            "bleed_control_kit",
            "defibrillator",
            "designated",
            "dry_riser_inlet",
            "emergency_ward_entrance",
            "fire_alarm_box",
            "fire_extinguisher",
            "fire_hydrant",
            "fire_service_inlet",
            "first_aid_kit",
            "life_ring",
            "lifeguard",
            "no",
            "phone",
            "rescue_equipment",
            "siren",
            "throw_bag",
            "water_rescue",
            "yes",
        ),
        "leisure": (
            "firepit",
            "fishing",
            "picnic_table",
        ),
        "office": (
            "company",
            "yes",
        ),
        "tourism": (
            "apartment",
            "apartments",
            "camp_pitch",
            "information",
            "village_sign",
            "yes",
        ),
        # public transport comes from naptan
        "public_transport": (
            "entrance",
            "platform",
            "station",
            "stop_position",
        ),
    }.items()
    for value in values
}
# (friendly_name, emoji) for every category we keep.
# transform() raises ValueError if any friendly name or emoji here is empty,
# so every entry must carry a non-empty value for both.
CATEGORY_MAP: dict[str, tuple[str, str]] = {
    # amenity
    "amenity/animal_boarding": ("Animal Boarding", "🐾"),
    "amenity/animal_breeding": ("Animal Breeding", "🐣"),
    "amenity/animal_shelter": ("Animal Shelter", "🏠"),
    "amenity/arts_centre": ("Arts Centre", "🎨"),
    "amenity/bank": ("Bank", "🏦"),
    "amenity/bar": ("Bar", "🍸"),
    "amenity/bicycle_rental": ("Bike Rental", "🚲"),
    "amenity/bicycle_repair_station": ("Bike Repair", "🔧"),
    "amenity/binoculars": ("Public Binoculars", "🔭"),
    "amenity/boat_rental": ("Boat Rental", "⛵"),
    "amenity/boat_storage": ("Boat Storage", "🚢"),
    "amenity/boot_scraper": ("Boot Scraper", "🥾"),
    "amenity/bureau_de_change": ("Currency Exchange", "💱"),
    "amenity/bus_station": ("Bus Station", "🚌"),
    "amenity/cafe": ("Café", "☕"),
    "amenity/car_rental": ("Car Rental", "🚗"),
    "amenity/car_sharing": ("Car Sharing", "🚙"),
    "amenity/car_wash": ("Car Wash", "🧽"),
    "amenity/care_home": ("Care Home", "🏥"),
    "amenity/casino": ("Casino", "🎰"),
    "amenity/charging_station": ("EV Charging", "🔌"),
    "amenity/check_in": ("Check-In Point", "✅"),
    "amenity/childcare": ("Childcare", "👶"),
    "amenity/cinema": ("Cinema", "🎬"),
    "amenity/clinic": ("Clinic", "🩺"),
    "amenity/club": ("Club", "🏛️"),
    "amenity/college": ("College", "🎓"),
    "amenity/community_centre": ("Community Centre", "🤝"),
    "amenity/compressed_air": ("Compressed Air", "💨"),
    "amenity/conference_centre": ("Conference Centre", "📋"),
    "amenity/courthouse": ("Courthouse", "⚖️"),
    "amenity/coworking_space": ("Co-working Space", "💻"),
    "amenity/crematorium": ("Crematorium", "🕯️"),
    "amenity/dancing_school": ("Dance School", "💃"),
    "amenity/dentist": ("Dentist", "🦷"),
    "amenity/doctors": ("Doctor", "👨‍⚕️"),
    "amenity/dojo": ("Dojo", "🥋"),
    "amenity/donation_box": ("Donation Box", "📦"),
    "amenity/dressing_room": ("Dressing Room", "👗"),
    "amenity/drinking_water": ("Drinking Water", "🚰"),
    "amenity/driving_school": ("Driving School", "🚦"),
    "amenity/escooter_rental": ("E-Scooter Rental", "🛴"),
    "amenity/events_venue": ("Events Venue", "🎪"),
    "amenity/fast_food": ("Fast Food", "🍔"),
    "amenity/feeding_place": ("Feeding Place", "🍽️"),
    "amenity/ferry_terminal": ("Ferry Terminal", "⛴️"),
    "amenity/fire_station": ("Fire Station", "🚒"),
    "amenity/food_court": ("Food Court", "🍴"),
    "amenity/fountain": ("Fountain", "⛲"),
    "amenity/fuel": ("Fuel Station", "⛽"),
    "amenity/gambling": ("Gambling", "🎲"),
    "amenity/grave_yard": ("Graveyard", "🪦"),
    "amenity/hall": ("Hall", "🏛️"),
    "amenity/hookah_lounge": ("Hookah Lounge", "💨"),
    "amenity/hospital": ("Hospital", "🏥"),
    "amenity/ice_cream": ("Ice Cream", "🍦"),
    "amenity/internet_cafe": ("Internet Café", "🌐"),
    "amenity/kick-scooter_rental": ("Kick Scooter Rental", "🛴"),
    "amenity/kindergarten": ("Kindergarten", "💒"),
    "amenity/language_school": ("Language School", "🗣️"),
    "amenity/letter_box": ("Letter Box", "📮"),
    "amenity/library": ("Library", "📚"),
    "amenity/loading_dock": ("Loading Dock", "📥"),
    "amenity/lounge": ("Lounge", "🛋️"),
    "amenity/lounger": ("Public Lounger", "🪑"),
    "amenity/marketplace": ("Market", "🛒"),
    "amenity/money_transfer": ("Money Transfer", "💸"),
    "amenity/mounting_block": ("Mounting Block", "🐴"),
    "amenity/music_school": ("Music School", "🎵"),
    "amenity/music_venue": ("Music Venue", "🎶"),
    "amenity/nightclub": ("Nightclub", "🪩"),
    "amenity/nursing_home": ("Nursing Home", "🏠"),
    "amenity/parcel_locker": ("Parcel Locker", "📦"),
    "amenity/payment_terminal": ("Payment Terminal", "💳"),
    "amenity/pharmacy": ("Pharmacy", "💊"),
    "amenity/photo_booth": ("Photo Booth", "📸"),
    "amenity/piano": ("Public Piano", "🎹"),
    "amenity/place_of_worship": ("Place of Worship", "⛪"),
    "amenity/police": ("Police Station", "🚔"),
    "amenity/post_depot": ("Post Depot", "📬"),
    "amenity/post_office": ("Post Office", "🏤"),
    "amenity/prep_school": ("Prep School", "📖"),
    "amenity/pub": ("Pub", "🍺"),
    "amenity/public_bookcase": ("Public Bookcase", "📕"),
    "amenity/public_building": ("Public Building", "🏢"),
    "amenity/reception_desk": ("Reception Desk", "🛎️"),
    "amenity/recycling": ("Recycling", "♻️"),
    "amenity/restaurant": ("Restaurant", "🍽️"),
    "amenity/sanitary_dump_station": ("Sanitary Dump Station", "🚿"),
    "amenity/school": ("School", "🏫"),
    "amenity/scout_hut": ("Scout Hut", "⚜️"),
    "amenity/shelter": ("Shelter", "🛖"),
    "amenity/shower": ("Public Shower", "🚿"),
    "amenity/smoking_area": ("Smoking Area", "🚬"),
    "amenity/social_centre": ("Social Centre", "🏘️"),
    "amenity/social_club": ("Social Club", "🤝"),
    "amenity/social_facility": ("Social Facility", "🫂"),
    "amenity/stripclub": ("Strip Club", "🔞"),
    "amenity/studio": ("Studio", "🎙️"),
    "amenity/table": ("Public Table", "🪑"),
    "amenity/taxi": ("Taxi Stand", "🚕"),
    "amenity/telescope": ("Public Telescope", "🔭"),
    "amenity/theatre": ("Theatre", "🎭"),
    "amenity/ticket_validator": ("Ticket Validator", "🎫"),
    "amenity/townhall": ("Town Hall", "🏛️"),
    "amenity/training": ("Training Centre", "📝"),
    "amenity/trolley_bay": ("Trolley Bay", "🛒"),
    "amenity/university": ("University", "🏫"),
    "amenity/vehicle_inspection": ("Vehicle Inspection", "🔍"),
    "amenity/vending_machine": ("Vending Machine", "🏧"),
    "amenity/veterinary": ("Vet", "🐕"),
    "amenity/washing_machine": ("Washing Machine", "🧺"),
    "amenity/washingline": ("Washing Line", "👕"),
    "amenity/waste_disposal": ("Waste Disposal", "🗑️"),
    "amenity/waste_transfer_station": ("Waste Transfer Station", "🚛"),
    "amenity/water_point": ("Water Point", "💧"),
    "amenity/watering_place": ("Watering Place", "🚰"),
    "amenity/weighbridge": ("Weighbridge", "⚖️"),
    # building
    "building/barn": ("Barn", "🏚️"),
    "building/bunker": ("Bunker", "🏗️"),
    "building/chapel": ("Chapel", "⛪"),
    "building/church": ("Church", "⛪"),
    "building/commercial": ("Commercial Building", "🏬"),
    "building/construction": ("Construction Site", "🚧"),
    "building/farm": ("Farmhouse", "🌾"),
    "building/greenhouse": ("Greenhouse", "🌿"),
    "building/industrial": ("Industrial Building", "🏭"),
    "building/kiosk": ("Kiosk", "🏪"),
    "building/retail": ("Retail Building", "🏬"),
    "building/ruins": ("Ruins", "🏚️"),
    "building/school": ("School Building", "🏫"),
    "building/semidetached_house": ("Semi-Detached House", "🏠"),
    "building/service": ("Service Building", "🔧"),
    "building/university": ("University Building", "🎓"),
    "building/warehouse": ("Warehouse", "🏭"),
    # craft
    "craft/agricultural_engines": ("Agricultural Engines", "🚜"),
    "craft/atelier": ("Atelier", "🎨"),
    "craft/blacksmith": ("Blacksmith", "🔨"),
    "craft/bookbinder": ("Bookbinder", "📖"),
    "craft/brewery": ("Brewery", "🍺"),
    "craft/builder": ("Builder", "🧱"),
    "craft/carpenter": ("Carpenter", "🪚"),
    "craft/caterer": ("Caterer", "🍱"),
    "craft/cleaning": ("Cleaning Service", "🧹"),
    "craft/confectionery": ("Confectioner", "🍬"),
    "craft/distillery": ("Distillery", "🥃"),
    "craft/dressmaker": ("Dressmaker", "👗"),
    "craft/electrician": ("Electrician", "⚡"),
    "craft/electronics_repair": ("Electronics Repair", "🔌"),
    "craft/floorer": ("Flooring Specialist", "🪵"),
    "craft/gardener": ("Gardener", "🌱"),
    "craft/glaziery": ("Glazier", "🪟"),
    "craft/handicraft": ("Handicraft", "✂️"),
    "craft/hvac": ("HVAC", "❄️"),
    "craft/jeweller": ("Jeweller", "💎"),
    "craft/joiner": ("Joiner", "🪚"),
    "craft/key_cutter": ("Key Cutter", "🔑"),
    "craft/locksmith": ("Locksmith", "🔐"),
    "craft/metal_construction": ("Metal Fabrication", "🔩"),
    "craft/painter": ("Painter & Decorator", "🖌️"),
    "craft/photographer": ("Photographer", "📷"),
    "craft/photographic_laboratory": ("Photo Lab", "🖼️"),
    "craft/plumber": ("Plumber", "🔧"),
    "craft/pottery": ("Pottery", "🏺"),
    "craft/printer": ("Printer", "🖨️"),
    "craft/roofer": ("Roofer", "🏠"),
    "craft/sawmill": ("Sawmill", "🪵"),
    "craft/scaffolder": ("Scaffolder", "🏗️"),
    "craft/sculptor": ("Sculptor", "🗿"),
    "craft/shoemaker": ("Shoemaker", "👞"),
    "craft/signmaker": ("Sign Maker", "🪧"),
    "craft/stonemason": ("Stonemason", "🪨"),
    "craft/tailor": ("Tailor", "🧵"),
    "craft/upholsterer": ("Upholsterer", "🛋️"),
    "craft/watchmaker": ("Watchmaker", "⌚"),
    "craft/window_construction": ("Window Fitter", "🪟"),
    "craft/winery": ("Winery", "🍷"),
    "craft/yes": ("Craft Workshop", "🛠️"),
    # healthcare
    "healthcare/alternative": ("Alternative Medicine", "🌿"),
    "healthcare/audiologist": ("Audiologist", "👂"),
    "healthcare/centre": ("Health Centre", "🏥"),
    "healthcare/clinic": ("Health Clinic", "🩺"),
    "healthcare/counselling": ("Counselling", "🧠"),
    "healthcare/dentist": ("Dental Practice", "🦷"),
    "healthcare/doctor": ("GP Surgery", "👨‍⚕️"),
    "healthcare/hospital": ("Hospital", "🏥"),
    "healthcare/laboratory": ("Medical Lab", "🔬"),
    "healthcare/optometrist": ("Optometrist", "👁️"),
    "healthcare/pharmacy": ("Pharmacy", "💊"),
    "healthcare/physiotherapist": ("Physiotherapist", "🏃"),
    "healthcare/podiatrist": ("Podiatrist", "🦶"),
    "healthcare/psychotherapist": ("Psychotherapist", "🧠"),
    "healthcare/rehabilitation": ("Rehabilitation Centre", "♿"),
    "healthcare/vaccination_centre": ("Vaccination Centre", "💉"),
    "healthcare/yes": ("Healthcare Facility", "🏥"),
    # leisure
    "leisure/adult_gaming_centre": ("Adult Gaming Centre", "🎮"),
    "leisure/amusement_arcade": ("Amusement Arcade", "🕹️"),
    "leisure/bandstand": ("Bandstand", "🎺"),
    "leisure/bathing_place": ("Bathing Spot", "🏖️"),
    "leisure/bird_hide": ("Bird Hide", "🐦"),
    "leisure/bowling_alley": ("Bowling Alley", "🎳"),
    "leisure/common": ("Common Land", "🌳"),
    "leisure/dance": ("Dance Venue", "💃"),
    "leisure/dog_park": ("Dog Park", "🐕"),
    "leisure/escape_game": ("Escape Room", "🔓"),
    "leisure/fitness_centre": ("Gym", "🏋️"),
    "leisure/fitness_station": ("Outdoor Gym", "💪"),
    "leisure/garden": ("Garden", "🌷"),
    "leisure/golf_course": ("Golf Course", "⛳"),
    "leisure/hackerspace": ("Hackerspace", "💻"),
    "leisure/horse_riding": ("Horse Riding", "🐎"),
    "leisure/indoor_play": ("Indoor Play Area", "🧒"),
    "leisure/marina": ("Marina", "⚓"),
    "leisure/miniature_golf": ("Mini Golf", "⛳"),
    "leisure/nature_reserve": ("Nature Reserve", "🦔"),
    "leisure/outdoor_seating": ("Outdoor Seating", "🪑"),
    "leisure/park": ("Park", "🌳"),
    "leisure/pitch": ("Sports Pitch", "⚽"),
    "leisure/playground": ("Playground", "🛝"),
    "leisure/sauna": ("Sauna", "🧖"),
    "leisure/slipway": ("Slipway", "🚤"),
    "leisure/social_club": ("Social Club", "🍻"),
    "leisure/sports_centre": ("Sports Centre", "🏟️"),
    "leisure/sports_hall": ("Sports Hall", "🏀"),
    "leisure/swimming_pool": ("Swimming Pool", "🏊"),
    "leisure/tanning_salon": ("Tanning Salon", "☀️"),
    "leisure/track": ("Running Track", "🏃"),
    "leisure/trampoline_park": ("Trampoline Park", "🤸"),
    "leisure/water_park": ("Water Park", "🌊"),
    "leisure/wildlife_hide": ("Wildlife Hide", "🦌"),
    "leisure/yes": ("Leisure Facility", "🎉"),
    # office
    "office/accountant": ("Accountant", "🧮"),
    "office/advertising_agency": ("Advertising Agency", "📢"),
    "office/architect": ("Architect", "📐"),
    "office/association": ("Association", "🏛️"),
    "office/charity": ("Charity", "❤️"),
    "office/construction_company": ("Construction Company", "🏗️"),
    "office/consulting": ("Consulting Firm", "📊"),
    "office/courier": ("Courier Service", "📦"),
    "office/coworking": ("Co-working Space", "💻"),
    "office/design": ("Design Studio", "🎨"),
    "office/diplomatic": ("Diplomatic Office", "🏛️"),
    "office/educational_institution": ("Education Office", "🎓"),
    "office/employment_agency": ("Employment Agency", "💼"),
    "office/energy_supplier": ("Energy Supplier", "⚡"),
    "office/engineer": ("Engineering Firm", "⚙️"),
    "office/estate_agent": ("Estate Agent", "🏠"),
    "office/financial": ("Financial Services", "💰"),
    "office/financial_advisor": ("Financial Advisor", "📈"),
    "office/foundation": ("Foundation", "🏛️"),
    "office/government": ("Government Office", "🏛️"),
    "office/graphic_design": ("Graphic Design", "🖌️"),
    "office/healthcare": ("Healthcare Office", "🏥"),
    "office/home_care": ("Home Care Service", "🏠"),
    "office/insurance": ("Insurance", "🛡️"),
    "office/interior_design": ("Interior Design", "🛋️"),
    "office/it": ("IT Company", "💻"),
    "office/lawyer": ("Lawyer", "⚖️"),
    "office/logistics": ("Logistics", "🚚"),
    "office/marketing": ("Marketing Agency", "📣"),
    "office/mortgage": ("Mortgage Broker", "🏦"),
    "office/moving_company": ("Moving Company", "📦"),
    "office/newspaper": ("Newspaper Office", "📰"),
    "office/ngo": ("NGO", "🌍"),
    "office/notary": ("Notary", "📜"),
    "office/political_party": ("Political Party", "🗳️"),
    "office/politician": ("Politician Office", "🏛️"),
    "office/property_management": ("Property Management", "🏘️"),
    "office/recruitment": ("Recruitment Agency", "👥"),
    "office/religion": ("Religious Office", "✝️"),
    "office/research": ("Research Office", "🔬"),
    "office/security": ("Security Company", "🔒"),
    "office/solicitor": ("Solicitor", "⚖️"),
    "office/surveyor": ("Surveyor", "📏"),
    "office/tax_advisor": ("Tax Advisor", "🧾"),
    "office/taxi": ("Taxi Office", "🚕"),
    "office/telecommunication": ("Telecoms Office", "📡"),
    "office/therapist": ("Therapist", "🧠"),
    "office/travel_agent": ("Travel Agent", "✈️"),
    "office/union": ("Trade Union", "✊"),
    "office/university": ("University Office", "🎓"),
    "office/vacant": ("Vacant Office", "🏚️"),
    "office/web_design": ("Web Design", "🌐"),
    # shop
    "shop/accessories": ("Accessories Shop", "👜"),
    "shop/agrarian": ("Farm Supply Shop", "🌾"),
    "shop/alcohol": ("Off-Licence", "🍷"),
    "shop/antiques": ("Antiques Shop", "🏺"),
    "shop/appliance": ("Appliance Shop", "🔌"),
    "shop/art": ("Art Shop", "🎨"),
    "shop/baby_goods": ("Baby Shop", "🍼"),
    "shop/bag": ("Bag Shop", "👜"),
    "shop/bakery": ("Bakery", "🥐"),
    "shop/bathroom": ("Bathroom Shop", "🛁"),
    "shop/bathroom_furnishing": ("Bathroom Furnishings", "🚿"),
    "shop/beauty": ("Beauty Shop", "💄"),
    "shop/bed": ("Bed Shop", "🛏️"),
    "shop/beverages": ("Drinks Shop", "🥤"),
    "shop/bicycle": ("Bike Shop", "🚲"),
    "shop/boat": ("Boat Shop", "⛵"),
    "shop/bookmaker": ("Bookmaker", "🏇"),
    "shop/books": ("Bookshop", "📚"),
    "shop/boutique": ("Boutique", "👗"),
    "shop/building_materials": ("Building Materials", "🧱"),
    "shop/butcher": ("Butcher", "🥩"),
    "shop/camera": ("Camera Shop", "📷"),
    "shop/candles": ("Candle Shop", "🕯️"),
    "shop/car": ("Car Dealership", "🚗"),
    "shop/car;car_repair": ("Car Sales & Repair", "🚗"),
    "shop/car_parts": ("Car Parts", "🔩"),
    "shop/car_repair": ("Car Repair", "🔧"),
    "shop/caravan": ("Caravan Dealer", "🚐"),
    "shop/carpet": ("Carpet Shop", "🧶"),
    "shop/catalogue": ("Catalogue Shop", "📋"),
    "shop/charity": ("Charity Shop", "❤️"),
    "shop/cheese": ("Cheese Shop", "🧀"),
    "shop/chemist": ("Chemist", "🧪"),
    "shop/chocolate": ("Chocolate Shop", "🍫"),
    "shop/clothes": ("Clothes Shop", "👕"),
    "shop/coffee": ("Coffee Shop", "☕"),
    "shop/collector": ("Collector Shop", "🏆"),
    "shop/computer": ("Computer Shop", "🖥️"),
    "shop/confectionery": ("Sweet Shop", "🍬"),
    "shop/convenience": ("Convenience Store", "🏪"),
    "shop/copyshop": ("Copy Shop", "🖨️"),
    "shop/cosmetics": ("Cosmetics Shop", "💅"),
    "shop/country_store": ("Country Store", "🏡"),
    "shop/craft": ("Craft Shop", "✂️"),
    "shop/curtain": ("Curtain Shop", "🪟"),
    "shop/dairy": ("Dairy Shop", "🥛"),
    "shop/deli": ("Delicatessen", "🧆"),
    "shop/department_store": ("Department Store", "🏬"),
    "shop/discount": ("Discount Store", "💲"),
    "shop/doityourself": ("DIY Store", "🔨"),
    "shop/doors": ("Door Shop", "🚪"),
    "shop/dry_cleaning": ("Dry Cleaner", "👔"),
    "shop/e-cigarette": ("Vape Shop", "💨"),
    "shop/electrical": ("Electrical Shop", "⚡"),
    "shop/electronics": ("Electronics Shop", "📱"),
    "shop/erotic": ("Adult Shop", "🔞"),
    "shop/esoteric": ("Esoteric Shop", "🔮"),
    "shop/estate_agent": ("Estate Agent", "🏠"),
    "shop/fabric": ("Fabric Shop", "🧵"),
    "shop/fan": ("Fan Shop", "🏅"),
    "shop/farm": ("Farm Shop", "🥕"),
    "shop/fashion_accessories": ("Fashion Accessories", "👒"),
    "shop/fireplace": ("Fireplace Shop", "🔥"),
    "shop/fishing": ("Fishing Shop", "🎣"),
    "shop/flooring": ("Flooring Shop", "🪵"),
    "shop/florist": ("Florist", "💐"),
    "shop/food": ("Food Shop", "🍞"),
    "shop/frame": ("Framing Shop", "🖼️"),
    "shop/frozen_food": ("Frozen Food Shop", "🧊"),
    "shop/fuel": ("Fuel Shop", "⛽"),
    "shop/funeral_directors": ("Funeral Director", "⚰️"),
    "shop/furniture": ("Furniture Shop", "🪑"),
    "shop/games": ("Games Shop", "🎮"),
    "shop/garden_centre": ("Garden Centre", "🌻"),
    "shop/gas": ("Gas Shop", "🔥"),
    "shop/general": ("General Store", "🏪"),
    "shop/gift": ("Gift Shop", "🎁"),
    "shop/glaziery": ("Glazier", "🪟"),
    "shop/greengrocer": ("Greengrocer", "🥬"),
    "shop/grocery": ("Grocery Shop", "🛒"),
    "shop/haberdashery": ("Haberdashery", "🧵"),
    "shop/hairdresser": ("Hairdresser", "💇"),
    "shop/hairdresser_supply": ("Hairdresser Supply", "💇"),
    "shop/hardware": ("Hardware Shop", "🔩"),
    "shop/health": ("Health Shop", "🌿"),
    "shop/health_food": ("Health Food Shop", "🥗"),
    "shop/hearing_aids": ("Hearing Aid Shop", "👂"),
    "shop/herbalist": ("Herbalist", "🌿"),
    "shop/hifi": ("Hi-Fi Shop", "🔊"),
    "shop/household": ("Household Shop", "🏠"),
    "shop/household_linen": ("Linen Shop", "🛏️"),
    "shop/houseware": ("Houseware Shop", "🍳"),
    "shop/ice_cream": ("Ice Cream Shop", "🍦"),
    "shop/interior_decoration": ("Interior Decoration", "🖼️"),
    "shop/jewelry": ("Jewellery Shop", "💍"),
    "shop/kiosk": ("Kiosk", "🏪"),
    "shop/kitchen": ("Kitchen Shop", "🍳"),
    "shop/laundry": ("Laundry", "🧺"),
    "shop/leather": ("Leather Shop", "🧳"),
    "shop/lighting": ("Lighting Shop", "💡"),
    "shop/locksmith": ("Locksmith", "🔐"),
    "shop/mall": ("Shopping Centre", "🏬"),
    "shop/massage": ("Massage Parlour", "💆"),
    "shop/medical_supply": ("Medical Supply", "🩺"),
    "shop/military_surplus": ("Military Surplus", "🎖️"),
    "shop/mobile_phone": ("Mobile Phone Shop", "📱"),
    "shop/mobile_phone_accessories": ("Phone Accessories", "📱"),
    "shop/mobility": ("Mobility Shop", "♿"),
    "shop/mobility_scooter": ("Mobility Scooter Shop", "🦽"),
    "shop/model": ("Model Shop", "✈️"),
    "shop/money_lender": ("Money Lender", "💰"),
    "shop/motorcycle": ("Motorcycle Shop", "🏍️"),
    "shop/motorcycle_repair": ("Motorcycle Repair", "🔧"),
    "shop/music": ("Music Shop", "🎵"),
    "shop/musical_instrument": ("Musical Instrument Shop", "🎸"),
    "shop/newsagent": ("Newsagent", "📰"),
    "shop/nutrition_supplements": ("Nutrition Shop", "💪"),
    "shop/optician": ("Optician", "👓"),
    "shop/outdoor": ("Outdoor Shop", "🏕️"),
    "shop/outpost": ("Outpost", "📦"),
    "shop/paint": ("Paint Shop", "🎨"),
    "shop/party": ("Party Shop", "🎈"),
    "shop/pastry": ("Pastry Shop", "🥐"),
    "shop/pawnbroker": ("Pawnbroker", "💰"),
    "shop/perfumery": ("Perfumery", "🌸"),
    "shop/pet": ("Pet Shop", "🐾"),
    "shop/pet_grooming": ("Pet Grooming", "🐩"),
    "shop/photo": ("Photo Shop", "📸"),
    "shop/piercing": ("Piercing Studio", "💎"),
    "shop/plant_hire": ("Plant Hire", "🚜"),
    "shop/pottery": ("Pottery Shop", "🏺"),
    "shop/printer_ink": ("Ink & Toner Shop", "🖨️"),
    "shop/printing": ("Print Shop", "🖨️"),
    "shop/psychic": ("Psychic", "🔮"),
    "shop/pyrotechnics": ("Fireworks Shop", "🎆"),
    "shop/religion": ("Religious Shop", "✝️"),
    "shop/rental": ("Rental Shop", "🔑"),
    "shop/repair": ("Repair Shop", "🔧"),
    "shop/scuba_diving": ("Scuba Diving Shop", "🤿"),
    "shop/seafood": ("Fishmonger", "🐟"),
    "shop/second_hand": ("Second-Hand Shop", "♻️"),
    "shop/security": ("Security Shop", "🔒"),
    "shop/sewing": ("Sewing Shop", "🪡"),
    "shop/shoe_repair": ("Shoe Repair", "👞"),
    "shop/shoes": ("Shoe Shop", "👟"),
    "shop/sports": ("Sports Shop", "⚽"),
    "shop/stationery": ("Stationery Shop", "✏️"),
    "shop/storage_rental": ("Self Storage", "📦"),
    "shop/supermarket": ("Supermarket", "🛒"),
    "shop/swimming_pool": ("Pool Supplies", "🏊"),
    "shop/tailor": ("Tailor", "🧵"),
    "shop/tattoo": ("Tattoo Studio", "🖋️"),
    "shop/taxi": ("Taxi Booking", "🚕"),
    "shop/tea": ("Tea Shop", "🫖"),
    "shop/telecommunication": ("Telecoms Shop", "📡"),
    "shop/ticket": ("Ticket Office", "🎫"),
    "shop/tiles": ("Tile Shop", "🔲"),
    "shop/tobacco": ("Tobacconist", "🚬"),
    "shop/tool_hire": ("Tool Hire", "🧰"),
    "shop/toys": ("Toy Shop", "🧸"),
    "shop/trade": ("Trade Supplier", "🏭"),
    "shop/travel_agency": ("Travel Agency", "✈️"),
    "shop/trophy": ("Trophy Shop", "🏆"),
    "shop/tyres": ("Tyre Shop", "🛞"),
    "shop/vacant": ("Vacant Shop", "🏚️"),
    "shop/variety_store": ("Variety Store", "🏪"),
    "shop/video": ("Video Shop", "📀"),
    "shop/video_games": ("Video Game Shop", "🎮"),
    "shop/watches": ("Watch Shop", "⌚"),
    "shop/water_sports": ("Water Sports Shop", "🏄"),
    "shop/weapons": ("Weapons Shop", "🗡️"),
    "shop/wedding": ("Wedding Shop", "💒"),
    "shop/wholesale": ("Wholesaler", "📦"),
    "shop/wigs": ("Wig Shop", "💇"),
    "shop/window_blind": ("Blinds Shop", "🪟"),
    "shop/windows": ("Window Shop", "🪟"),
    "shop/wine": ("Wine Shop", "🍷"),
    "shop/wool": ("Wool Shop", "🧶"),
    "shop/yes": ("Shop", "🛍️"),
    # tourism
    "tourism/artwork": ("Public Artwork", "🎨"),
    "tourism/attraction": ("Tourist Attraction", "📸"),
    "tourism/camp_site": ("Campsite", "⛺"),
    "tourism/caravan_site": ("Caravan Site", "🚐"),
    "tourism/chalet": ("Chalet", "🏔️"),
    "tourism/gallery": ("Gallery", "🖼️"),
    "tourism/guest_house": ("Guest House", "🏡"),
    "tourism/hostel": ("Hostel", "🛏️"),
    "tourism/hotel": ("Hotel", "🏨"),
    "tourism/motel": ("Motel", "🏨"),
    "tourism/museum": ("Museum", "🏛️"),
    "tourism/picnic_site": ("Picnic Site", "🧺"),
    "tourism/preserved_railway": ("Heritage Railway", "🚂"),
    "tourism/theme_park": ("Theme Park", "🎢"),
    "tourism/viewpoint": ("Viewpoint", "🔭"),
    "tourism/zoo": ("Zoo", "🦁"),
}
# Emoji for each NaPTAN `category` value. transform() applies this with
# replace_strict to the NaPTAN frame's `category` column to build its `emoji`
# column.
NAPTAN_EMOJIS: dict[str, str] = {
    "Airport": "✈️",
    "Ferry": "⛴️",
    "Rail station": "🚆",
    "Bus stop": "🚏",
    "Bus station": "🚌",
    "Taxi rank": "🚕",
    "Metro or Tram stop": "🚊",
}
def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame:
    """Filter raw POIs and attach a friendly name, emoji and group to each row.

    Args:
        input_path: Parquet file of raw POIs with a ``category`` column.
        naptan_path: Optional parquet file of NaPTAN stops. When given, its rows
            get an emoji from ``NAPTAN_EMOJIS`` and the group "Public Transport"
            and are appended to the POIs. When ``None``, only the POIs are
            returned.

    Returns:
        A LazyFrame in which ``category`` holds the friendly name and ``emoji``
        and ``group`` columns have been added.

    Raises:
        ValueError: If the data contains a category that is neither dropped nor
            mapped, if ``CATEGORY_MAP`` has a key absent from the data, or if a
            mapping entry has an empty friendly name or emoji.
    """
    lf = pl.scan_parquet(input_path)

    # Get all unique categories present in the data
    all_categories = (
        lf.select("category").unique().collect(engine="streaming").to_series().to_list()
    )

    # Verify every non-dropped category has a mapping
    unmapped = [
        cat
        for cat in all_categories
        if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP
    ]
    if unmapped:
        raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")

    # Verify every CATEGORY_MAP key actually exists in the data (catch typos)
    all_set = set(all_categories)
    mapped_but_absent = [cat for cat in CATEGORY_MAP if cat not in all_set]
    if mapped_but_absent:
        raise ValueError(
            f"CATEGORY_MAP contains categories not in data: {sorted(mapped_but_absent)}"
        )

    # Drop unwanted categories
    lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))

    # Build name and emoji lookup tables
    name_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
    emoji_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}

    # Check no friendly names or emojis are missing (defensive)
    missing_names = [k for k, v in CATEGORY_MAP.items() if not v[0]]
    if missing_names:
        raise ValueError(f"Empty friendly names for: {missing_names}")
    missing_emojis = [k for k, v in CATEGORY_MAP.items() if not v[1]]
    if missing_emojis:
        raise ValueError(f"Empty emojis for: {missing_emojis}")

    # Derive group from the first component of the raw category key, title-cased
    group_mapping = {
        k: k.split("/")[0].replace("_", " ").title() for k in CATEGORY_MAP
    }

    # All three expressions read the *raw* category: expressions inside one
    # with_columns call are evaluated against the input frame, so overwriting
    # "category" here does not affect the emoji/group lookups.
    lf = lf.with_columns(
        pl.col("category").replace_strict(group_mapping).alias("group"),
        pl.col("category").replace_strict(name_mapping).alias("category"),
        pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
    )

    # The signature advertises naptan_path as optional; honour that instead of
    # handing None to scan_parquet.
    if naptan_path is None:
        return lf

    naptan = pl.scan_parquet(naptan_path).with_columns(
        pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
        pl.lit("Public Transport").alias("group"),
    )
    return pl.concat([lf, naptan], how="diagonal_relaxed")
def main():
    """CLI entry point: transform the raw POIs, write the result, print a summary.

    Requires ``--input``, ``--naptan`` and ``--output`` paths. After writing the
    parquet file it prints the file size, the row count and a per-category
    count sorted from most to least frequent.
    """
    cli = argparse.ArgumentParser(
        description="Transform raw POIs to filtered version with friendly names"
    )
    for flag, help_text in (
        ("--input", "Raw POIs parquet file"),
        ("--naptan", "NaPTAN stations parquet file"),
        ("--output", "Output filtered POIs parquet file"),
    ):
        cli.add_argument(flag, type=Path, required=True, help=help_text)
    opts = cli.parse_args()

    pois = transform(opts.input, opts.naptan).collect(engine="streaming")
    pois.write_parquet(opts.output)

    megabytes = opts.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {opts.output} ({megabytes:.1f} MB, {len(pois):,} POIs)")

    print(f"\nCategories ({pois['category'].n_unique()}):")
    per_category = (
        pois.group_by("category", "emoji").len().sort("len", descending=True)
    )
    for entry in per_category.iter_rows(named=True):
        print(f" {entry['emoji']} {entry['category']}: {entry['len']:,}")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,13 @@
from .download import download, extract_zip
from .fuzzy_join import fuzzy_join_on_postcode
from .haversine import haversine_km, haversine_km_expr
from .poi_counts import count_pois_within_radius
__all__ = [
"download",
"extract_zip",
"fuzzy_join_on_postcode",
"haversine_km",
"haversine_km_expr",
"count_pois_within_radius",
]

View file

@ -0,0 +1,40 @@
"""Shared download and extraction helpers for pipeline scripts."""
import zipfile
from pathlib import Path
import httpx
from tqdm import tqdm
def download(url: str, output_path: Path, *, timeout: float = 120) -> None:
    """Stream ``url`` into ``output_path`` while showing a tqdm progress bar.

    Redirects are followed. ``timeout`` is the read timeout in seconds; the
    other timeout phases use 30 seconds. A non-success HTTP status raises via
    ``raise_for_status``. When the response has no (or a zero) content-length
    the bar runs without a total.
    """
    limits = httpx.Timeout(30.0, read=timeout)
    with httpx.stream("GET", url, follow_redirects=True, timeout=limits) as response:
        response.raise_for_status()  # pyright: ignore[reportUnusedCallResult]
        expected_bytes = int(response.headers.get("content-length", 0))
        with open(output_path, "wb") as sink:
            with tqdm(
                total=expected_bytes or None,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                desc=output_path.name,
            ) as progress:
                for block in response.iter_bytes(chunk_size=8192):
                    sink.write(block)
                    progress.update(len(block))
def extract_zip(zip_path: Path, extract_dir: Path) -> None:
    """Unpack every member of the archive at ``zip_path`` under ``extract_dir``.

    ``extract_dir`` (and any missing parents) is created first; an existing
    directory is reused.
    """
    extract_dir.mkdir(parents=True, exist_ok=True)
    archive = zipfile.ZipFile(zip_path, mode="r")
    with archive:
        archive.extractall(path=extract_dir)

View file

@ -0,0 +1,194 @@
import re
import shutil
import tempfile
from concurrent.futures import ProcessPoolExecutor
from os import cpu_count
from pathlib import Path
import polars as pl
from thefuzz import fuzz
from tqdm import tqdm
_NUMBER_RE = re.compile(r"\d+")
def _normalize(s: pl.Expr) -> pl.Expr:
    """Canonicalise an address expression for comparison.

    Upper-cases the text, turns commas, dots and hyphens into spaces, collapses
    whitespace runs to a single space and trims both ends.
    """
    upper = s.str.to_uppercase()
    without_punctuation = upper.str.replace_all(r"[,.\-]", " ")
    single_spaced = without_punctuation.str.replace_all(r"\s+", " ")
    return single_spaced.str.strip_chars()
def fuzzy_join_on_postcode(
    left: pl.LazyFrame,
    right: pl.LazyFrame,
    left_address_col: str,
    right_address_col: str,
    left_postcode_col: str,
    right_postcode_col: str,
) -> pl.LazyFrame:
    """Fuzzy join two LazyFrames by matching addresses within postcode buckets.

    Sinks each side to a temporary parquet file so the upstream pipeline
    executes only once. The matching phase collects just three narrow
    columns (index, address, postcode) via projection pushdown, and the
    final join reads the remaining columns lazily.

    Matching is one-to-one: every candidate pair sharing a postcode and having
    compatible numeric tokens is scored, then pairs are assigned greedily from
    the highest score down. Rows whose address or postcode is null on either
    side never take part in matching.

    Returns a LazyFrame with all left and right columns. Unmatched rows
    have null right columns.

    Note: the returned LazyFrame scans the temporary parquet files, so the
    temporary directory is only removed here when an exception is raised; on
    success it must outlive the caller's ``collect``.
    """
    tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
    left_path = Path(tmpdir) / "left.parquet"
    right_path = Path(tmpdir) / "right.parquet"

    try:
        # Materialise each side exactly once, with a row index, to temp parquet.
        left.with_row_index("_left_idx").sink_parquet(left_path)
        right.with_row_index("_right_idx").sink_parquet(right_path)

        # Collect only the narrow columns needed for matching (projection pushdown).
        left_match = (
            pl.scan_parquet(left_path)
            .select(
                "_left_idx",
                _normalize(pl.col(left_address_col)).alias("_left_address"),
                pl.col(left_postcode_col)
                .str.strip_chars()
                .str.to_uppercase()
                .alias("_left_postcode"),
            )
            .collect(engine="streaming")
        )
        right_match = (
            pl.scan_parquet(right_path)
            .select(
                "_right_idx",
                _normalize(pl.col(right_address_col)).alias("_right_address"),
                pl.col(right_postcode_col)
                .str.strip_chars()
                .str.to_uppercase()
                .alias("_right_postcode"),
            )
            .unique(subset=["_right_address", "_right_postcode"], keep="first")
            .collect(engine="streaming")
        )

        # Group right side by postcode for fast lookup. Null addresses are
        # skipped (as on the left): the scorer runs a regex over every address
        # and would raise TypeError on None.
        right_by_postcode: dict[str, list[tuple[int, str]]] = {}
        for idx, postcode, address in zip(
            right_match["_right_idx"],
            right_match["_right_postcode"],
            right_match["_right_address"],
        ):
            if address is not None and postcode is not None:
                right_by_postcode.setdefault(postcode, []).append((idx, address))

        # Group left side by postcode
        left_by_postcode: dict[str, list[tuple[int, str]]] = {}
        for idx, postcode, address in zip(
            left_match["_left_idx"],
            left_match["_left_postcode"],
            left_match["_left_address"],
        ):
            if address is not None and postcode is not None:
                left_by_postcode.setdefault(postcode, []).append((idx, address))

        del left_match, right_match

        # Build tasks for each postcode bucket present on both sides
        tasks = [
            (left_entries, right_by_postcode[postcode])
            for postcode, left_entries in left_by_postcode.items()
            if postcode in right_by_postcode
        ]

        # Score all pairwise matches in parallel, then greedily assign from
        # highest score downward so best pairs lock in first.
        all_pairs: list[tuple[int, int, int]] = []  # (score, left_idx, right_idx)
        with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
            for pairs in tqdm(
                executor.map(_score_bucket, tasks, chunksize=64),
                total=len(tasks),
                desc="Fuzzy matching",
            ):
                all_pairs.extend(pairs)

        del tasks, left_by_postcode, right_by_postcode

        # Sort descending by score; among equal scores the lower left index
        # comes first.
        all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)

        matches: list[tuple[int, int]] = []
        matched_left: set[int] = set()
        matched_right: set[int] = set()
        for _score, left_idx, right_idx in all_pairs:
            if left_idx in matched_left or right_idx in matched_right:
                continue
            matches.append((left_idx, right_idx))
            matched_left.add(left_idx)
            matched_right.add(right_idx)

        del all_pairs, matched_left, matched_right

        # Build a small mapping LazyFrame and join back to the cached parquets.
        # An empty match list simply yields two empty UInt32 columns.
        mapping = pl.LazyFrame(
            {
                "_left_idx": pl.Series([m[0] for m in matches], dtype=pl.UInt32),
                "_right_idx": pl.Series([m[1] for m in matches], dtype=pl.UInt32),
            }
        )

        left_cached = pl.scan_parquet(left_path)
        right_cached = pl.scan_parquet(right_path)

        return (
            left_cached.join(mapping, on="_left_idx", how="left")
            .join(right_cached, on="_right_idx", how="left")
            .drop("_left_idx", "_right_idx")
        )
    except BaseException:
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
def _numbers_compatible(a: str, b: str) -> bool:
    """Tell whether the digit runs (flat/house numbers) of two addresses agree.

    The address with fewer distinct numbers must have all of them present in
    the other. If exactly one of the addresses contains numbers the result is
    False; if neither does, it is True.
    """
    digits_a = set(_NUMBER_RE.findall(a))
    digits_b = set(_NUMBER_RE.findall(b))
    if len(digits_a) <= len(digits_b):
        fewer, more = digits_a, digits_b
    else:
        fewer, more = digits_b, digits_a
    if more and not fewer:
        return False
    return fewer.issubset(more)
def _score_bucket(
    args: tuple[list[tuple[int, str]], list[tuple[int, str]]],
) -> list[tuple[int, int, int]]:
    """Score all address pairs within a single postcode bucket.

    Args:
        args: ``(left_entries, right_entries)`` packed into one tuple because
            the function is dispatched through ``executor.map``; each entry is
            ``(row_index, normalised_address)``.

    Returns:
        ``(score, left_row, right_row)`` for every pair whose numeric tokens
        are compatible, where ``score`` is ``fuzz.token_sort_ratio``.
    """
    left_entries, right_entries = args
    return [
        (fuzz.token_sort_ratio(left_address, right_address), left_row, right_row)
        for left_row, left_address in left_entries
        for right_row, right_address in right_entries
        if _numbers_compatible(left_address, right_address)
    ]

View file

@ -0,0 +1,43 @@
import math
import numpy as np
import polars as pl
_EARTH_RADIUS_KM = 6371.0
def haversine_km(
    lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float
) -> np.ndarray:
    """Great-circle distance in km from each (lat1, lon1) to the point (lat2, lon2).

    Inputs are in degrees; the result has one distance per element of the
    input arrays.
    """
    phi1, lam1 = np.radians(lat1), np.radians(lon1)
    phi2, lam2 = np.radians(lat2), np.radians(lon2)

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine of the central angle between the two points.
    hav = (
        np.sin(delta_phi / 2) ** 2
        + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam / 2) ** 2
    )
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return _EARTH_RADIUS_KM * central_angle
def haversine_km_expr(
    lat_col: str, lon_col: str, dest_lat: float, dest_lon: float
) -> pl.Expr:
    """Build a Polars expression for the haversine distance in km to a fixed point.

    ``lat_col`` and ``lon_col`` name columns holding degrees; ``dest_lat`` and
    ``dest_lon`` are the destination in degrees.
    """
    target_lat = math.radians(dest_lat)
    target_lon = math.radians(dest_lon)

    src_lat = pl.col(lat_col).radians()
    src_lon = pl.col(lon_col).radians()

    half_dlat = (pl.lit(target_lat) - src_lat) / 2
    half_dlon = (pl.lit(target_lon) - src_lon) / 2

    hav = (
        half_dlat.sin() ** 2
        + pl.lit(target_lat).cos() * src_lat.cos() * half_dlon.sin() ** 2
    )
    return 2 * _EARTH_RADIUS_KM * hav.sqrt().arcsin()

View file

@ -0,0 +1,174 @@
"""Count POIs within a radius of properties, optimized via postcode deduplication."""
import tempfile
import numpy as np
import polars as pl
from .haversine import haversine_km
def _count_pois_per_postcode(
    postcodes_df: pl.DataFrame,
    pois: pl.DataFrame,
    groups: dict[str, list[str]],
    radius_km: float = 2.0,
) -> pl.DataFrame:
    """
    For each unique postcode, count POIs within radius_km by category group.

    POIs are bucketed into a 0.05 degree grid; for every postcode only the
    cells that can contain a POI within ``radius_km`` are scanned, and exact
    distances are then computed with a vectorised haversine.

    Args:
        postcodes_df: One row per postcode with ``postcode``, ``lat``, ``lon``.
        pois: POIs with ``lat``, ``lng`` and ``category`` columns.
        groups: Maps a group name to the POI categories counted under it.
        radius_km: Search radius in kilometres.

    Returns:
        A DataFrame with ``postcode`` plus one ``{group}_{int(radius_km)}km``
        count column per group (so a fractional radius below 1 yields ``_0km``).
    """
    print(f"Counting POIs within {radius_km}km per postcode...")

    n_postcodes = len(postcodes_df)
    n_pois = len(pois)
    print(f" {n_postcodes:,} postcodes, {n_pois:,} POIs")

    # Build spatial grid for POIs (0.05 degree cells: ~5.5km north-south,
    # narrower east-west away from the equator)
    grid_size = 0.05

    print(" Building POI spatial grid...")

    # Convert to numpy arrays
    poi_lats = pois["lat"].to_numpy()
    poi_lngs = pois["lng"].to_numpy()
    poi_cats = pois["category"].to_numpy()

    # Compute grid coordinates for all POIs
    poi_grid_lats = np.floor(poi_lats / grid_size).astype(np.int32)
    poi_grid_lngs = np.floor(poi_lngs / grid_size).astype(np.int32)

    # Bucket POI row indices by grid cell, then freeze each bucket as an array
    cell_members: dict[tuple[int, int], list[int]] = {}
    for i in range(n_pois):
        cell = (int(poi_grid_lats[i]), int(poi_grid_lngs[i]))
        cell_members.setdefault(cell, []).append(i)
    poi_grid = {
        cell: np.array(members, dtype=np.int32)
        for cell, members in cell_members.items()
    }

    print(f" POI grid has {len(poi_grid):,} occupied cells")

    # Pre-compute category masks
    category_masks = {}
    for group, categories in groups.items():
        mask = np.isin(poi_cats, categories)
        category_masks[group] = mask
        print(f" {group}: {mask.sum():,} POIs")

    # Extract postcode coordinates as numpy arrays
    pc_lats = postcodes_df["lat"].to_numpy()
    pc_lons = postcodes_df["lon"].to_numpy()
    pc_codes = postcodes_df["postcode"].to_list()

    # How many cells to look at on each side. A fixed 3x3 window is only
    # sufficient while the radius fits inside one cell, so derive the window
    # from the radius (never fewer than one cell, which reproduces the 3x3
    # window for the default 2km radius at UK latitudes).
    # 111 km per degree slightly over-estimates the radius in degrees, which
    # errs on the side of scanning too much rather than too little.
    radius_deg = radius_km / 111.0
    lat_reach = max(1, int(np.ceil(radius_deg / grid_size)))
    # A degree of longitude shrinks with cos(latitude); evaluate at the
    # poleward edge of the search band and cap at 89 degrees so the window
    # stays finite.
    edge_lats = np.minimum(np.abs(pc_lats) + radius_deg, 89.0)
    lng_reaches = np.maximum(
        1.0, np.ceil(radius_deg / np.cos(np.radians(edge_lats)) / grid_size)
    )

    # Initialize result arrays
    result_counts = {
        group: np.zeros(n_postcodes, dtype=np.int32) for group in groups
    }

    # Process in batches with progress
    batch_size = 50000
    n_batches = (n_postcodes + batch_size - 1) // batch_size

    print(f" Processing {n_postcodes:,} postcodes in {n_batches} batches...")

    for batch_idx in range(n_batches):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, n_postcodes)

        if batch_idx % 5 == 0:
            print(
                f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
            )

        for i in range(start_idx, end_idx):
            pc_lat = pc_lats[i]
            pc_lon = pc_lons[i]

            grid_lat = int(np.floor(pc_lat / grid_size))
            grid_lng = int(np.floor(pc_lon / grid_size))
            lng_reach = int(lng_reaches[i])

            # Collect POI indices from every cell inside the search window
            nearby_indices = []
            for dlat in range(-lat_reach, lat_reach + 1):
                for dlng in range(-lng_reach, lng_reach + 1):
                    cell_key = (grid_lat + dlat, grid_lng + dlng)
                    if cell_key in poi_grid:
                        nearby_indices.append(poi_grid[cell_key])

            if not nearby_indices:
                continue

            nearby = np.concatenate(nearby_indices)

            # Vectorized distance calculation for all nearby POIs
            distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lat, pc_lon)

            # Filter by radius
            within_indices = nearby[distances <= radius_km]
            if len(within_indices) == 0:
                continue

            # Count by category group using pre-computed masks
            for group, cat_mask in category_masks.items():
                result_counts[group][i] = cat_mask[within_indices].sum()

    # Build result dataframe
    result_data = {"postcode": pc_codes}
    for group in groups:
        result_data[f"{group}_{int(radius_km)}km"] = result_counts[group]

    result = pl.DataFrame(result_data)

    print(" Completed POI counting")
    return result
def count_pois_within_radius(
    properties: pl.DataFrame, pois: pl.DataFrame, radius_km: float = 2.0
) -> dict[str, pl.Series]:
    """
    Count POIs within radius for properties, optimized by deduplicating postcodes.

    Counting runs once per unique postcode (using the coordinates of one row
    for that postcode) and the counts are then joined back onto every
    property by ``postcode``.

    Args:
        properties: Must contain ``postcode``, ``lat`` and ``lon`` columns.
        pois: Must contain ``lat``, ``lng`` and ``category`` columns.
        radius_km: Search radius in kilometres.

    Returns dict of {column_name: count_series} aligned to properties dataframe.
    Column names are ``{group}_{int(radius_km)}km`` for each key of the
    module-level ``POI_GROUPS``; properties with no counted POIs get 0.
    """
    # Get unique postcodes with coordinates
    print("Deduplicating postcodes...")
    unique_postcodes = properties.select(["postcode", "lat", "lon"]).unique(
        subset=["postcode"]
    )
    print(
        f" {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes"
    )

    # Count POIs per postcode. The groups mapping has to be passed explicitly:
    # the helper's third positional parameter is `groups`, not the radius.
    postcode_counts = _count_pois_per_postcode(
        unique_postcodes, pois, POI_GROUPS, radius_km
    )

    count_cols = [f"{group}_{int(radius_km)}km" for group in POI_GROUPS]

    print(" Writing postcode counts to temp file...")
    # Everything that reads the temp file stays inside the `with` block,
    # because the file is deleted as soon as the block exits.
    with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp:
        tmp_path = tmp.name
        postcode_counts.write_parquet(tmp_path)

        # Join using lazy evaluation
        print(" Joining counts back to properties (lazy)...")
        # NOTE(review): alignment with `properties` relies on the left join
        # preserving left-row order — confirm for the pinned polars version.
        result_df = (
            properties.lazy()
            .select("postcode")
            .join(pl.scan_parquet(tmp_path), on="postcode", how="left")
            .select(count_cols)
            .fill_null(0)
            .collect(engine="streaming")
        )

    return {col: result_df[col] for col in count_cols}

View file

@ -0,0 +1,46 @@
import polars as pl
from pipeline.utils import fuzzy_join_on_postcode
# Manual snapshot check: fuzzy-match Price Paid addresses against EPC addresses
# for a single postcode and print the paired addresses for eyeballing.
POSTCODE = "E14 2DG"

# Price paid: unique addresses for this postcode
pp = (
    pl.scan_parquet("data/price-paid-complete.parquet")
    .filter(pl.col("postcode") == POSTCODE)
    .select("paon", "saon", "street", "postcode")
    .unique()
    .sort("saon")
    .with_columns(
        pl.concat_str(
            [pl.col("saon"), pl.col("paon"), pl.col("street")],
            separator=" ",
            ignore_nulls=True,
        ).alias("pp_address"),
    )
)

# EPC: latest inspection per address for this postcode
epc = (
    pl.scan_csv("data/epc/certificates.csv")
    .select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
    .filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
    .sort("INSPECTION_DATE", descending=True)
    # unique() defaults to keep="any", which may keep an arbitrary row per
    # address; keep the first row of the date-descending order so the retained
    # certificate really is the latest one.
    .unique("ADDRESS", keep="first", maintain_order=True)
    .sort("ADDRESS")
)

result = fuzzy_join_on_postcode(
    left=pp,
    right=epc,
    left_address_col="pp_address",
    right_address_col="ADDRESS",
    left_postcode_col="postcode",
    right_postcode_col="POSTCODE",
).collect()

snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")

print("Testing the matching between EPC and PP addresses")
# Negative limits lift the row/column truncation so the whole table is printed.
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
    print(snapshot)

View file

@ -0,0 +1,147 @@
import numpy as np
import polars as pl
import pytest
from pipeline.utils.haversine import haversine_km, haversine_km_expr
class TestHaversineKm:
    """Checks for the NumPy implementation, ``haversine_km``."""

    LONDON = (51.5074, -0.1278)
    PARIS = (48.8566, 2.3522)
    NEW_YORK = (40.7128, -74.0060)
    EDINBURGH = (55.9533, -3.1883)

    @staticmethod
    def _km(origins, destination):
        """Distances in km from each ``(lat, lon)`` origin to ``destination``."""
        lats = np.array([lat for lat, _ in origins])
        lons = np.array([lon for _, lon in origins])
        return haversine_km(lats, lons, destination[0], destination[1])

    def test_same_point(self):
        """A point is zero kilometres away from itself."""
        km = self._km([self.LONDON], self.LONDON)
        assert np.allclose(km, 0.0, atol=1e-10)

    def test_known_distance_london_to_paris(self):
        """London to Paris is about 344 km."""
        km = self._km([self.LONDON], self.PARIS)
        assert np.allclose(km[0], 344, rtol=0.01)

    def test_known_distance_new_york_to_london(self):
        """New York to London is about 5570 km."""
        km = self._km([self.NEW_YORK], self.LONDON)
        assert np.allclose(km[0], 5570, rtol=0.01)

    def test_multiple_points(self):
        """Several origins are measured against one destination (Edinburgh)."""
        km = self._km([self.LONDON, self.PARIS, self.NEW_YORK], self.EDINBURGH)

        # Every origin is some distance away
        assert np.all(km > 0)

        # London is nearest (~530 km), then Paris, then New York
        assert km[0] < km[1] < km[2]
        assert np.allclose(km[0], 530, rtol=0.02)

    def test_equator_points(self):
        """One degree of longitude along the equator is about 111 km."""
        km = self._km([(0.0, 0.0)], (0.0, 1.0))
        assert np.allclose(km[0], 111.2, rtol=0.01)
class TestHaversineKmExpr:
    """Checks for the Polars expression implementation, ``haversine_km_expr``."""

    @staticmethod
    def _km(lats, lons, dest_lat, dest_lon):
        """Evaluate the expression over a small frame and return the ``dist`` column."""
        frame = pl.DataFrame({"lat": lats, "lon": lons})
        distances = frame.select(
            haversine_km_expr("lat", "lon", dest_lat, dest_lon).alias("dist")
        )
        return distances["dist"]

    def test_same_point(self):
        """A point is zero kilometres away from itself."""
        km = self._km([51.5074], [-0.1278], 51.5074, -0.1278)
        assert km[0] == pytest.approx(0.0, abs=1e-10)

    def test_known_distance_london_to_paris(self):
        """London to Paris is about 344 km."""
        km = self._km([51.5074], [-0.1278], 48.8566, 2.3522)
        assert km[0] == pytest.approx(344, rel=0.01)

    def test_known_distance_new_york_to_london(self):
        """New York to London is about 5570 km."""
        km = self._km([40.7128], [-74.0060], 51.5074, -0.1278)
        assert km[0] == pytest.approx(5570, rel=0.01)

    def test_multiple_points(self):
        """London, Paris and New York are each measured against Edinburgh."""
        km = self._km(
            [51.5074, 48.8566, 40.7128],
            [-0.1278, 2.3522, -74.0060],
            55.9533,
            -3.1883,
        ).to_numpy()

        # Every origin is some distance away
        assert np.all(km > 0)

        # London is nearest (~530 km), then Paris, then New York
        assert km[0] < km[1] < km[2]
        assert km[0] == pytest.approx(530, rel=0.02)

    def test_equator_points(self):
        """One degree of longitude along the equator is about 111 km."""
        km = self._km([0.0], [0.0], 0.0, 1.0)
        assert km[0] == pytest.approx(111.2, rel=0.01)
class TestHaversineConsistency:
    """Cross-checks the NumPy and Polars implementations against each other."""

    def test_numpy_and_polars_match(self):
        """Both implementations agree to within floating-point tolerance."""
        origin_lats = np.array([51.5074, 48.8566, 40.7128, 55.9533, 52.5200])
        origin_lons = np.array([-0.1278, 2.3522, -74.0060, -3.1883, 13.4050])
        rome_lat, rome_lon = 41.9028, 12.4964

        via_numpy = haversine_km(origin_lats, origin_lons, rome_lat, rome_lon)

        frame = pl.DataFrame({"lat": origin_lats, "lon": origin_lons})
        via_polars = frame.select(
            haversine_km_expr("lat", "lon", rome_lat, rome_lon).alias("dist")
        )["dist"].to_numpy()

        assert np.allclose(via_numpy, via_polars, rtol=1e-10)

View file

@ -0,0 +1,93 @@
import polars as pl
import pytest
from pipeline.utils.poi_counts import POI_GROUPS, count_pois_within_radius
@pytest.fixture
def pois():
    """Five POIs clustered in central London plus one restaurant far from every property."""
    rows = [
        (51.5074, -0.1278, "Restaurant"),
        (51.5075, -0.1280, "Fast Food"),
        (51.5080, -0.1275, "Supermarket"),
        (51.5076, -0.1279, "Park"),
        (51.5073, -0.1277, "Station"),
        (51.60, -0.20, "Restaurant"),  # too far from any property
    ]
    return pl.DataFrame(
        {
            "lat": [lat for lat, _, _ in rows],
            "lng": [lng for _, lng, _ in rows],
            "category": [category for _, _, category in rows],
        }
    )
@pytest.fixture
def properties():
    """Two properties sharing a central London postcode and one at a distant postcode."""
    rows = [
        ("EC1A 1BB", 51.5074, -0.1278),
        ("EC1A 1BB", 51.5074, -0.1278),
        ("ZZ99 9ZZ", 55.0, -3.0),
    ]
    return pl.DataFrame(
        {
            "postcode": [postcode for postcode, _, _ in rows],
            "lat": [lat for _, lat, _ in rows],
            "lon": [lon for _, _, lon in rows],
        }
    )
def test_counts_pois_within_radius(properties, pois):
    """Counts are grouped correctly, shared per postcode and zero far away."""
    result = count_pois_within_radius(properties, pois, radius_km=2.0)

    expected_columns = {f"{g}_2km" for g in POI_GROUPS}
    assert set(result.keys()) == expected_columns

    # One value per property row
    for col, series in result.items():
        assert len(series) == 3, f"{col} has {len(series)} rows, expected 3"

    # Row 0 sits in the central London cluster
    expected_first_row = {
        "restaurants_2km": 2,  # Restaurant + Fast Food
        "groceries_2km": 1,  # Supermarket
        "parks_2km": 1,  # Park
        "public_transport_2km": 1,  # Station
    }
    for col, expected in expected_first_row.items():
        assert result[col][0] == expected

    # Row 1 shares row 0's postcode, so it gets the same count
    assert result["restaurants_2km"][1] == result["restaurants_2km"][0]

    # Row 2 (ZZ99 9ZZ) is far from every POI
    for group in POI_GROUPS:
        assert result[f"{group}_2km"][2] == 0
def test_no_pois_returns_zeros(properties):
    """With an empty POI table every count column exists and is all zeros."""
    no_pois = pl.DataFrame(
        {
            "lat": pl.Series([], dtype=pl.Float64),
            "lng": pl.Series([], dtype=pl.Float64),
            "category": pl.Series([], dtype=pl.String),
        }
    )

    result = count_pois_within_radius(properties, no_pois, radius_km=2.0)

    for column in (f"{group}_2km" for group in POI_GROUPS):
        assert column in result
        assert result[column].to_list() == [0, 0, 0]
def test_custom_radius(pois):
    """A 10 m radius keeps the co-located POI and drops the more distant ones."""
    single_property = pl.DataFrame(
        {
            "postcode": ["EC1A 1BB"],
            "lat": [51.5074],
            "lon": [-0.1278],
        }
    )

    # 0.01 km = 10 m; int(0.01) == 0, hence the "_0km" column suffix
    result = count_pois_within_radius(single_property, pois, radius_km=0.01)

    # The Restaurant at (51.5074, -0.1278) is at distance 0
    assert result["restaurants_0km"][0] >= 1

    # Nothing beyond the co-located POIs may be counted
    total = sum(result[f"{g}_0km"][0] for g in POI_GROUPS)
    assert total <= 2

View file

@ -6,11 +6,9 @@ readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"attrs>=22.2.0",
"httpx>=0.28.1",
"httpx[socks]>=0.28.1",
"ipywidgets>=8.0.0",
"journey-client",
"jupyter>=1.0.0",
"nest-asyncio>=1.6.0",
"numpy>=1.26.0",
"pandas>=2.0.0",
"plotly>=6.5.2",
@ -18,17 +16,31 @@ dependencies = [
"pyarrow>=15.0.0",
"python-dateutil>=2.8.0",
"tqdm>=4.67.1",
"fastapi[standard]>=0.115.0",
"uvicorn>=0.34.0",
"h3>=3.7.0",
"overturemaps>=0.18.0",
"fastexcel>=0.19.0",
"scipy>=1.17.0",
"matplotlib>=3.10.8",
"osmium>=4.0.0",
"matplotlib>=3.10.8",
"thefuzz>=0.22.1",
"scipy>=1.17.0",
"shapely>=2.0.0",
"rasterio>=1.5.0",
"pyproj>=3.7.2",
]
[dependency-groups]
dev = ["ruff>=0.8.0"]
[tool.uv]
environments = ["sys_platform == 'linux' and python_version < '3.14'"]
[tool.uv.sources]
journey-client = { path = "./tfl_journey_client" }
[dependency-groups]
dev = [
"deptry>=0.22.0",
"pytest>=9.0.2",
"ruff>=0.8.0",
]
[tool.deptry.per_rule_ignores]
# pyarrow/fastexcel: runtime backends for polars parquet/Excel I/O
# jupyter/ipywidgets/pandas: needed to run analysis notebooks
DEP002 = ["pyarrow", "fastexcel", "jupyter", "ipywidgets", "pandas"]
# pytest is a dev dependency, not a missing one
DEP004 = ["pytest"]

2962
server-rs/Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

33
server-rs/Cargo.toml Normal file
View file

@ -0,0 +1,33 @@
[package]
name = "property-map-server"
version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1"
clap = { version = "4", features = ["derive"] }
axum = "0.8"
tower-http = { version = "0.6", features = ["cors", "fs", "compression-gzip", "compression-zstd", "trace"] }
tokio = { version = "1", features = ["full"] }
polars = { version = "0.46", features = ["parquet", "lazy", "dtype-struct", "dtype-u8", "dtype-u16", "dtype-i8", "dtype-i16"] }
h3o = "0.7"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
rayon = "1"
rustc-hash = "2"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
[lints.clippy]
min_ident_chars = "warn"
[profile.dev]
opt-level = 1
[profile.release]
opt-level = 3
lto = "thin"
[profile.production]
inherits = "release"
lto = true

1
server-rs/clippy.toml Normal file
View file

@ -0,0 +1 @@
allowed-idents-below-min-chars = ["i", "j", "k", "_"]

View file

@ -0,0 +1,8 @@
[toolchain]
channel = "stable"
targets = [
"x86_64-unknown-linux-gnu",
"x86_64-unknown-linux-musl",
"aarch64-unknown-linux-gnu",
]
profile = "default"

28
server-rs/src/consts.rs Normal file
View file

@ -0,0 +1,28 @@
// Crate-wide compile-time constants.
pub const HISTOGRAM_BINS: usize = 100;
// NOTE(review): the H3_PRECOMPUTE_* pair reads as an inclusive range of H3 resolutions — confirm at the use site.
pub const H3_PRECOMPUTE_MIN: u8 = 4;
pub const H3_PRECOMPUTE_MAX: u8 = 12;
// Socket address string: all IPv4 interfaces, port 8001.
pub const SERVER_ADDRESS: &str = "0.0.0.0:8001";
pub const BOUNDS_QUANTIZATION: f64 = 0.01;
pub const BOUNDS_BUFFER_PERCENT: f64 = 0.1;
pub const POSTCODE_MIN_RESOLUTION: u8 = 11;
pub const MAX_POIS_PER_REQUEST: usize = 2500;
pub const DEFAULT_PROPERTIES_LIMIT: usize = 100;
pub const MAX_PROPERTIES_LIMIT: usize = 500;
// 255 is u8::MAX. NOTE(review): presumably the sentinel code for a missing enum value — confirm against the encoder.
pub const ENUM_NULL: u8 = 255;
/// Canonical display order for POI category groups.
/// The server will panic at startup if the data contains groups not in this list or vice versa.
pub const POI_GROUP_ORDER: &[&str] = &[
"Public Transport",
"Amenity",
"Building",
"Craft",
"Healthcare",
"Leisure",
"Office",
"Shop",
"Tourism",
];

676
server-rs/src/features.rs Normal file
View file

@ -0,0 +1,676 @@
//! Static feature configuration. Every numeric and enum column in wide.parquet
//! must be declared here. Unknown columns cause a startup panic.
/// How the slider range of a numeric feature is determined: either given
/// directly or derived from percentiles of the data.
pub enum Bounds {
/// Fixed min/max values for the slider
Fixed { min: f64, max: f64 },
/// Compute percentile from data at startup
Percentile { low: f64, high: f64 },
}
/// Static configuration for one numeric feature column: its slider bounds and
/// step, plus the descriptive text and data-source slug shown alongside it.
pub struct FeatureConfig {
/// Must match parquet column name exactly (also used as display label)
pub name: &'static str,
pub bounds: Bounds,
/// Slider step size. Controls the granularity of the range slider in the UI.
pub step: f64,
/// Short one-line description shown in the filter sidebar
pub description: &'static str,
/// Longer description explaining methodology, data source, and caveats
pub detail: &'static str,
/// Data source slug for linking to /data-sources#<slug>
pub source: &'static str,
}
/// A named, statically-defined list of `FeatureConfig` entries.
pub struct FeatureGroup {
/// Name of the group
pub name: &'static str,
/// The feature configurations belonging to this group
pub features: &'static [FeatureConfig],
}
/// Static configuration for one enum feature column: an optional explicit
/// value ordering, plus the descriptive text and data-source slug.
pub struct EnumFeatureConfig {
// NOTE(review): presumably must match the parquet column name, as FeatureConfig::name does — confirm.
pub name: &'static str,
/// If set, values are presented in this order instead of alphabetical.
/// Values not listed are appended alphabetically after the ordered ones.
pub order: Option<&'static [&'static str]>,
/// Short one-line description shown in the filter sidebar
pub description: &'static str,
/// Longer description explaining methodology, data source, and caveats
pub detail: &'static str,
/// Data source slug for linking to /data-sources#<slug>
pub source: &'static str,
}
/// A named, statically-defined list of `EnumFeatureConfig` entries.
pub struct EnumFeatureGroup {
/// Name of the group
pub name: &'static str,
/// The enum feature configurations belonging to this group
pub features: &'static [EnumFeatureConfig],
}
/// Columns in parquet that are neither numeric features nor enum features.
/// These are silently skipped during schema validation.
// NOTE(review): entries are presumably compared against parquet column names by exact
// string match, like feature names — confirm in the loader before renaming a column.
pub const IGNORED_COLUMNS: &[&str] = &[
"lat",
"lon",
"Address per Property Register",
"Address per EPC",
"Postcode",
"historical_prices",
"Is construction date approximate",
];
/// All numeric features, organised into display groups.
///
/// The order of groups, and of features within each group, is the order produced by
/// `all_numeric_feature_names`; `bounds_for` resolves a feature's `Bounds` by name from
/// this table. Every `name` must match its parquet column exactly (see `FeatureConfig::name`).
pub static FEATURE_GROUPS: &[FeatureGroup] = &[
FeatureGroup {
name: "Property",
features: &[
FeatureConfig {
name: "Last known price",
bounds: Bounds::Fixed {
min: 0.0,
max: 2_000_000.0,
},
step: 10000.0,
description: "Most recent sale price from the Land Registry",
detail: "The last recorded sale price for this property from HM Land Registry Price Paid data. Covers residential sales in England and Wales. May be years old if the property hasn't sold recently.",
source: "price-paid",
},
FeatureConfig {
name: "Price per sqm",
bounds: Bounds::Percentile {
low: 0.0,
high: 98.0,
},
step: 100.0,
description: "Sale price divided by total floor area",
detail: "Calculated by dividing the last known sale price by the total floor area from the EPC certificate. Useful for comparing value across different-sized properties. Only available where both price and floor area data exist.",
source: "price-paid",
},
FeatureConfig {
name: "Total floor area (sqm)",
bounds: Bounds::Percentile {
low: 0.0,
high: 98.0,
},
step: 1.0,
description: "Internal floor area from the EPC survey",
detail: "Total useful floor area in square metres as measured during the Energy Performance Certificate assessment. Includes all habitable rooms but excludes garages, outbuildings, and external areas.",
source: "epc",
},
FeatureConfig {
name: "Number of bedrooms & living rooms",
bounds: Bounds::Fixed {
min: 1.0,
max: 10.0,
},
step: 1.0,
description: "Count of habitable rooms from the EPC survey",
detail: "Total number of habitable rooms (bedrooms plus living rooms) as recorded in the Energy Performance Certificate. Kitchens and bathrooms are typically excluded unless they are large enough to count as habitable rooms.",
source: "epc",
},
FeatureConfig {
name: "Approximate construction age",
bounds: Bounds::Fixed {
min: 0.0,
max: 2026.0,
},
step: 1.0,
description: "Estimated year of construction from the EPC",
detail: "The approximate year of construction as recorded in the Energy Performance Certificate. Derived from the construction age band (e.g. '1930-1949') by taking the midpoint. May be approximate, especially for older buildings.",
source: "epc",
},
],
},
FeatureGroup {
name: "Transport",
features: &[
FeatureConfig {
name: "public_transport_easy_minutes",
bounds: Bounds::Fixed {
min: 0.0,
max: 180.0,
},
step: 2.0,
description: "Quickest public transport journey to central London (easy route)",
detail: "Journey time in minutes by public transport to central London destinations, using TfL's Journey Planner API. The 'easy' route minimises changes and walking. Calculated for weekday morning commute times.",
source: "tfl-journey-times",
},
FeatureConfig {
name: "public_transport_quick_minutes",
bounds: Bounds::Fixed {
min: 0.0,
max: 180.0,
},
step: 2.0,
description: "Fastest public transport journey to central London",
detail: "Journey time in minutes by public transport to central London destinations, using TfL's Journey Planner API. The 'quick' route optimises for shortest total time regardless of changes. Calculated for weekday morning commute times.",
source: "tfl-journey-times",
},
FeatureConfig {
name: "cycling_minutes",
bounds: Bounds::Fixed {
min: 0.0,
max: 180.0,
},
step: 1.0,
description: "Cycling time to central London via TfL routing",
detail: "Cycling journey time in minutes to central London destinations, as calculated by the TfL Journey Planner API. Uses TfL's default cycling speed and route preferences.",
source: "tfl-journey-times",
},
FeatureConfig {
name: "Public transport within 2km",
bounds: Bounds::Percentile {
low: 5.0,
high: 95.0,
},
step: 1.0,
description: "Number of public transport stops within 2km",
detail: "Count of bus stops, rail stations, tube stations, tram stops, and other public transport access points within a 2km radius of the property's postcode. Derived from the NaPTAN (National Public Transport Access Nodes) dataset.",
source: "naptan",
},
],
},
FeatureGroup {
name: "Education",
features: &[
FeatureConfig {
name: "Education, Skills and Training Score",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 0.1,
description: "IoD education deprivation score for the local area",
detail: "From the English Indices of Deprivation. Measures deprivation in education, skills and training in the local area (LSOA). Higher scores indicate greater deprivation. Combines children/young people sub-domain (school attainment, entry to higher education) and adult skills sub-domain (adult qualifications, English language proficiency).",
source: "iod",
},
FeatureConfig {
name: "Good+ primary schools within 5km",
bounds: Bounds::Fixed {
min: 0.0,
max: 30.0,
},
step: 1.0,
description: "Primary schools rated Good or Outstanding by Ofsted nearby",
detail: "Number of state-funded primary schools within 5km that have a current Ofsted rating of Good or Outstanding. Based on the latest inspection outcomes dataset. Schools that have not yet been inspected are excluded.",
source: "ofsted",
},
FeatureConfig {
name: "Good+ secondary schools within 5km",
bounds: Bounds::Fixed {
min: 0.0,
max: 15.0,
},
step: 1.0,
description: "Secondary schools rated Good or Outstanding by Ofsted nearby",
detail: "Number of state-funded secondary schools within 5km that have a current Ofsted rating of Good or Outstanding. Based on the latest inspection outcomes dataset. Schools that have not yet been inspected are excluded.",
source: "ofsted",
},
],
},
FeatureGroup {
name: "Deprivation",
features: &[
FeatureConfig {
name: "Index of Multiple Deprivation (IMD) Score",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 0.1,
description: "Overall deprivation score combining all domains",
detail: "The Index of Multiple Deprivation is the official measure of relative deprivation in England. It combines seven weighted domains: Income (22.5%), Employment (22.5%), Education (13.5%), Health (13.5%), Crime (9.3%), Barriers to Housing & Services (9.3%), and Living Environment (9.3%). Higher scores indicate greater deprivation. Measured at LSOA level (~1,500 people).",
source: "iod",
},
FeatureConfig {
name: "Income Score (rate)",
bounds: Bounds::Fixed { min: 0.0, max: 0.6 },
step: 0.01,
description: "Proportion of the population experiencing income deprivation",
detail: "From the English Indices of Deprivation. The proportion of the local population experiencing deprivation relating to low income. Includes people on Income Support, income-based Jobseeker's Allowance, income-based Employment and Support Allowance, Pension Credit, Working Tax Credit and Child Tax Credit, Universal Credit, and asylum seekers.",
source: "iod",
},
FeatureConfig {
name: "Employment Score (rate)",
bounds: Bounds::Fixed { min: 0.0, max: 0.4 },
step: 0.01,
description: "Proportion of the working-age population involuntarily excluded from work",
detail: "From the English Indices of Deprivation. The proportion of the working-age population involuntarily excluded from the labour market. Includes claimants of Jobseeker's Allowance, Employment and Support Allowance, Incapacity Benefit, Severe Disablement Allowance, Carer's Allowance, and relevant Universal Credit claimants.",
source: "iod",
},
FeatureConfig {
name: "Health Deprivation and Disability Score",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 0.1,
description: "Risk of premature death and quality of life impairment",
detail: "From the English Indices of Deprivation. Measures the risk of premature death and impairment of quality of life through poor physical or mental health. Derived from years of potential life lost, comparative illness and disability ratio, acute morbidity, and mood and anxiety disorders.",
source: "iod",
},
FeatureConfig {
name: "Crime Score",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 0.1,
description: "IoD crime deprivation score measuring personal risk",
detail: "From the English Indices of Deprivation. Measures the risk of personal and material victimisation at local level. Derived from recorded rates of violence, burglary, theft, and criminal damage. Higher scores indicate higher crime-related deprivation.",
source: "iod",
},
FeatureConfig {
name: "Living Environment Score",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 0.1,
description: "Quality of the local indoor and outdoor environment",
detail: "From the English Indices of Deprivation. Measures deprivation in the quality of the local environment. Combines the Indoors sub-domain (housing quality, central heating, housing conditions) and Outdoors sub-domain (air quality, road traffic accidents). Higher scores indicate poorer living environments.",
source: "iod",
},
FeatureConfig {
name: "Indoors Sub-domain Score",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 0.1,
description: "Housing quality and conditions in the local area",
detail: "From the English Indices of Deprivation, Living Environment domain. Measures the quality of housing stock: houses without central heating, housing in poor condition, and houses failing Decent Homes standards. Higher scores indicate worse housing conditions.",
source: "iod",
},
FeatureConfig {
name: "Outdoors Sub-domain Score",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 0.1,
description: "Air quality and road safety in the local area",
detail: "From the English Indices of Deprivation, Living Environment domain. Measures the outdoor living environment quality through air quality indicators and road traffic accident casualties involving pedestrians and cyclists. Higher scores indicate poorer outdoor environments.",
source: "iod",
},
],
},
FeatureGroup {
name: "Crime",
features: &[
FeatureConfig {
name: "Anti-social behaviour (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly anti-social behaviour incidents in the area",
detail: "Average number of anti-social behaviour incidents per year in the LSOA, from police.uk street-level crime data (2023-2025). Includes nuisance, environmental, and personal anti-social behaviour.",
source: "crime",
},
FeatureConfig {
name: "Violence and sexual offences (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly violent and sexual offences in the area",
detail: "Average number of violence and sexual offences per year in the LSOA, from police.uk street-level crime data (2023-2025). Includes assault, harassment, and sexual offences.",
source: "crime",
},
FeatureConfig {
name: "Criminal damage and arson (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly criminal damage and arson in the area",
detail: "Average number of criminal damage and arson incidents per year in the LSOA, from police.uk street-level crime data (2023-2025).",
source: "crime",
},
FeatureConfig {
name: "Burglary (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly burglary offences in the area",
detail: "Average number of burglary offences per year in the LSOA, from police.uk street-level crime data (2023-2025). Includes residential and commercial burglary.",
source: "crime",
},
FeatureConfig {
name: "Vehicle crime (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly vehicle crime in the area",
detail: "Average number of vehicle crime incidents per year in the LSOA, from police.uk street-level crime data (2023-2025). Includes theft of and from vehicles.",
source: "crime",
},
FeatureConfig {
name: "Robbery (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly robbery offences in the area",
detail: "Average number of robbery offences per year in the LSOA, from police.uk street-level crime data (2023-2025). Robbery involves theft with force or threat of force.",
source: "crime",
},
FeatureConfig {
name: "Other theft (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly other theft offences in the area",
detail: "Average number of 'other theft' offences per year in the LSOA, from police.uk street-level crime data (2023-2025). Includes theft not classified under burglary, vehicle crime, shoplifting, or bicycle theft.",
source: "crime",
},
FeatureConfig {
name: "Shoplifting (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly shoplifting offences in the area",
detail: "Average number of shoplifting offences per year in the LSOA, from police.uk street-level crime data (2023-2025).",
source: "crime",
},
FeatureConfig {
name: "Drugs (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly drug offences in the area",
detail: "Average number of drug offences per year in the LSOA, from police.uk street-level crime data (2023-2025). Includes possession and trafficking offences.",
source: "crime",
},
FeatureConfig {
name: "Possession of weapons (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly weapons possession offences in the area",
detail: "Average number of possession of weapons offences per year in the LSOA, from police.uk street-level crime data (2023-2025).",
source: "crime",
},
FeatureConfig {
name: "Public order (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly public order offences in the area",
detail: "Average number of public order offences per year in the LSOA, from police.uk street-level crime data (2023-2025). Includes causing fear, alarm, or distress.",
source: "crime",
},
FeatureConfig {
name: "Bicycle theft (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly bicycle theft in the area",
detail: "Average number of bicycle theft offences per year in the LSOA, from police.uk street-level crime data (2023-2025).",
source: "crime",
},
FeatureConfig {
name: "Theft from the person (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly theft from the person in the area",
detail: "Average number of theft from the person offences per year in the LSOA, from police.uk street-level crime data (2023-2025). Includes pickpocketing and bag snatching without force.",
source: "crime",
},
FeatureConfig {
name: "Other crime (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Average yearly other crime in the area",
detail: "Average number of other crime offences per year in the LSOA, from police.uk street-level crime data (2023-2025). A catch-all category for offences not classified elsewhere.",
source: "crime",
},
FeatureConfig {
name: "Serious crime (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Aggregate of serious crime categories per year",
detail: "Sum of violence, robbery, burglary, and weapons possession per year in the LSOA, from police.uk street-level crime data (2023-2025). Provides a single serious crime metric.",
source: "crime",
},
FeatureConfig {
name: "Minor crime (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 1.0,
description: "Aggregate of minor crime categories per year",
detail: "Sum of anti-social behaviour, shoplifting, bicycle theft, and other lower-severity crime per year in the LSOA, from police.uk street-level crime data (2023-2025). Provides a single minor crime metric.",
source: "crime",
},
],
},
FeatureGroup {
name: "Demographics",
features: &[
FeatureConfig {
name: "% White",
bounds: Bounds::Fixed {
min: 0.0,
max: 100.0,
},
step: 1.0,
description: "Percentage of population identifying as White",
detail: "From the 2021 Census. Percentage of the local authority population identifying as White (English, Welsh, Scottish, Northern Irish, British, Irish, Gypsy or Irish Traveller, Roma, or any other White background).",
source: "ethnicity",
},
FeatureConfig {
name: "% Asian",
bounds: Bounds::Fixed {
min: 0.0,
max: 100.0,
},
step: 1.0,
description: "Percentage of population identifying as Asian",
detail: "From the 2021 Census. Percentage of the local authority population identifying as Asian or Asian British (Indian, Pakistani, Bangladeshi, Chinese, or any other Asian background).",
source: "ethnicity",
},
FeatureConfig {
name: "% Black",
bounds: Bounds::Fixed {
min: 0.0,
max: 100.0,
},
step: 1.0,
description: "Percentage of population identifying as Black",
detail: "From the 2021 Census. Percentage of the local authority population identifying as Black, Black British, Caribbean, or African.",
source: "ethnicity",
},
FeatureConfig {
name: "% Mixed",
bounds: Bounds::Fixed {
min: 0.0,
max: 100.0,
},
step: 1.0,
description: "Percentage of population identifying as Mixed or Multiple ethnic groups",
detail: "From the 2021 Census. Percentage of the local authority population identifying as Mixed or Multiple ethnic groups (White and Black Caribbean, White and Black African, White and Asian, or any other Mixed or Multiple background).",
source: "ethnicity",
},
FeatureConfig {
name: "% Other",
bounds: Bounds::Fixed {
min: 0.0,
max: 100.0,
},
step: 1.0,
description: "Percentage of population identifying as Other ethnic group",
detail: "From the 2021 Census. Percentage of the local authority population identifying as Other ethnic group (Arab or any other ethnic group not covered by the main categories).",
source: "ethnicity",
},
],
},
FeatureGroup {
name: "Amenities",
features: &[
FeatureConfig {
name: "Restaurants within 2km",
bounds: Bounds::Percentile {
low: 5.0,
high: 95.0,
},
step: 1.0,
description: "Number of restaurants and cafes within 2km",
detail: "Count of restaurants, cafes, and food establishments within a 2km radius of the property's postcode centroid. Derived from OpenStreetMap POI data using haversine distance calculation with a 0.05° spatial grid for candidate reduction.",
source: "osm-pois",
},
FeatureConfig {
name: "Groceries within 2km",
bounds: Bounds::Percentile {
low: 5.0,
high: 95.0,
},
step: 1.0,
description: "Number of grocery shops and supermarkets within 2km",
detail: "Count of supermarkets, convenience stores, and other grocery shops within a 2km radius of the property's postcode centroid. Derived from OpenStreetMap POI data.",
source: "osm-pois",
},
FeatureConfig {
name: "Parks within 2km",
bounds: Bounds::Percentile {
low: 5.0,
high: 95.0,
},
step: 1.0,
description: "Number of parks and green spaces within 2km",
detail: "Count of parks, gardens, nature reserves, and other green spaces within a 2km radius of the property's postcode centroid. Derived from OpenStreetMap POI data.",
source: "osm-pois",
},
],
},
FeatureGroup {
name: "Environment",
features: &[
FeatureConfig {
name: "Noise (dB)",
bounds: Bounds::Fixed {
min: 50.0,
max: 80.0,
},
step: 1.0,
description: "Road noise level at the postcode in decibels (Lden)",
detail: "Road noise level in decibels (Lden — day-evening-night 24-hour weighted average) from Defra's Strategic Noise Mapping Round 4 (2022). Modelled at 4m above ground on a 10m grid. Sampled at postcode centroids via WCS GeoTIFF tiles. Values above ~55 dB are generally considered noticeable; above ~70 dB can affect health.",
source: "noise",
},
FeatureConfig {
name: "Max available download speed (Mbps)",
bounds: Bounds::Percentile {
low: 5.0,
high: 95.0,
},
step: 10.0,
description: "Maximum broadband download speed available at the postcode",
detail: "Maximum available fixed broadband download speed in Megabits per second, from Ofcom's Connected Nations 2025 report. Measured at Output Area level and represents the maximum speed available from any provider, not actual achieved speeds.",
source: "broadband",
},
],
},
];
/// All enum (categorical) features, organised into display groups.
///
/// Declaration order is the order produced by `all_enum_feature_names`; `order_for`
/// resolves a feature's explicit value ordering by name from this table.
pub static ENUM_FEATURE_GROUPS: &[EnumFeatureGroup] = &[EnumFeatureGroup {
name: "Property",
features: &[
EnumFeatureConfig {
// NOTE(review): "Leashold" looks like a misspelling of "Leasehold". The name is
// presumably tied to the parquet column name, so it has to be corrected in the
// data pipeline and here together — do not fix it in this file alone.
name: "Leashold/Freehold",
order: Some(&["Freehold", "Leasehold"]),
description: "Whether the property is leasehold or freehold",
detail: "From HM Land Registry Price Paid data. Freehold means you own the building and the land it stands on. Leasehold means you own the building but not the land — you have a lease from the freeholder for a set number of years.",
source: "price-paid",
},
EnumFeatureConfig {
name: "Current energy rating",
order: Some(&["A", "B", "C", "D", "E", "F", "G"]),
description: "Current EPC energy efficiency rating (A-G)",
detail: "The current energy efficiency rating from the Energy Performance Certificate, graded A (most efficient) to G (least efficient). Based on the energy costs per square metre of floor area for heating, hot water, lighting, and ventilation.",
source: "epc",
},
EnumFeatureConfig {
name: "Potential energy rating",
order: Some(&["A", "B", "C", "D", "E", "F", "G"]),
description: "Achievable EPC rating after recommended improvements",
detail: "The potential energy efficiency rating that could be achieved if all cost-effective improvements recommended in the EPC were carried out. Graded A (most efficient) to G (least efficient).",
source: "epc",
},
EnumFeatureConfig {
name: "Property type",
order: Some(&["Detached", "Semi-Detached", "Terraced", "Flat"]),
description: "Type of property: detached, semi-detached, terraced, or flat",
detail: "From HM Land Registry Price Paid data. The broad property type classification: Detached, Semi-Detached, Terraced, or Flat/Maisonette.",
source: "price-paid",
},
EnumFeatureConfig {
name: "Property type/built form",
// No explicit order: values fall back to alphabetical presentation (see `EnumFeatureConfig::order`).
order: None,
description: "Detailed property type and built form from the EPC",
detail: "A more detailed classification from the Energy Performance Certificate combining property type and built form. Examples include 'Semi-Detached House', 'Mid-Terrace House', 'Ground-Floor Flat', 'Detached Bungalow', etc.",
source: "epc",
},
],
}];
/// Names of every numeric feature, flattened across groups.
///
/// Groups are visited in declaration order and, within a group, features keep their
/// declared order.
pub fn all_numeric_feature_names() -> Vec<&'static str> {
    let mut names = Vec::new();
    for group in FEATURE_GROUPS {
        for feature in group.features {
            names.push(feature.name);
        }
    }
    names
}
/// Names of every enum feature, flattened across groups.
///
/// Groups are visited in declaration order and, within a group, features keep their
/// declared order.
pub fn all_enum_feature_names() -> Vec<&'static str> {
    let mut names = Vec::new();
    for group in ENUM_FEATURE_GROUPS {
        for feature in group.features {
            names.push(feature.name);
        }
    }
    names
}
/// Returns the explicit value ordering configured for the enum feature called `name`.
///
/// Yields `None` both when no enum feature has that name and when the first feature
/// with that name declares no explicit order.
pub fn order_for(name: &str) -> Option<&'static [&'static str]> {
    for group in ENUM_FEATURE_GROUPS {
        for feature in group.features {
            if feature.name == name {
                // The first match decides the result, even if its order is `None`.
                return feature.order;
            }
        }
    }
    None
}
/// Returns the `Bounds` configured for the numeric feature called `name`,
/// or `None` when no numeric feature has that name.
pub fn bounds_for(name: &str) -> Option<&'static Bounds> {
    for group in FEATURE_GROUPS {
        for feature in group.features {
            if feature.name == name {
                return Some(&feature.bounds);
            }
        }
    }
    None
}

86
server-rs/src/filter.rs Normal file
View file

@ -0,0 +1,86 @@
use crate::consts::ENUM_NULL;
use crate::data::EnumFeatureData;
/// A numeric range filter produced by `parse_filters`.
pub struct ParsedFilter {
/// Position of the feature in the `feature_names` slice given to `parse_filters`.
pub feat_idx: usize,
/// Lower bound; `row_passes_filters` treats it as inclusive.
pub min: f64,
/// Upper bound; `row_passes_filters` treats it as inclusive.
pub max: f64,
}
/// An enum membership filter produced by `parse_filters`.
pub struct ParsedEnumFilter {
/// Position of the feature in the `enum_features` slice given to `parse_filters`.
pub enum_idx: usize,
/// Accepted values, each stored as its position in that feature's `values` list.
/// An empty list matches no row.
pub allowed: Vec<u8>,
}
/// Parse comma-separated filter string into numeric and enum filters.
///
/// Numeric format: `name:min:max`
/// Enum format: `name:val1|val2|val3` (pipe-separated values)
///
/// Entries are separated by commas, but a feature name can itself contain a comma
/// (e.g. "Education, Skills and Training Score"). When the text in front of an entry's
/// first `:` is not a known feature name, the colon-less fragments that immediately
/// precede the entry are glued back on (longest combination first) and the lookup is
/// retried, so that filters on such features are not silently dropped. An entry whose
/// own name is already known is never altered by this step.
///
/// Entries that stay unparsable are skipped: no `:`, an unknown feature name, or a numeric
/// entry whose bounds do not parse as `f64`. Enum values that are not in the feature's
/// value list are ignored; if none is recognised the resulting filter has an empty
/// `allowed` list and therefore matches no row.
///
/// Returns `(numeric_filters, enum_filters)`; both are empty when `filter_str` is `None`
/// or empty.
pub fn parse_filters(
    filter_str: Option<&str>,
    feature_names: &[String],
    enum_features: &[EnumFeatureData],
) -> (Vec<ParsedFilter>, Vec<ParsedEnumFilter>) {
    let mut numeric = Vec::new();
    let mut enums = Vec::new();
    let input = match filter_str.filter(|text| !text.is_empty()) {
        Some(text) => text,
        None => return (numeric, enums),
    };

    let is_known = |candidate: &str| {
        enum_features
            .iter()
            .any(|enum_feat| enum_feat.name == candidate)
            || feature_names.iter().any(|feat_name| feat_name == candidate)
    };

    // Colon-less fragments seen since the last complete entry. They are either junk or
    // the leading part of a feature name that contains commas.
    let mut pending: Vec<&str> = Vec::new();

    for entry in input.split(',') {
        let Some((raw_name, raw_rest)) = entry.split_once(':') else {
            pending.push(entry);
            continue;
        };

        // Only when the name is unknown on its own, try to rebuild a comma-containing
        // name from the preceding fragments, preferring the longest reconstruction.
        let rejoined: Option<String> = if is_known(raw_name.trim()) {
            None
        } else {
            (0..pending.len())
                .map(|start| format!("{},{}", pending[start..].join(","), raw_name))
                .find(|candidate| is_known(candidate.trim()))
        };
        pending.clear();

        let name = match rejoined.as_deref() {
            Some(joined) => joined.trim(),
            None => raw_name.trim(),
        };
        let rest = raw_rest.trim();

        if let Some(enum_idx) = enum_features
            .iter()
            .position(|enum_feat| enum_feat.name == name)
        {
            let enum_feat = &enum_features[enum_idx];
            let allowed: Vec<u8> = rest
                .split('|')
                .filter_map(|value| {
                    let value = value.trim();
                    enum_feat
                        .values
                        .iter()
                        .position(|existing| existing == value)
                        // Values are matched as u8 codes. An index that does not fit in a
                        // u8 cannot correspond to any stored code, and a plain `as u8`
                        // would wrap it onto an unrelated value.
                        .filter(|&position| position <= usize::from(u8::MAX))
                        .map(|position| position as u8)
                })
                .collect();
            enums.push(ParsedEnumFilter { enum_idx, allowed });
        } else {
            let Some((min_text, max_text)) = rest.split_once(':') else {
                continue;
            };
            let min = match min_text.trim().parse::<f64>() {
                Ok(value) => value,
                Err(_) => continue,
            };
            let max = match max_text.trim().parse::<f64>() {
                Ok(value) => value,
                Err(_) => continue,
            };
            if let Some(feat_idx) = feature_names
                .iter()
                .position(|feat_name| feat_name == name)
            {
                numeric.push(ParsedFilter { feat_idx, min, max });
            }
        }
    }
    (numeric, enums)
}
/// Returns `true` when row `row` satisfies every numeric and every enum filter.
///
/// `feature_data` is read as a row-major table with `num_features` values per row.
/// A numeric filter passes only for a finite value inside `[min, max]` (inclusive);
/// an enum filter passes only when the row's value is not `ENUM_NULL` and appears in
/// the filter's `allowed` list. Numeric filters are checked before enum filters.
pub fn row_passes_filters(
    row: usize,
    filters: &[ParsedFilter],
    enum_filters: &[ParsedEnumFilter],
    feature_data: &[f64],
    num_features: usize,
    enum_features: &[EnumFeatureData],
) -> bool {
    for range in filters {
        let cell = feature_data[row * num_features + range.feat_idx];
        let in_range = cell.is_finite() && cell >= range.min && cell <= range.max;
        if !in_range {
            return false;
        }
    }
    for selection in enum_filters {
        let code = enum_features[selection.enum_idx].data[row];
        let accepted = code != ENUM_NULL && selection.allowed.contains(&code);
        if !accepted {
            return false;
        }
    }
    true
}

147
server-rs/src/grid_index.rs Normal file
View file

@ -0,0 +1,147 @@
/// Grid-based spatial index for fast rectangle queries over rows with a lat/lon position.
///
/// `build` covers the bounding box of the supplied coordinates (padded by one cell on
/// every side) with square cells of the caller-supplied `cell_size`; the server uses
/// 0.01 degrees. Each cell stores the indices of the rows whose lat/lon falls within it.
pub struct GridIndex {
/// Southern edge of the grid: smallest input latitude minus one cell of padding.
min_lat: f64,
/// Western edge of the grid: smallest input longitude minus one cell of padding.
min_lon: f64,
/// Side length of a cell, in the same unit as the coordinates.
cell_size: f64,
/// Number of cell columns (longitude direction).
cols: usize,
/// Number of cell rows (latitude direction).
rows: usize,
/// cells[row * cols + col] = vec of row indices
cells: Vec<Vec<u32>>,
}
impl GridIndex {
    /// Builds the index from parallel `lat` / `lon` slices, where entry `i` of each slice
    /// is the position of row `i`.
    ///
    /// The grid spans the bounding box of the finite coordinates, padded by one
    /// `cell_size` on every side. Rows whose latitude or longitude is NaN or infinite are
    /// left out of the index: a float-to-integer cast maps NaN to 0, which would otherwise
    /// file such rows under cell row/column 0, and an infinite coordinate would blow up
    /// the grid dimensions.
    ///
    /// # Panics
    /// Panics if `lon` is shorter than `lat`.
    pub fn build(lat: &[f64], lon: &[f64], cell_size: f64) -> Self {
        let mut min_lat = f64::INFINITY;
        let mut max_lat = f64::NEG_INFINITY;
        let mut min_lon = f64::INFINITY;
        let mut max_lon = f64::NEG_INFINITY;
        for (index, &latitude) in lat.iter().enumerate() {
            let longitude = lon[index];
            if !(latitude.is_finite() && longitude.is_finite()) {
                continue;
            }
            min_lat = min_lat.min(latitude);
            max_lat = max_lat.max(latitude);
            min_lon = min_lon.min(longitude);
            max_lon = max_lon.max(longitude);
        }

        // One cell of padding on each side keeps every indexed point strictly inside the grid.
        min_lat -= cell_size;
        min_lon -= cell_size;
        max_lat += cell_size;
        max_lon += cell_size;

        // With no finite input the spans are -inf, the casts saturate to 0 and the grid
        // degenerates to a single empty cell that no query can reach.
        let rows = ((max_lat - min_lat) / cell_size).ceil() as usize + 1;
        let cols = ((max_lon - min_lon) / cell_size).ceil() as usize + 1;
        tracing::debug!(
            rows_grid = rows,
            cols_grid = cols,
            total_cells = rows * cols,
            cell_size,
            "Building grid index"
        );

        let mut cells: Vec<Vec<u32>> = vec![Vec::new(); rows * cols];
        for (index, &latitude) in lat.iter().enumerate() {
            let longitude = lon[index];
            if !(latitude.is_finite() && longitude.is_finite()) {
                continue;
            }
            let grid_row = ((latitude - min_lat) / cell_size) as usize;
            let grid_col = ((longitude - min_lon) / cell_size) as usize;
            cells[grid_row * cols + grid_col].push(index as u32);
        }
        tracing::debug!("Grid index built");

        GridIndex {
            min_lat,
            min_lon,
            cell_size,
            cols,
            rows,
            cells,
        }
    }

    /// Collects the indices of all rows stored in cells that intersect the rectangle
    /// `[south, north] x [west, east]`.
    ///
    /// Matching is per cell, so rows slightly outside the rectangle (but inside an
    /// intersecting cell) are included. Returns an empty vector when the rectangle does
    /// not overlap the grid.
    pub fn query(&self, south: f64, west: f64, north: f64, east: f64) -> Vec<u32> {
        let Some((row_min, row_max, col_min, col_max)) =
            self.clamp_bounds(south, west, north, east)
        else {
            return Vec::new();
        };
        let mut result = Vec::new();
        for row in row_min..=row_max {
            let row_start = row * self.cols;
            for col in col_min..=col_max {
                result.extend_from_slice(&self.cells[row_start + col]);
            }
        }
        result
    }

    /// Calls `callback` with the index of every row stored in a cell that intersects the
    /// rectangle `[south, north] x [west, east]`, without allocating a result vector.
    ///
    /// Like `query`, matching is per cell. The callback is never invoked when the
    /// rectangle does not overlap the grid.
    #[inline]
    pub fn for_each_in_bounds(
        &self,
        south: f64,
        west: f64,
        north: f64,
        east: f64,
        mut callback: impl FnMut(u32),
    ) {
        let Some((row_min, row_max, col_min, col_max)) =
            self.clamp_bounds(south, west, north, east)
        else {
            return;
        };
        for row in row_min..=row_max {
            let row_start = row * self.cols;
            for col in col_min..=col_max {
                for &row_idx in &self.cells[row_start + col] {
                    callback(row_idx);
                }
            }
        }
    }

    /// Converts a lat/lon rectangle into inclusive cell ranges
    /// `(row_min, row_max, col_min, col_max)` clamped to the grid.
    ///
    /// Returns `None` when the rectangle lies entirely outside the grid or is inverted
    /// (`north < south` or `east < west` by at least one cell).
    fn clamp_bounds(
        &self,
        south: f64,
        west: f64,
        north: f64,
        east: f64,
    ) -> Option<(usize, usize, usize, usize)> {
        // Signed intermediates: a rectangle edge south/west of the grid yields a negative
        // cell coordinate, which must be clamped rather than wrapped.
        let row_min_raw = ((south - self.min_lat) / self.cell_size) as isize;
        let row_max_raw = ((north - self.min_lat) / self.cell_size) as isize;
        let col_min_raw = ((west - self.min_lon) / self.cell_size) as isize;
        let col_max_raw = ((east - self.min_lon) / self.cell_size) as isize;

        let row_min = row_min_raw.max(0) as usize;
        let row_max_clamped = row_max_raw.min(self.rows as isize - 1);
        let col_min = col_min_raw.max(0) as usize;
        let col_max_clamped = col_max_raw.min(self.cols as isize - 1);

        // Upper edge below/left of the grid: nothing can match.
        if row_max_clamped < 0 || col_max_clamped < 0 {
            return None;
        }
        let row_max = row_max_clamped as usize;
        let col_max = col_max_clamped as usize;

        // Lower edge above/right of the grid, or an inverted rectangle.
        if row_min > row_max || col_min > col_max {
            return None;
        }
        Some((row_min, row_max, col_min, col_max))
    }
}

242
server-rs/src/main.rs Normal file
View file

@ -0,0 +1,242 @@
mod consts;
mod data;
mod features;
mod filter;
mod grid_index;
mod routes;
mod state;
#[cfg(test)]
mod tests;
use std::path::PathBuf;
use std::sync::Arc;
use anyhow::{bail, Context};
use axum::routing::get;
use axum::Router;
use clap::Parser;
use tower_http::compression::CompressionLayer;
use tower_http::cors::{Any, CorsLayer};
use tower_http::services::ServeDir;
use tower_http::trace::TraceLayer;
use tracing::info;
use tracing_subscriber::EnvFilter;
use state::AppState;
// Command-line arguments, parsed with clap's derive API.
// NOTE: the `///` comments on the fields below are not ordinary documentation — clap's
// derive turns them into the --help text of each flag, so treat them as user-facing strings.
#[derive(Parser)]
#[command(name = "narrowit", about = "Narrowit property map server")]
struct Cli {
/// Path to the wide property parquet file
#[arg(long)]
data: PathBuf,
/// Path to the POI parquet file
#[arg(long)]
pois: PathBuf,
/// Path to the frontend dist directory
// Optional, unlike `--data` and `--pois`.
#[arg(long)]
dist: Option<PathBuf>,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt()
.with_env_filter(
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
)
.with_ansi(true)
.init();
let cli = Cli::parse();
let parquet_path = &cli.data;
if !parquet_path.exists() {
bail!(
"Property parquet file not found: {}",
parquet_path.display()
);
}
info!("Loading property data from {}", parquet_path.display());
let property_data = data::PropertyData::load(parquet_path)?;
info!(
rows = property_data.lat.len(),
features = property_data.num_features,
enums = property_data.enum_features.len(),
"Property data loaded"
);
info!("Building spatial grid index (0.01° cells)");
let grid = grid_index::GridIndex::build(&property_data.lat, &property_data.lon, 0.01);
info!(
"Precomputing H3 cells for resolutions {}-{}",
consts::H3_PRECOMPUTE_MIN,
consts::H3_PRECOMPUTE_MAX
);
let h3_cells = data::precompute_h3(&property_data.lat, &property_data.lon)?;
let poi_path = cli.pois;
if !poi_path.exists() {
bail!("POI parquet file not found: {}", poi_path.display());
}
info!("Loading POI data from {}", poi_path.display());
let poi_data = data::POIData::load(&poi_path)?;
info!(pois = poi_data.lat.len(), "POI data loaded");
info!("Building POI spatial grid index");
let poi_grid = grid_index::GridIndex::build(&poi_data.lat, &poi_data.lng, 0.01);
let min_keys: Vec<String> = property_data
.feature_names
.iter()
.map(|name| format!("min_{}", name))
.collect();
let max_keys: Vec<String> = property_data
.feature_names
.iter()
.map(|name| format!("max_{}", name))
.collect();
let enum_min_keys: Vec<String> = property_data
.enum_features
.iter()
.map(|enum_feature| format!("min_{}", enum_feature.name))
.collect();
let enum_max_keys: Vec<String> = property_data
.enum_features
.iter()
.map(|enum_feature| format!("max_{}", enum_feature.name))
.collect();
// Precompute POI category groups
let poi_category_groups = {
let mut group_cats: std::collections::HashMap<String, std::collections::HashSet<String>> =
std::collections::HashMap::new();
for (category, group) in poi_data.category.iter().zip(poi_data.group.iter()) {
group_cats
.entry(group.clone())
.or_default()
.insert(category.clone());
}
// Validate that data groups match the hardcoded order exactly
let expected: std::collections::HashSet<&str> =
consts::POI_GROUP_ORDER.iter().copied().collect();
let actual: std::collections::HashSet<&str> =
group_cats.keys().map(|key| key.as_str()).collect();
let missing_from_data: Vec<&&str> = expected.difference(&actual).collect();
let missing_from_order: Vec<&&str> = actual.difference(&expected).collect();
if !missing_from_data.is_empty() || !missing_from_order.is_empty() {
bail!(
"POI group mismatch!\n In POI_GROUP_ORDER but not in data: {:?}\n In data but not in POI_GROUP_ORDER: {:?}",
missing_from_data, missing_from_order
);
}
consts::POI_GROUP_ORDER.iter().map(|group_name| group_name.to_string()).collect::<Vec<_>>()
.into_iter()
.map(|name| {
let mut categories: Vec<String> =
group_cats.remove(&name).context("POI group validated but missing from map")?.into_iter().collect();
categories.sort();
Ok(state::POICategoryGroup { name, categories })
})
.collect::<anyhow::Result<Vec<_>>>()?
};
// Precompute enum name → index map
let enum_name_to_idx: rustc_hash::FxHashMap<String, usize> = property_data
.enum_features
.iter()
.enumerate()
.map(|(index, enum_feature)| (enum_feature.name.clone(), index))
.collect();
let state = Arc::new(AppState {
data: property_data,
grid,
h3_cells,
poi_data,
poi_grid,
min_keys,
max_keys,
enum_min_keys,
enum_max_keys,
poi_category_groups,
enum_name_to_idx,
});
let cors = CorsLayer::new()
.allow_origin(Any)
.allow_methods(Any)
.allow_headers(Any);
let state_features = state.clone();
let state_hexagons = state.clone();
let state_pois = state.clone();
let state_poi_categories = state.clone();
let state_hexagon_properties = state.clone();
let state_hexagon_stats = state.clone();
let api = Router::new()
.route(
"/api/features",
get(move || routes::get_features(state_features.clone())),
)
.route(
"/api/hexagons",
get(move |query| routes::get_hexagons(state_hexagons.clone(), query)),
)
.route(
"/api/pois",
get(move |query| routes::get_pois(state_pois.clone(), query)),
)
.route(
"/api/poi-categories",
get(move || routes::get_poi_categories(state_poi_categories.clone())),
)
.route(
"/api/hexagon-properties",
get(move |query| {
routes::get_hexagon_properties(state_hexagon_properties.clone(), query)
}),
)
.route(
"/api/hexagon-stats",
get(move |query| routes::get_hexagon_stats(state_hexagon_stats.clone(), query)),
);
let frontend_dist = cli.dist.unwrap_or_else(|| {
// Check next to the binary first, then fall back to working directory
if let Ok(executable) = std::env::current_exe() {
let executable_dir = executable.parent().unwrap_or_else(|| std::path::Path::new("."));
let dist_next_to_binary = executable_dir.join("dist");
if dist_next_to_binary.exists() {
return dist_next_to_binary;
}
}
PathBuf::from("frontend/dist")
});
let app = if frontend_dist.exists() {
api.fallback_service(ServeDir::new(frontend_dist))
} else {
api
};
let app = app
.layer(cors)
.layer(CompressionLayer::new().zstd(true).gzip(true))
.layer(TraceLayer::new_for_http());
let addr = consts::SERVER_ADDRESS;
let listener = tokio::net::TcpListener::bind(addr)
.await
.with_context(|| format!("Failed to bind to {addr}"))?;
info!("Server listening on {}", addr);
axum::serve(listener, app)
.await
.context("Server error")?;
Ok(())
}

View file

@ -0,0 +1,136 @@
use std::sync::Arc;
use axum::response::Json;
use serde::Serialize;
use tracing::info;
use crate::data::Histogram;
use crate::features::{ENUM_FEATURE_GROUPS, FEATURE_GROUPS};
use crate::state::AppState;
/// Description of a single feature as returned by `GET /api/features`.
///
/// Serialized as an internally tagged object: the `type` field is either
/// `"numeric"` or `"enum"` and the variant's fields sit next to it.
#[derive(Serialize)]
#[serde(tag = "type")]
pub enum FeatureInfo {
/// A numeric feature.
#[serde(rename = "numeric")]
Numeric {
/// Feature name, taken from the static feature configuration.
name: String,
/// Lower bound (`slider_min` of the feature's stats in `get_features`).
min: f64,
/// Upper bound (`slider_max` of the feature's stats in `get_features`).
max: f64,
/// Step size from the static feature configuration.
step: f64,
/// Copy of the feature's histogram from the loaded data statistics.
histogram: Histogram,
/// Static description text from the feature configuration.
description: &'static str,
/// Static detail text from the feature configuration.
detail: &'static str,
/// Static source text from the feature configuration.
source: &'static str,
},
/// An enum (categorical) feature.
#[serde(rename = "enum")]
Enum {
/// Feature name, taken from the static enum feature configuration.
name: String,
/// Copy of the enum's value labels from the loaded data.
values: Vec<String>,
/// Static description text from the feature configuration.
description: &'static str,
/// Static detail text from the feature configuration.
detail: &'static str,
/// Static source text from the feature configuration.
source: &'static str,
},
}
/// One named group of features in the `GET /api/features` response.
#[derive(Serialize)]
pub struct FeatureGroupResponse {
/// Group name shared by the numeric and enum features listed below.
name: String,
/// Numeric features first, then enum features (the order `get_features` pushes them).
features: Vec<FeatureInfo>,
}
/// Top-level body of the `GET /api/features` response.
#[derive(Serialize)]
pub struct FeaturesResponse {
/// Non-empty feature groups, in configuration order.
groups: Vec<FeatureGroupResponse>,
}
/// Handler for `GET /api/features`.
///
/// Walks the static `FEATURE_GROUPS` / `ENUM_FEATURE_GROUPS` configuration and
/// reports every configured feature that is actually present in the loaded
/// data. Groups with the same name are merged (numeric features first, enum
/// features after); groups that end up with no features are left out.
pub async fn get_features(state: Arc<AppState>) -> Json<FeaturesResponse> {
    let data = &state.data;

    // Distinct group names in first-seen order: numeric groups, then enum groups.
    let mut ordered_names: Vec<&str> = Vec::new();
    for numeric_group in FEATURE_GROUPS {
        if ordered_names.iter().all(|&seen| seen != numeric_group.name) {
            ordered_names.push(numeric_group.name);
        }
    }
    for enum_group in ENUM_FEATURE_GROUPS {
        if ordered_names.iter().all(|&seen| seen != enum_group.name) {
            ordered_names.push(enum_group.name);
        }
    }

    let mut groups: Vec<FeatureGroupResponse> = Vec::new();
    // Counted while building; every counted feature ends up in an emitted group.
    let mut num_numeric: usize = 0;
    let mut num_enum: usize = 0;

    for &group_name in &ordered_names {
        let mut features: Vec<FeatureInfo> = Vec::new();

        // Numeric features configured under this group name.
        for numeric_group in FEATURE_GROUPS {
            if numeric_group.name != group_name {
                continue;
            }
            for config in numeric_group.features {
                let Some(column) = data
                    .feature_names
                    .iter()
                    .position(|known| known == config.name)
                else {
                    // Configured but not present in the loaded data.
                    continue;
                };
                let stats = &data.feature_stats[column];
                features.push(FeatureInfo::Numeric {
                    name: config.name.to_string(),
                    min: stats.slider_min,
                    max: stats.slider_max,
                    step: config.step,
                    histogram: stats.histogram.clone(),
                    description: config.description,
                    detail: config.detail,
                    source: config.source,
                });
                num_numeric += 1;
            }
        }

        // Enum features configured under this group name.
        for enum_group in ENUM_FEATURE_GROUPS {
            if enum_group.name != group_name {
                continue;
            }
            for config in enum_group.features {
                let Some(loaded) = data
                    .enum_features
                    .iter()
                    .find(|candidate| candidate.name == config.name)
                else {
                    continue;
                };
                features.push(FeatureInfo::Enum {
                    name: config.name.to_string(),
                    values: loaded.values.clone(),
                    description: config.description,
                    detail: config.detail,
                    source: config.source,
                });
                num_enum += 1;
            }
        }

        if features.is_empty() {
            continue;
        }
        groups.push(FeatureGroupResponse {
            name: group_name.to_string(),
            features,
        });
    }

    info!(
        numeric = num_numeric,
        enums = num_enum,
        groups = groups.len(),
        "GET /api/features"
    );
    Json(FeaturesResponse { groups })
}

View file

@ -0,0 +1,251 @@
use std::fmt::Write;
use std::str::FromStr;
use std::sync::Arc;
use axum::extract::Query;
use axum::http::StatusCode;
use axum::response::IntoResponse;
use serde::Deserialize;
use tracing::{info, warn};
use crate::consts::{ENUM_NULL, HISTOGRAM_BINS};
use crate::filter::{parse_filters, row_passes_filters};
use crate::state::AppState;
use super::parse::h3_cell_bounds;
/// Query parameters for `GET /api/hexagon-stats`.
#[derive(Deserialize)]
pub struct HexagonStatsParams {
/// H3 cell index as a string; parsed with `h3o::CellIndex::from_str`.
pub h3: String,
/// H3 resolution; used as an index into the precomputed `h3_cells` table.
pub resolution: u8,
/// Optional filter expression handed to `parse_filters`.
pub filters: Option<String>,
}
/// Handler for `GET /api/hexagon-stats`.
///
/// Collects every row whose precomputed H3 cell (at `params.resolution`) equals
/// the requested cell and which passes the optional filters, then returns
/// per-feature statistics as a hand-built JSON string:
/// `{"count":N,"numeric_features":[...],"enum_features":[...]}`.
///
/// Returns 400 for an unparsable H3 index or a resolution that has no
/// precomputed cells, and 500 if the blocking task fails to join.
pub async fn get_hexagon_stats(
state: Arc<AppState>,
Query(params): Query<HexagonStatsParams>,
) -> Result<impl IntoResponse, (StatusCode, String)> {
// Reject malformed H3 strings before doing any work.
let cell = h3o::CellIndex::from_str(&params.h3).map_err(|error| {
warn!(h3 = %params.h3, error = %error, "Invalid H3 cell index");
(StatusCode::BAD_REQUEST, format!("Invalid H3 cell: {}", error))
})?;
let cell_u64: u64 = cell.into();
let resolution = params.resolution as usize;
// The resolution must index a non-empty precomputed table.
if resolution >= state.h3_cells.len() || state.h3_cells[resolution].is_empty() {
warn!(
resolution,
"Invalid or non-precomputed resolution for hexagon-stats"
);
return Err((
StatusCode::BAD_REQUEST,
"Invalid or non-precomputed resolution".to_string(),
));
}
// Owned copies for logging inside the blocking task.
let h3_str = params.h3.clone();
let filters_str = params.filters.clone();
let (parsed_filters, parsed_enum_filters) = parse_filters(
params.filters.as_deref(),
&state.data.feature_names,
&state.data.enum_features,
);
let num_filters = parsed_filters.len() + parsed_enum_filters.len();
// The scan and JSON construction run on the blocking thread pool.
let result = tokio::task::spawn_blocking(move || {
let start_time = std::time::Instant::now();
let h3_data = &state.h3_cells[resolution];
let num_features = state.data.num_features;
let feature_data = &state.data.feature_data;
let enum_features = &state.data.enum_features;
// Bounding box of the cell plus a 0.001 buffer; used only to narrow the grid scan.
let (min_lat, min_lon, max_lat, max_lon) = h3_cell_bounds(cell, 0.001);
// Collect matching rows
let mut matching_rows: Vec<usize> = Vec::new();
state
.grid
.for_each_in_bounds(min_lat, min_lon, max_lat, max_lon, |row_idx| {
let row = row_idx as usize;
if h3_data[row] == cell_u64
&& row_passes_filters(
row,
&parsed_filters,
&parsed_enum_filters,
feature_data,
num_features,
enum_features,
)
{
matching_rows.push(row);
}
});
let total_count = matching_rows.len();
// Build JSON directly via string buffer
let mut output = String::with_capacity(4096);
output.push_str("{\"count\":");
write!(output, "{}", total_count).unwrap();
// Numeric features: compute count, min, max, sum, histogram using global bin edges
output.push_str(",\"numeric_features\":[");
let mut first_numeric = true;
for (feature_index, feature_name) in state.data.feature_names.iter().enumerate() {
let global_stats = &state.data.feature_stats[feature_index];
let histogram_min = global_stats.histogram.min;
let histogram_max = global_stats.histogram.max;
let bin_width = global_stats.histogram.bin_width;
let mut count = 0usize;
let mut min_value = f64::INFINITY;
let mut max_value = f64::NEG_INFINITY;
let mut sum = 0.0f64;
let mut bins = vec![0u64; HISTOGRAM_BINS];
for &row in &matching_rows {
// Row-major layout: all features of one row are contiguous.
let value = feature_data[row * num_features + feature_index];
// Non-finite values (NaN / infinities) are ignored entirely.
if value.is_finite() {
count += 1;
if value < min_value {
min_value = value;
}
if value > max_value {
max_value = value;
}
sum += value;
// Bin into histogram using global edges
if bin_width > 0.0 {
let bin_index =
((value - histogram_min) / bin_width).floor() as isize;
// Values outside the global range land in the first or last bin.
let clamped_index = bin_index.max(0).min((HISTOGRAM_BINS - 1) as isize) as usize;
bins[clamped_index] += 1;
}
}
}
// Features with no finite value in this cell are omitted from the output.
if count == 0 {
continue;
}
if !first_numeric {
output.push(',');
}
first_numeric = false;
let mean = sum / count as f64;
output.push_str("{\"name\":");
write_json_string(&mut output, feature_name);
write!(output, ",\"count\":{}", count).unwrap();
write!(output, ",\"min\":{}", format_f64(min_value)).unwrap();
write!(output, ",\"max\":{}", format_f64(max_value)).unwrap();
write!(output, ",\"mean\":{}", format_f64(mean)).unwrap();
output.push_str(",\"histogram\":{\"min\":");
write!(output, "{}", format_f64(histogram_min)).unwrap();
output.push_str(",\"max\":");
write!(output, "{}", format_f64(histogram_max)).unwrap();
output.push_str(",\"bin_width\":");
write!(output, "{}", format_f64(bin_width)).unwrap();
output.push_str(",\"counts\":[");
for (bin_index, &bin_count) in bins.iter().enumerate() {
if bin_index > 0 {
output.push(',');
}
write!(output, "{}", bin_count).unwrap();
}
// Closes the counts array, the histogram object and the feature object.
output.push_str("]}}")
}
// Enum features: count per value
output.push_str("],\"enum_features\":[");
let mut first_enum = true;
for enum_feature in enum_features {
// Resolve the feature through the name -> index map; skip it if unmapped.
let enum_index = match state.enum_name_to_idx.get(&enum_feature.name) {
Some(&index) => index,
None => continue,
};
let enum_data = &state.data.enum_features[enum_index];
let mut value_counts = vec![0u64; enum_data.values.len()];
for &row in &matching_rows {
let value = enum_data.data[row];
// Null markers and out-of-range codes are not counted.
if value != ENUM_NULL && (value as usize) < value_counts.len() {
value_counts[value as usize] += 1;
}
}
// Only include if there are any non-zero counts
let has_values = value_counts.iter().any(|&count| count > 0);
if !has_values {
continue;
}
if !first_enum {
output.push(',');
}
first_enum = false;
output.push_str("{\"name\":");
write_json_string(&mut output, &enum_feature.name);
output.push_str(",\"counts\":{");
let mut first_value = true;
for (value_index, &count) in value_counts.iter().enumerate() {
// Values that never occur in this cell are left out of the object.
if count == 0 {
continue;
}
if !first_value {
output.push(',');
}
first_value = false;
write_json_string(&mut output, &enum_data.values[value_index]);
write!(output, ":{}", count).unwrap();
}
output.push_str("}}");
}
output.push_str("]}");
let elapsed = start_time.elapsed();
info!(
h3 = %h3_str,
resolution,
total_count,
filters = num_filters,
filters_raw = filters_str.as_deref().unwrap_or("-"),
ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0),
"GET /api/hexagon-stats"
);
output
})
.await
// A join error from the blocking task is reported as a 500.
.map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error.to_string()))?;
Ok((
[(axum::http::header::CONTENT_TYPE, "application/json")],
result,
))
}
/// Append `value` to `output` as a complete JSON string literal, including the
/// surrounding double quotes.
///
/// Escapes `"` and `\`, uses the short escapes for newline, carriage return
/// and tab, and writes every other control character below U+0020 as a
/// `\u00XX` escape. JSON forbids raw control characters inside strings, so
/// leaving them unescaped would produce an unparsable response.
fn write_json_string(output: &mut String, value: &str) {
    output.push('"');
    for character in value.chars() {
        match character {
            '"' => output.push_str("\\\""),
            '\\' => output.push_str("\\\\"),
            '\n' => output.push_str("\\n"),
            '\r' => output.push_str("\\r"),
            '\t' => output.push_str("\\t"),
            // Remaining C0 control characters have no short escape form.
            control if control < '\x20' => {
                // Writing into a String cannot fail.
                let _ = write!(output, "\\u{:04x}", control as u32);
            }
            other => output.push(other),
        }
    }
    output.push('"');
}
/// Format a float as a JSON value.
///
/// * Non-finite values (NaN, ±infinity) become `null`, because `NaN` and `inf`
///   are not valid JSON tokens and would make the whole response unparsable.
/// * Integral values with magnitude below 1e15 are written with one decimal
///   place (`3.0`) so they still read as floats.
/// * Everything else uses Rust's default `Display` formatting.
fn format_f64(value: f64) -> String {
    if !value.is_finite() {
        return "null".to_string();
    }
    if value.fract() == 0.0 && value.abs() < 1e15 {
        format!("{:.1}", value)
    } else {
        format!("{}", value)
    }
}

View file

@ -0,0 +1,375 @@
use std::fmt::{self, Write};
use std::sync::Arc;
use axum::extract::Query;
use axum::http::StatusCode;
use axum::response::IntoResponse;
use rustc_hash::FxHashMap;
use serde::Deserialize;
use tracing::{info, warn};
use crate::consts::{
BOUNDS_BUFFER_PERCENT, BOUNDS_QUANTIZATION, ENUM_NULL, H3_PRECOMPUTE_MAX, H3_PRECOMPUTE_MIN,
POSTCODE_MIN_RESOLUTION,
};
use crate::filter::parse_filters;
use crate::state::AppState;
use super::parse::parse_bounds;
/// Byte count that displays itself in decimal units (`B`, `KB`, `MB`).
struct HumanBytes(usize);

impl fmt::Display for HumanBytes {
    /// Uses powers of 1000, one decimal place for KB/MB and no decimals for
    /// plain bytes.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.0 {
            size if size >= 1_000_000 => {
                write!(formatter, "{:.1} MB", size as f64 / 1_000_000.0)
            }
            size if size >= 1_000 => write!(formatter, "{:.1} KB", size as f64 / 1_000.0),
            size => write!(formatter, "{} B", size),
        }
    }
}
/// Query parameters for `GET /api/hexagons`.
#[derive(Deserialize)]
pub struct HexagonParams {
/// H3 resolution; `get_hexagons` rejects values outside
/// `[H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX]` with a 400.
resolution: u8,
/// Viewport as `south,west,north,east`. Optional for deserialization, but
/// `get_hexagons` returns 400 when it is missing.
bounds: Option<String>,
/// Comma-separated filters: `name:min:max,...`
/// Rows must have non-NaN values within [min,max] for each filter.
filters: Option<String>,
}
/// Per-cell accumulator for aggregating features
struct CellAgg {
/// Number of rows added through `add_row`.
count: u32,
/// Per-feature minimum of the finite values seen (starts at +infinity).
mins: Vec<f64>,
/// Per-feature maximum of the finite values seen (starts at -infinity).
maxs: Vec<f64>,
/// Minimum ordinal index per enum feature; holds `ENUM_NULL` until a
/// non-null value has been seen.
enum_mins: Vec<u8>,
/// Maximum ordinal index per enum feature (starts at 0).
enum_maxs: Vec<u8>,
/// First non-empty postcode seen in this cell (see `add_postcode`); only
/// tracked at high resolutions. Not the most frequent one.
postcode: Option<String>,
/// Number of rows whose postcode equals the stored `postcode`.
postcode_count: u32,
/// Sum of the latitudes passed to `add_postcode` (used for the centroid).
lat_sum: f64,
/// Sum of the longitudes passed to `add_postcode` (used for the centroid).
lon_sum: f64,
}
impl CellAgg {
    /// Create an empty accumulator sized for `num_features` numeric features
    /// and `num_enums` enum features.
    fn new(num_features: usize, num_enums: usize) -> Self {
        Self {
            count: 0,
            mins: vec![f64::INFINITY; num_features],
            maxs: vec![f64::NEG_INFINITY; num_features],
            enum_mins: vec![ENUM_NULL; num_enums],
            enum_maxs: vec![0; num_enums],
            postcode: None,
            postcode_count: 0,
            lat_sum: 0.0,
            lon_sum: 0.0,
        }
    }

    /// Fold one row of numeric features into the running min/max.
    ///
    /// `feature_data` is row-major: the values of `row` occupy
    /// `feature_data[row * num_features .. (row + 1) * num_features]`.
    /// Non-finite values are skipped, but the row is still counted.
    #[inline]
    fn add_row(&mut self, feature_data: &[f64], row: usize, num_features: usize) {
        self.count += 1;
        let start = row * num_features;
        let values = &feature_data[start..start + num_features];
        for (slot, value) in values.iter().copied().enumerate() {
            if !value.is_finite() {
                continue;
            }
            if value < self.mins[slot] {
                self.mins[slot] = value;
            }
            if value > self.maxs[slot] {
                self.maxs[slot] = value;
            }
        }
    }

    /// Fold one row's enum ordinals into the running per-enum min/max.
    /// `ENUM_NULL` entries are ignored.
    #[inline]
    fn add_enums(&mut self, enum_features: &[crate::data::EnumFeatureData], row: usize) {
        for (slot, feature) in enum_features.iter().enumerate() {
            let code = feature.data[row];
            if code == ENUM_NULL {
                continue;
            }
            let lowest = self.enum_mins[slot];
            if lowest == ENUM_NULL || code < lowest {
                self.enum_mins[slot] = code;
            }
            if code > self.enum_maxs[slot] {
                self.enum_maxs[slot] = code;
            }
        }
    }

    /// Accumulate the centroid sums and remember the cell's postcode.
    ///
    /// The first non-empty postcode wins; later rows only bump
    /// `postcode_count` when they carry that same postcode. Coordinates are
    /// summed for every call, including rows with an empty postcode.
    #[inline]
    fn add_postcode(&mut self, postcode: &str, lat: f64, lon: f64) {
        self.lat_sum += lat;
        self.lon_sum += lon;
        if postcode.is_empty() {
            return;
        }
        match self.postcode.as_deref() {
            None => {
                self.postcode = Some(postcode.to_string());
                self.postcode_count = 1;
            }
            Some(existing) if existing == postcode => self.postcode_count += 1,
            Some(_) => {}
        }
    }
}
/// Append `text` to `buf` with JSON string escaping applied.
///
/// No surrounding quotes are written. `"` and `\` are backslash-escaped,
/// newline / carriage return / tab use their short escapes, and any other
/// character below U+0020 is written as `\u00XX`.
pub(crate) fn write_json_escaped(buf: &mut String, text: &str) {
    for symbol in text.chars() {
        let short_escape = match symbol {
            '"' => "\\\"",
            '\\' => "\\\\",
            '\n' => "\\n",
            '\r' => "\\r",
            '\t' => "\\t",
            low if low < '\x20' => {
                let _ = write!(buf, "\\u{:04x}", low as u32);
                continue;
            }
            plain => {
                buf.push(plain);
                continue;
            }
        };
        buf.push_str(short_escape);
    }
}
/// Serialize the aggregated cells into `buf` as `{"features":[{...},...]}`,
/// writing text directly instead of building `serde_json::Value` trees.
///
/// Each cell object carries `h3` and `count`, a min/max key pair for every
/// numeric feature with finite bounds, a min/max key pair for every enum
/// feature that saw a non-null value, and (when `include_postcode` is set and
/// a postcode was recorded) `postcode`, `lat` and `lon`. Cell ids that are not
/// valid H3 indices are skipped.
#[allow(clippy::too_many_arguments)]
fn write_hexagons_json(
    buf: &mut String,
    groups: &FxHashMap<u64, CellAgg>,
    min_keys: &[String],
    max_keys: &[String],
    num_features: usize,
    enum_min_keys: &[String],
    enum_max_keys: &[String],
    num_enums: usize,
    include_postcode: bool,
) {
    buf.push_str("{\"features\":[");
    let mut needs_comma = false;
    for (&cell_id, aggregation) in groups {
        let cell = match h3o::CellIndex::try_from(cell_id) {
            Ok(valid) => valid,
            Err(_) => continue,
        };
        if needs_comma {
            buf.push(',');
        }
        needs_comma = true;
        let _ = write!(buf, "{{\"h3\":\"{}\",\"count\":{}", cell, aggregation.count);

        // Numeric features: only emitted once both bounds are finite.
        for feat_index in 0..num_features {
            let low = aggregation.mins[feat_index];
            if !low.is_finite() {
                continue;
            }
            let high = aggregation.maxs[feat_index];
            if !high.is_finite() {
                continue;
            }
            let _ = write!(
                buf,
                ",\"{}\":{},\"{}\":{}",
                min_keys[feat_index], low, max_keys[feat_index], high
            );
        }

        // Enum features: a minimum still at ENUM_NULL means no value was seen.
        for enum_index in 0..num_enums {
            let lowest = aggregation.enum_mins[enum_index];
            if lowest == ENUM_NULL {
                continue;
            }
            let _ = write!(
                buf,
                ",\"{}\":{},\"{}\":{}",
                enum_min_keys[enum_index],
                lowest,
                enum_max_keys[enum_index],
                aggregation.enum_maxs[enum_index]
            );
        }

        if include_postcode {
            if let Some(postcode) = aggregation.postcode.as_deref() {
                let rows = aggregation.count as f64;
                let centroid_lat = aggregation.lat_sum / rows;
                let centroid_lon = aggregation.lon_sum / rows;
                if centroid_lat.is_finite() && centroid_lon.is_finite() {
                    buf.push_str(",\"postcode\":\"");
                    write_json_escaped(buf, postcode);
                    let _ = write!(buf, "\",\"lat\":{},\"lon\":{}", centroid_lat, centroid_lon);
                }
            }
        }
        buf.push('}');
    }
    buf.push_str("]}");
}
/// Handler for `GET /api/hexagons`.
///
/// Aggregates all rows inside the (buffered, quantized) viewport into H3 cells
/// at the requested resolution and returns per-cell min/max values as a
/// hand-built JSON string.
///
/// Returns 400 when the resolution is outside
/// `[H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX]`, when `bounds` is missing, or when
/// `parse_bounds` rejects it; returns 500 when the blocking task fails.
pub async fn get_hexagons(
state: Arc<AppState>,
Query(params): Query<HexagonParams>,
) -> Result<impl IntoResponse, (StatusCode, String)> {
let resolution = params.resolution;
if resolution < H3_PRECOMPUTE_MIN || resolution > H3_PRECOMPUTE_MAX {
warn!(
resolution,
"Resolution out of range [{}, {}]", H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX
);
return Err((
StatusCode::BAD_REQUEST,
format!(
"resolution must be between {} and {}",
H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX
),
));
}
let bounds_str = params.bounds.ok_or((
StatusCode::BAD_REQUEST,
"bounds parameter is required".into(),
))?;
let (mut south, mut west, mut north, mut east) = parse_bounds(&bounds_str)?;
// Grow the viewport by BOUNDS_BUFFER_PERCENT of its extent on every side.
let lat_range = north - south;
let lng_range = east - west;
south -= lat_range * BOUNDS_BUFFER_PERCENT;
north += lat_range * BOUNDS_BUFFER_PERCENT;
west -= lng_range * BOUNDS_BUFFER_PERCENT;
east += lng_range * BOUNDS_BUFFER_PERCENT;
// Snap outwards to multiples of BOUNDS_QUANTIZATION (floor the low edges,
// ceil the high edges) so the box never shrinks.
south = (south / BOUNDS_QUANTIZATION).floor() * BOUNDS_QUANTIZATION;
west = (west / BOUNDS_QUANTIZATION).floor() * BOUNDS_QUANTIZATION;
north = (north / BOUNDS_QUANTIZATION).ceil() * BOUNDS_QUANTIZATION;
east = (east / BOUNDS_QUANTIZATION).ceil() * BOUNDS_QUANTIZATION;
let filters_str = params.filters.clone();
let (parsed_filters, parsed_enum_filters) = parse_filters(
params.filters.as_deref(),
&state.data.feature_names,
&state.data.enum_features,
);
let num_filters = parsed_filters.len() + parsed_enum_filters.len();
// Aggregation and serialization run on the blocking thread pool.
let json_body = tokio::task::spawn_blocking(move || -> Result<String, String> {
let t0 = std::time::Instant::now();
let num_features = state.data.num_features;
let num_enums = state.data.enum_features.len();
let feature_data = &state.data.feature_data;
let min_keys = &state.min_keys;
let max_keys = &state.max_keys;
let enum_min_keys = &state.enum_min_keys;
let enum_max_keys = &state.enum_max_keys;
// Precomputed cell ids for this resolution, if the table entry is non-empty.
let h3_cells_for_res: Option<&[u64]> = state
.h3_cells
.get(resolution as usize)
.filter(|cells| !cells.is_empty())
.map(|cells| cells.as_slice());
let mut groups: FxHashMap<u64, CellAgg> = FxHashMap::default();
let enum_features = &state.data.enum_features;
// Postcode and centroid are only tracked from POSTCODE_MIN_RESOLUTION upwards.
let include_postcode = resolution >= POSTCODE_MIN_RESOLUTION;
// Row-level filter check: numeric must be non-NaN and within [min, max],
// enum must have value index in the allowed set
let row_passes = |row: usize| -> bool {
parsed_filters.iter().all(|filter| {
let value = feature_data[row * num_features + filter.feat_idx];
value.is_finite() && value >= filter.min && value <= filter.max
}) && parsed_enum_filters.iter().all(|enum_filter| {
let value = enum_features[enum_filter.enum_idx].data[row];
value != ENUM_NULL && enum_filter.allowed.contains(&value)
})
};
if let Some(precomputed) = h3_cells_for_res {
// Fast path: look the cell id up in the precomputed table.
state
.grid
.for_each_in_bounds(south, west, north, east, |row_idx| {
let row = row_idx as usize;
if !row_passes(row) {
return;
}
let cell_id = precomputed[row];
let aggregation = groups
.entry(cell_id)
.or_insert_with(|| CellAgg::new(num_features, num_enums));
aggregation.add_row(feature_data, row, num_features);
aggregation.add_enums(enum_features, row);
if include_postcode {
aggregation.add_postcode(
&state.data.postcode[row],
state.data.lat[row],
state.data.lon[row],
);
}
});
} else {
// Fallback: compute the cell id per row from its coordinates.
let h3_res = h3o::Resolution::try_from(resolution)
.map_err(|error| format!("Invalid H3 resolution {}: {}", resolution, error))?;
state
.grid
.for_each_in_bounds(south, west, north, east, |row_idx| {
let row = row_idx as usize;
if !row_passes(row) {
return;
}
// Rows with invalid coordinates are grouped under id 0.
// NOTE(review): presumably 0 fails `CellIndex::try_from` and is therefore
// skipped by `write_hexagons_json` — confirm.
let cell_id = h3o::LatLng::new(state.data.lat[row], state.data.lon[row])
.map(|coord| u64::from(coord.to_cell(h3_res)))
.unwrap_or(0);
let aggregation = groups
.entry(cell_id)
.or_insert_with(|| CellAgg::new(num_features, num_enums));
aggregation.add_row(feature_data, row, num_features);
aggregation.add_enums(enum_features, row);
if include_postcode {
aggregation.add_postcode(
&state.data.postcode[row],
state.data.lat[row],
state.data.lon[row],
);
}
});
}
let t_agg = t0.elapsed();
// Initial capacity guess of 128 bytes per cell; the buffer grows if needed.
let mut json_buf = String::with_capacity(groups.len() * 128);
write_hexagons_json(
&mut json_buf,
&groups,
min_keys,
max_keys,
num_features,
enum_min_keys,
enum_max_keys,
num_enums,
include_postcode,
);
let t_total = t0.elapsed();
info!(
resolution,
cells = groups.len(),
filters = num_filters,
filters_raw = filters_str.as_deref().unwrap_or("-"),
agg_ms = format_args!("{:.1}", t_agg.as_secs_f64() * 1000.0),
total_ms = format_args!("{:.1}", t_total.as_secs_f64() * 1000.0),
size = format_args!("{}", HumanBytes(json_buf.len())),
"GET /api/hexagons"
);
Ok(json_buf)
})
.await
// First map_err: the blocking task failed to join. Second: the closure returned Err.
.map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error.to_string()))?
.map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error))?;
Ok(([("content-type", "application/json")], json_body))
}

View file

@ -0,0 +1,12 @@
mod features;
pub(crate) mod hexagons;
mod hexagon_stats;
pub(crate) mod parse;
mod pois;
pub(crate) mod properties;
pub use features::get_features;
pub use hexagon_stats::get_hexagon_stats;
pub use hexagons::get_hexagons;
pub use pois::{get_poi_categories, get_pois};
pub use properties::get_hexagon_properties;

View file

@ -0,0 +1,52 @@
use axum::http::StatusCode;
/// Axis-aligned lat/lon bounding box of an H3 cell, grown by `buffer` on every
/// side (`buffer` is in degrees).
///
/// Returns `(min_lat, min_lon, max_lat, max_lon)`.
pub fn h3_cell_bounds(cell: h3o::CellIndex, buffer: f64) -> (f64, f64, f64, f64) {
    // Each span is (lowest, highest); start inverted so the first vertex sets both.
    let mut lat_span = (f64::INFINITY, f64::NEG_INFINITY);
    let mut lon_span = (f64::INFINITY, f64::NEG_INFINITY);
    let widen = |span: &mut (f64, f64), coordinate: f64| {
        if coordinate < span.0 {
            span.0 = coordinate;
        }
        if coordinate > span.1 {
            span.1 = coordinate;
        }
    };

    let boundary = cell.boundary();
    for vertex in boundary.iter() {
        widen(&mut lat_span, vertex.lat());
        widen(&mut lon_span, vertex.lng());
    }

    let (min_lat, max_lat) = lat_span;
    let (min_lon, max_lon) = lon_span;
    (
        min_lat - buffer,
        min_lon - buffer,
        max_lat + buffer,
        max_lon + buffer,
    )
}
/// Parse a `south,west,north,east` viewport string into four finite floats.
///
/// Whitespace around each component is ignored. Returns a 400 error tuple when
/// a component is not a number, when there are not exactly four components, or
/// when any component is NaN or infinite. `f64::from_str` accepts spellings
/// such as `NaN` and `inf`, so without the finiteness check those values would
/// flow into the bounds arithmetic and the spatial-index query.
pub fn parse_bounds(bounds_str: &str) -> Result<(f64, f64, f64, f64), (StatusCode, String)> {
    let format_error = || {
        (
            StatusCode::BAD_REQUEST,
            "Invalid bounds format. Use: south,west,north,east".to_string(),
        )
    };

    let parts: Vec<f64> = bounds_str
        .split(',')
        .map(|part| part.trim().parse::<f64>())
        .collect::<Result<Vec<_>, _>>()
        .map_err(|_| format_error())?;

    match parts.as_slice() {
        &[south, west, north, east] => {
            if parts.iter().all(|value| value.is_finite()) {
                Ok((south, west, north, east))
            } else {
                Err((
                    StatusCode::BAD_REQUEST,
                    "Invalid bounds: values must be finite numbers".to_string(),
                ))
            }
        }
        _ => Err(format_error()),
    }
}

View file

@ -0,0 +1,128 @@
use std::sync::Arc;
use axum::extract::Query;
use axum::http::StatusCode;
use axum::response::Json;
use serde::{Deserialize, Serialize};
use tracing::info;
use crate::consts::MAX_POIS_PER_REQUEST;
use crate::data::POI;
use crate::state::{AppState, POICategoryGroup};
use super::parse::parse_bounds;
/// Query parameters for `GET /api/pois`.
#[derive(Deserialize)]
pub struct POIParams {
/// Viewport as `south,west,north,east`. Optional for deserialization, but
/// `get_pois` returns 400 when it is missing.
bounds: Option<String>,
/// Comma-separated list of categories to filter by
/// (missing or empty means no category filtering).
categories: Option<String>,
}
/// Body of the `GET /api/pois` response.
#[derive(Serialize)]
pub struct POIsResponse {
/// Selected POIs; `get_pois` caps the list at `MAX_POIS_PER_REQUEST` entries.
pois: Vec<POI>,
}
pub async fn get_pois(
state: Arc<AppState>,
Query(params): Query<POIParams>,
) -> Result<Json<POIsResponse>, (StatusCode, String)> {
let bounds_str = params.bounds.ok_or((
StatusCode::BAD_REQUEST,
"bounds parameter is required".into(),
))?;
let (south, west, north, east) = parse_bounds(&bounds_str)?;
let categories_str = params.categories.clone();
let category_filter: Option<rustc_hash::FxHashSet<String>> = params
.categories
.as_deref()
.filter(|text| !text.is_empty())
.map(|text| text.split(',').map(|part| part.trim().to_string()).collect());
let num_categories = category_filter.as_ref().map(|cats| cats.len()).unwrap_or(0);
let result = tokio::task::spawn_blocking(move || {
let t0 = std::time::Instant::now();
let row_indices = state.poi_grid.query(south, west, north, east);
// Collect matching row indices first, then sample randomly so the
// subset covers the viewport uniformly instead of clustering in one area.
let mut matching_rows: Vec<usize> = row_indices
.iter()
.filter_map(|&row_idx| {
let row = row_idx as usize;
if let Some(ref categories) = category_filter {
if !categories.contains(&state.poi_data.category[row]) {
return None;
}
}
Some(row)
})
.collect();
if matching_rows.len() > MAX_POIS_PER_REQUEST {
// Use a power-of-2 sampling step so each POI's inclusion depends
// only on its own priority hash, not on what other POIs are in
// the viewport. This prevents visible reshuffling when panning.
let ratio = (matching_rows.len() / MAX_POIS_PER_REQUEST) as u32;
let step = ratio.next_power_of_two();
let mask = step - 1;
matching_rows.retain(|&row| state.poi_data.priority[row] & mask == 0);
// Statistical noise may leave us slightly over the limit
if matching_rows.len() > MAX_POIS_PER_REQUEST {
matching_rows.sort_unstable_by_key(|&row| state.poi_data.priority[row]);
matching_rows.truncate(MAX_POIS_PER_REQUEST);
}
}
let pois: Vec<POI> = matching_rows
.iter()
.map(|&row| POI {
id: state.poi_data.id[row].clone(),
name: state.poi_data.name[row].clone(),
category: state.poi_data.category[row].clone(),
group: state.poi_data.group[row].clone(),
lat: state.poi_data.lat[row],
lng: state.poi_data.lng[row],
emoji: state.poi_data.emoji[row].clone(),
})
.collect();
let elapsed = t0.elapsed();
info!(
results = pois.len(),
candidates = row_indices.len(),
categories = num_categories,
categories_raw = categories_str.as_deref().unwrap_or("-"),
ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0),
"GET /api/pois"
);
POIsResponse { pois }
})
.await
.map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error.to_string()))?;
Ok(Json(result))
}
/// Body of the `GET /api/poi-categories` response.
#[derive(Serialize)]
pub struct POICategoriesResponse {
/// Copy of the precomputed category groups held in `AppState`.
groups: Vec<POICategoryGroup>,
}
/// Handler for `GET /api/poi-categories`: returns a copy of the category
/// groups precomputed at startup and logs how many groups and categories were
/// served.
pub async fn get_poi_categories(state: Arc<AppState>) -> Json<POICategoriesResponse> {
    let response = POICategoriesResponse {
        groups: state.poi_category_groups.clone(),
    };
    let category_total = response
        .groups
        .iter()
        .fold(0usize, |running, group| running + group.categories.len());
    info!(
        count = category_total,
        groups = response.groups.len(),
        "GET /api/poi-categories"
    );
    Json(response)
}

View file

@ -0,0 +1,230 @@
use std::str::FromStr;
use std::sync::Arc;
use axum::extract::Query;
use axum::http::StatusCode;
use axum::response::Json;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use tracing::{info, warn};
use crate::consts::{DEFAULT_PROPERTIES_LIMIT, ENUM_NULL, MAX_PROPERTIES_LIMIT};
use crate::data::EnumFeatureData;
use crate::filter::{parse_filters, row_passes_filters};
use crate::state::AppState;
use super::parse::h3_cell_bounds;
/// Query parameters for `GET /api/hexagon-properties`.
#[derive(Deserialize)]
pub struct HexagonPropertiesParams {
/// H3 cell index as a string; parsed with `h3o::CellIndex::from_str`.
pub h3: String,
/// H3 resolution; used as an index into the precomputed `h3_cells` table.
pub resolution: u8,
/// Optional filter expression handed to `parse_filters`.
pub filters: Option<String>,
/// Page size; defaults to `DEFAULT_PROPERTIES_LIMIT` and is clamped to
/// `MAX_PROPERTIES_LIMIT`.
pub limit: Option<usize>,
/// Number of matching rows to skip; defaults to 0.
pub offset: Option<usize>,
}
/// One property row in the `GET /api/hexagon-properties` response.
#[derive(Serialize)]
pub struct Property {
// String fields (None when the underlying value is empty or null)
pub address: Option<String>,
pub postcode: Option<String>,
pub property_type: Option<String>,
pub built_form: Option<String>,
pub duration: Option<String>,
pub current_energy_rating: Option<String>,
pub potential_energy_rating: Option<String>,
// Coordinates and flags
pub lat: f64,
pub lon: f64,
pub is_construction_date_approximate: Option<bool>,
/// Finite numeric feature values keyed by feature name; flattened so each
/// entry is serialized as a top-level key of the property object.
#[serde(flatten)]
pub features: FxHashMap<String, f64>,
}
/// Body of the `GET /api/hexagon-properties` response (one page of results).
#[derive(Serialize)]
pub struct HexagonPropertiesResponse {
/// The properties on this page.
pub properties: Vec<Property>,
/// Number of matching rows before pagination.
pub total: usize,
/// Effective page size after defaulting and clamping.
pub limit: usize,
/// Number of matching rows skipped.
pub offset: usize,
/// True when more matching rows exist beyond this page.
pub truncated: bool,
}
/// Trim `text` and return it as an owned string, or `None` when nothing but
/// whitespace remains.
fn non_empty_string(text: &str) -> Option<String> {
    Some(text.trim())
        .filter(|trimmed| !trimmed.is_empty())
        .map(String::from)
}
/// Resolve the label of an enum feature for `row`, trying each candidate
/// feature name in `names` in order.
///
/// A candidate is skipped when it is not in `enum_idx`, when the row holds
/// `ENUM_NULL`, or when the stored code has no entry in the feature's value
/// table. Returns the first label found, or `None` if every candidate is
/// skipped.
fn lookup_enum_value(
    enum_features: &[EnumFeatureData],
    enum_idx: &FxHashMap<String, usize>,
    row: usize,
    names: &[&str],
) -> Option<String> {
    names.iter().find_map(|name| {
        let feature_index = *enum_idx.get(*name)?;
        let feature = &enum_features[feature_index];
        let code = feature.data[row];
        if code == ENUM_NULL {
            return None;
        }
        feature.values.get(code as usize).cloned()
    })
}
pub async fn get_hexagon_properties(
state: Arc<AppState>,
Query(params): Query<HexagonPropertiesParams>,
) -> Result<Json<HexagonPropertiesResponse>, (StatusCode, String)> {
let cell = h3o::CellIndex::from_str(&params.h3).map_err(|error| {
warn!(h3 = %params.h3, error = %error, "Invalid H3 cell index");
(StatusCode::BAD_REQUEST, format!("Invalid H3 cell: {}", error))
})?;
let cell_u64: u64 = cell.into();
let resolution = params.resolution as usize;
if resolution >= state.h3_cells.len() || state.h3_cells[resolution].is_empty() {
warn!(
resolution,
"Invalid or non-precomputed resolution for hexagon-properties"
);
return Err((
StatusCode::BAD_REQUEST,
"Invalid or non-precomputed resolution".to_string(),
));
}
let h3_str = params.h3.clone();
let filters_str = params.filters.clone();
let (parsed_filters, parsed_enum_filters) = parse_filters(
params.filters.as_deref(),
&state.data.feature_names,
&state.data.enum_features,
);
let num_filters = parsed_filters.len() + parsed_enum_filters.len();
let result = tokio::task::spawn_blocking(move || {
let t0 = std::time::Instant::now();
let h3_data = &state.h3_cells[resolution];
let num_features = state.data.num_features;
let feature_data = &state.data.feature_data;
let enum_features = &state.data.enum_features;
let (min_lat, min_lon, max_lat, max_lon) = h3_cell_bounds(cell, 0.001);
let mut matching_rows: Vec<usize> = Vec::new();
state
.grid
.for_each_in_bounds(min_lat, min_lon, max_lat, max_lon, |row_idx| {
let row = row_idx as usize;
if h3_data[row] == cell_u64
&& row_passes_filters(
row,
&parsed_filters,
&parsed_enum_filters,
feature_data,
num_features,
enum_features,
)
{
matching_rows.push(row);
}
});
let total = matching_rows.len();
let limit = params.limit.unwrap_or(DEFAULT_PROPERTIES_LIMIT).min(MAX_PROPERTIES_LIMIT);
let offset = params.offset.unwrap_or(0);
let truncated = total > offset + limit;
let properties: Vec<Property> = matching_rows
.iter()
.skip(offset)
.take(limit)
.map(|&row| {
let mut features = FxHashMap::default();
let base = row * num_features;
for (feat_idx, feat_name) in state.data.feature_names.iter().enumerate() {
let value = feature_data[base + feat_idx];
if value.is_finite() {
features.insert(feat_name.clone(), value);
}
}
Property {
address: non_empty_string(&state.data.address[row]),
postcode: non_empty_string(&state.data.postcode[row]),
is_construction_date_approximate: Some(state.data.is_approx_build_date[row]),
property_type: lookup_enum_value(
enum_features,
&state.enum_name_to_idx,
row,
&["Property type", "epc_property_type", "pp_property_type"],
),
built_form: lookup_enum_value(
enum_features,
&state.enum_name_to_idx,
row,
&["Property type/built form", "built_form"],
),
duration: lookup_enum_value(
enum_features,
&state.enum_name_to_idx,
row,
&["Leashold/Freehold", "duration"],
),
current_energy_rating: lookup_enum_value(
enum_features,
&state.enum_name_to_idx,
row,
&["Current energy rating", "current_energy_rating"],
),
potential_energy_rating: lookup_enum_value(
enum_features,
&state.enum_name_to_idx,
row,
&["Potential energy rating", "potential_energy_rating"],
),
lat: state.data.lat[row],
lon: state.data.lon[row],
features,
}
})
.collect();
let elapsed = t0.elapsed();
info!(
h3 = %h3_str,
resolution,
total,
returned = properties.len(),
offset,
filters = num_filters,
filters_raw = filters_str.as_deref().unwrap_or("-"),
ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0),
"GET /api/hexagon-properties"
);
HexagonPropertiesResponse {
properties,
total,
limit,
offset,
truncated,
}
})
.await
.map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error.to_string()))?;
Ok(Json(result))
}

33
server-rs/src/state.rs Normal file
View file

@ -0,0 +1,33 @@
use rustc_hash::FxHashMap;
use serde::Serialize;
use crate::data::{POIData, PropertyData};
use crate::grid_index::GridIndex;
/// A named group of POI categories.
///
/// Derives `Serialize` and `Clone`. `AppState::poi_category_groups` stores a
/// precomputed list of these.
/// NOTE(review): presumably serialized into an API response — confirm at the
/// call site, which is not visible here.
#[derive(Serialize, Clone)]
pub struct POICategoryGroup {
/// Name of the group.
pub name: String,
/// Categories belonging to this group, as strings.
pub categories: Vec<String>,
}
/// In-memory datasets plus precomputed lookup tables that request handlers
/// read through `state`.
///
/// NOTE(review): how this is shared between handlers (e.g. behind an `Arc`)
/// is not visible in this file — confirm where the state is constructed.
pub struct AppState {
/// Property dataset. The hexagon-properties handler reads numeric features
/// from `data.feature_data` at `row * data.num_features + feature_idx`, and
/// per-row columns such as `data.lat[row]`, `data.lon[row]`,
/// `data.address[row]` and `data.postcode[row]`.
pub data: PropertyData,
/// Spatial index used to look up property rows: `for_each_in_bounds` is
/// called with a lat/lon box and yields row indices that handlers use to
/// index `data` and `h3_cells`.
pub grid: GridIndex,
/// h3_cells[resolution][row_idx] = precomputed H3 cell ID.
/// Empty Vec for resolutions not precomputed.
pub h3_cells: Vec<Vec<u64>>,
/// POI dataset.
pub poi_data: POIData,
/// NOTE(review): presumably the spatial index over `poi_data`, mirroring
/// `grid` for properties — confirm where the state is constructed.
pub poi_grid: GridIndex,
/// Precomputed JSON key names: "min_{feature_name}" for each numeric feature
pub min_keys: Vec<String>,
/// Precomputed JSON key names: "max_{feature_name}" for each numeric feature
pub max_keys: Vec<String>,
/// Precomputed JSON key names: "min_{enum_name}" for each enum feature
pub enum_min_keys: Vec<String>,
/// Precomputed JSON key names: "max_{enum_name}" for each enum feature
pub enum_max_keys: Vec<String>,
/// Precomputed POI category groups (sorted)
pub poi_category_groups: Vec<POICategoryGroup>,
/// Precomputed map from enum feature name to index in data.enum_features
pub enum_name_to_idx: FxHashMap<String, usize>,
}

250
server-rs/src/tests.rs Normal file
View file

@ -0,0 +1,250 @@
#[cfg(test)]
mod grid_index_tests {
    use crate::grid_index::GridIndex;

    /// Builds the fixture shared by most tests: three points on the diagonal
    /// (50.0, 0.0), (50.5, 0.5), (51.0, 1.0), indexed with a 0.01 cell size.
    fn diagonal_grid() -> GridIndex {
        let lat = vec![50.0, 50.5, 51.0];
        let lon = vec![0.0, 0.5, 1.0];
        GridIndex::build(&lat, &lon, 0.01)
    }

    /// A query box entirely south-west of every indexed point finds nothing.
    #[test]
    fn query_bounds_fully_below_grid_returns_empty() {
        let grid = diagonal_grid();
        let hits = grid.query(10.0, -10.0, 20.0, -5.0);
        assert!(
            hits.is_empty(),
            "Should return empty for bounds fully below grid"
        );
    }

    /// A query box entirely north-east of every indexed point finds nothing.
    #[test]
    fn query_bounds_fully_above_grid_returns_empty() {
        let grid = diagonal_grid();
        let hits = grid.query(80.0, 50.0, 90.0, 60.0);
        assert!(
            hits.is_empty(),
            "Should return empty for bounds fully above grid"
        );
    }

    /// A box whose first latitude exceeds its second is treated as empty.
    #[test]
    fn query_inverted_bounds_returns_empty() {
        let grid = diagonal_grid();
        // south > north
        let hits = grid.query(52.0, 0.0, 49.0, 1.0);
        assert!(hits.is_empty(), "Should return empty for inverted bounds");
    }

    /// The callback-based traversal must not fire for an out-of-range box.
    #[test]
    fn for_each_bounds_fully_outside_yields_nothing() {
        let grid = diagonal_grid();
        let mut visited = 0;
        grid.for_each_in_bounds(10.0, -10.0, 20.0, -5.0, |_| visited += 1);
        assert_eq!(
            visited, 0,
            "for_each should yield nothing for out-of-bounds query"
        );
    }

    /// Regression guard: with large cells, an out-of-bounds query used to
    /// scan cell (0,0), which could hold data. It must now return nothing.
    #[test]
    fn query_with_large_cells_outside_returns_empty() {
        let single_lat = vec![50.0];
        let single_lon = vec![0.0];
        let coarse_grid = GridIndex::build(&single_lat, &single_lon, 1.0);
        let hits = coarse_grid.query(0.0, -50.0, 10.0, -40.0);
        assert!(
            hits.is_empty(),
            "Should return empty even with large cell size"
        );
    }

    /// A box enclosing every point returns all of them.
    #[test]
    fn query_within_bounds_returns_correct_results() {
        let grid = diagonal_grid();
        let hits = grid.query(49.9, -0.1, 51.1, 1.1);
        assert_eq!(hits.len(), 3, "Should return all 3 points within bounds");
    }

    /// A box around only the southernmost point returns just that point.
    #[test]
    fn query_partial_bounds_returns_subset() {
        let column_lat = vec![50.0, 51.0, 52.0];
        let column_lon = vec![0.0, 0.0, 0.0];
        let column_grid = GridIndex::build(&column_lat, &column_lon, 0.01);
        let hits = column_grid.query(49.9, -0.1, 50.1, 0.1);
        assert_eq!(hits.len(), 1, "Should return only the point at lat=50");
    }
}
#[cfg(test)]
mod filter_tests {
    use crate::data::EnumFeatureData;
    use crate::filter::{parse_filters, row_passes_filters};

    /// One-row enum feature named "rating" with values A and B; the single
    /// row stores index 0.
    fn rating_feature() -> Vec<EnumFeatureData> {
        vec![EnumFeatureData {
            name: String::from("rating"),
            values: vec![String::from("A"), String::from("B")],
            data: vec![0],
        }]
    }

    /// A NaN value must fail a numeric filter even when the range is
    /// unbounded on both sides.
    #[test]
    fn nan_rows_fail_numeric_filter_even_with_infinite_range() {
        let names = vec![String::from("price")];
        let values = vec![f64::NAN];
        let no_enums: Vec<EnumFeatureData> = Vec::new();

        let (numeric, enums) = parse_filters(Some("price:-inf:inf"), &names, &no_enums);
        assert_eq!(numeric.len(), 1, "Should parse -inf:inf as valid filter");

        assert!(
            !row_passes_filters(0, &numeric, &enums, &values, 1, &no_enums),
            "NaN should fail filter even with infinite range"
        );
    }

    /// An enum filter with an empty value list parses to an empty allowed set
    /// and therefore rejects every row.
    #[test]
    fn empty_enum_filter_value_rejects_all() {
        let rating = rating_feature();
        let no_numeric_names: Vec<String> = Vec::new();

        let (numeric, enums) = parse_filters(Some("rating:"), &no_numeric_names, &rating);
        assert_eq!(enums.len(), 1);
        assert!(enums[0].allowed.is_empty());

        assert!(
            !row_passes_filters(0, &numeric, &enums, &[], 0, &rating),
            "Empty allowed set should reject all rows"
        );
    }

    /// Values that the enum feature does not define are dropped, leaving an
    /// empty allowed set.
    #[test]
    fn enum_filter_with_nonexistent_values_produces_empty_allowed() {
        let rating = rating_feature();
        let no_numeric_names: Vec<String> = Vec::new();

        let (_, enums) = parse_filters(Some("rating:X|Y|Z"), &no_numeric_names, &rating);
        assert_eq!(enums.len(), 1);
        assert!(enums[0].allowed.is_empty());
    }

    /// A numeric filter whose minimum is not a number yields no filter at all.
    #[test]
    fn malformed_numeric_min_is_silently_skipped() {
        let names = vec![String::from("price")];
        let no_enums: Vec<EnumFeatureData> = Vec::new();

        let (numeric, enums) = parse_filters(Some("price:not_a_number:200"), &names, &no_enums);
        assert_eq!(numeric.len(), 0);
        assert_eq!(enums.len(), 0);
    }
}
#[cfg(test)]
mod json_tests {
    use crate::routes::hexagons::write_json_escaped;
    use std::fmt::Write;

    /// Wraps `raw` in a one-key JSON object, escaping it with
    /// `write_json_escaped`.
    fn postcode_document(raw: &str) -> String {
        let mut doc = String::from("{\"postcode\":\"");
        write_json_escaped(&mut doc, raw);
        doc.push_str("\"}");
        doc
    }

    /// Formats `value` unguarded into a one-key JSON object via `write!`.
    fn raw_number_document(value: f64) -> String {
        let mut doc = String::new();
        write!(doc, "{{\"min_price\":{}}}", value).unwrap();
        doc
    }

    /// A double quote inside the value is escaped and survives a round trip.
    #[test]
    fn json_escaped_postcode_with_quotes_is_valid() {
        let doc = postcode_document("SW1A \"test");
        let parsed = serde_json::from_str::<serde_json::Value>(&doc);
        assert!(
            parsed.is_ok(),
            "Escaped quote should produce valid JSON: {}",
            doc
        );
        let value = parsed.unwrap();
        assert_eq!(value["postcode"].as_str(), Some("SW1A \"test"));
    }

    /// A backslash inside the value is escaped and survives a round trip.
    #[test]
    fn json_escaped_postcode_with_backslash_is_valid() {
        let doc = postcode_document("SW1A\\2AA");
        let parsed = serde_json::from_str::<serde_json::Value>(&doc);
        assert!(
            parsed.is_ok(),
            "Escaped backslash should produce valid JSON: {}",
            doc
        );
        let value = parsed.unwrap();
        assert_eq!(value["postcode"].as_str(), Some("SW1A\\2AA"));
    }

    /// Documents the risk that the is_finite() guard in write_hexagons_json
    /// protects against: a raw NaN written with write! is not valid JSON.
    #[test]
    fn nan_is_not_valid_json() {
        let doc = raw_number_document(f64::NAN);
        let parsed = serde_json::from_str::<serde_json::Value>(&doc);
        assert!(parsed.is_err(), "Raw NaN should produce invalid JSON");
    }

    /// Same as above for positive infinity.
    #[test]
    fn infinity_is_not_valid_json() {
        let doc = raw_number_document(f64::INFINITY);
        let parsed = serde_json::from_str::<serde_json::Value>(&doc);
        assert!(parsed.is_err(), "Raw Infinity should produce invalid JSON");
    }
}
#[cfg(test)]
mod enum_encoding_tests {
    use std::collections::{HashMap, HashSet};

    /// Documents the underlying `as u8` wrapping behavior that the truncation
    /// guard in property.rs now prevents: with more than 256 distinct values,
    /// different values collapse onto the same u8 index.
    #[test]
    fn u8_cast_wraps_around_beyond_255() {
        const NUM_VALUES: usize = 300;

        // Casting each position to u8 keeps only the low 8 bits.
        let mut truncated: Vec<u8> = Vec::with_capacity(NUM_VALUES);
        for position in 0..NUM_VALUES {
            truncated.push(position as u8);
        }
        assert_eq!(truncated[0], truncated[256], "u8 wraps: 0 == 256");
        assert_eq!(truncated[1], truncated[257], "u8 wraps: 1 == 257");

        // Build a value -> index map with the same truncating cast.
        let labels: Vec<String> = (0..NUM_VALUES).map(|i| format!("val_{}", i)).collect();
        let mut label_to_idx: HashMap<&str, u8> = HashMap::new();
        for (position, label) in labels.iter().enumerate() {
            label_to_idx.insert(label.as_str(), position as u8);
        }

        let distinct: HashSet<u8> = label_to_idx.values().copied().collect();
        assert!(
            distinct.len() < NUM_VALUES,
            "Without the truncation guard, {} values produce only {} unique u8 indices",
            NUM_VALUES,
            distinct.len()
        );
    }
}

View file

View file

@ -1,30 +0,0 @@
"""Server configuration - imports shared values from pipeline config."""
# Re-export the shared pipeline settings under the names the server uses.
# H3_RESOLUTIONS and DEFAULT_H3_RESOLUTION are aliased to VALID_RESOLUTIONS
# and DEFAULT_RESOLUTION; every other name is imported unchanged.
from pipeline.config import (
AGGREGATES_DIR,
H3_RESOLUTIONS as VALID_RESOLUTIONS,
DEFAULT_H3_RESOLUTION as DEFAULT_RESOLUTION,
MIN_YEAR,
MAX_YEAR,
DEFAULT_MIN_YEAR,
DEFAULT_MAX_YEAR,
DEFAULT_MIN_PRICE,
DEFAULT_MAX_PRICE,
)
# Extra area to return beyond requested bounds (0.2 = 20%)
# Makes panning smoother by preloading nearby hexagons
BOUNDS_BUFFER_PERCENT = 0.2
# Public names of this module: the values re-exported from pipeline.config
# above plus the server-only BOUNDS_BUFFER_PERCENT.
__all__ = [
"AGGREGATES_DIR",
"VALID_RESOLUTIONS",
"DEFAULT_RESOLUTION",
"MIN_YEAR",
"MAX_YEAR",
"DEFAULT_MIN_YEAR",
"DEFAULT_MAX_YEAR",
"DEFAULT_MIN_PRICE",
"DEFAULT_MAX_PRICE",
"BOUNDS_BUFFER_PERCENT",
]

View file

@ -1,35 +0,0 @@
from contextlib import asynccontextmanager
from pathlib import Path
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from server.routes import hexagons, pois
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan hook passed to the FastAPI application.

    Code before the ``yield`` runs at startup and calls the two preload
    functions from the route modules. Nothing follows the ``yield``, so
    shutdown performs no cleanup.

    Args:
        app: The application instance; it is not used by this hook.
    """
    # Startup: preload all parquet files
    hexagons.preload_dataframes()
    pois.preload_pois()

    yield
    # Shutdown: nothing to clean up
def _create_app() -> FastAPI:
    """Build the FastAPI application with CORS and the ``/api`` routers attached."""
    application = FastAPI(title="Property Map API", lifespan=lifespan)
    application.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=False,  # Cannot use True with wildcard origins
        allow_methods=["*"],
        allow_headers=["*"],
    )
    # Register the route modules in a fixed order: hexagons first, then pois.
    for route_module in (hexagons, pois):
        application.include_router(route_module.router, prefix="/api")
    return application


app = _create_app()

# Mount static files for production (frontend build)
frontend_dist = Path(__file__).parent.parent.joinpath("frontend", "dist")
if frontend_dist.exists():
    app.mount("/", StaticFiles(directory=frontend_dist, html=True), name="static")

Some files were not shown because too many files have changed in this diff Show more