diff --git a/CLAUDE.md b/CLAUDE.md index cd0a5df..592ee92 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -83,12 +83,11 @@ The server and frontend must handle these human-readable names. See the full ren Rust + Axum. Loads parquet into memory at startup. -**Structure:** -- `data/property.rs` — Loads `wide.parquet`, auto-discovers numeric + enum features, computes histograms, sorts rows by spatial locality, precomputes H3 cells (resolutions 4–12) -- `data/poi.rs` — Loads `filtered_uk_pois.parquet` -- `index.rs` — `GridIndex`: 0.01° spatial grid for O(1) cell lookup -- `filter.rs` — Parses filter strings and checks rows. Format: `name:min:max` (numeric), `name:val1|val2` (enum) -- `routes/` — One file per endpoint +**Structure** (uses Rust 2018 module style — `foo.rs` + `foo/` directory, not `foo/mod.rs`): +- `data.rs` + `data/` — Property and POI data loading +- `parsing.rs` + `parsing/` — Filter parsing and bounds parsing +- `routes.rs` + `routes/` — One file per endpoint +- `utils.rs` + `utils/` — GridIndex, hashing, interned columns - `consts.rs` — Key constants (histogram bins, H3 range, max enum cardinality, excluded columns) **API endpoints:** @@ -100,10 +99,10 @@ Rust + Axum. Loads parquet into memory at startup. Serves `frontend/dist/` as static fallback in production. -**Data representation:** -- Numeric features: row-major flat `Vec<f32>`, NaN = null -- Enum features: `Vec<u8>` indices into value list, 255 = null -- String fields (address, postcode): `Vec<String>`, empty = null +**Data representation (unified model):** +- All features (numeric and enum): row-major flat `Vec<f32>`, NaN = null +- Enum features: stored as f32 indices (0.0, 1.0, 2.0...)
with `enum_values: FxHashMap<usize, Vec<String>>` mapping feature index → string values +- String fields (address, postcode): interned/packed for memory efficiency - The server accepts the parquet path as a CLI argument (defaults to `data_sources/processed/wide.parquet`) ### Frontend (`frontend/`) @@ -216,14 +215,49 @@ Every UI element must use the correct token from this table. Do not invent new p - [ ] Sidebars, dropdowns, and popups are readable in both modes - [ ] HomePage and DataSourcesPage adapt correctly +## Coding Preferences + +- **Unified data models over special-casing**: Prefer storing different data types uniformly (e.g., enums as f32 indices alongside numeric features) rather than maintaining separate code paths +- **Terse tests**: Test what matters in as few tests as possible — don't overcomplicate with excessive setup or edge cases that don't add value +- **Extract and organize**: Group related utilities into proper modules (e.g., `utils/`, `parsing/`) rather than leaving helpers scattered +- **Inline module tests**: Place `#[cfg(test)] mod tests { }` at the bottom of each module file rather than in separate test files + +## Rust Code Style (server-rs) + +Follow these conventions in all Rust code: + +1. **Module style**: Use Rust 2018 module naming — `foo.rs` + `foo/` directory, NOT `foo/mod.rs` +2. **Imports over inline paths**: Import items at the top of the file, don't use `crate::` inline in code + ```rust + // Good + use crate::utils::generate_priorities; + let p = generate_priorities(n); + + // Bad + let p = crate::utils::generate_priorities(n); + ``` +3. **Tracing macros**: Import and use short form, not fully qualified + ```rust + // Good + use tracing::{info, warn}; + info!("message"); + + // Bad + tracing::info!("message"); + ``` +4. **JSON serialization**: Use `serde_json` with `#[derive(Serialize)]` structs, not manual string building +5. **Precompute at startup**: For static/rarely-changing responses, compute once at startup and store in `AppState` +6.
**Unique placeholders**: When injecting content into HTML, use distinctive markers like `__NARROWIT_OG_TAGS__` that won't accidentally match other content + ## Key Implementation Details - **Spatial sort**: Rows sorted by 0.01° grid cell at load time for cache-friendly sequential access -- **Row-major layout**: `feature_data[row * num_features + feat_idx]` — all features for one property are contiguous +- **Row-major layout**: `feature_data[row * num_features + feat_idx]` — all features (numeric and enum) for one property are contiguous - **H3 precomputation**: Resolutions 4–12 computed in parallel (rayon) at startup - **Histogram percentiles without sorting**: O(n) two-pass algorithm — build histogram, interpolate percentiles -- **Direct JSON writing**: Hexagon endpoint writes JSON via string buffer, avoids serde_json::Value allocations +- **Startup precomputation**: Static responses (like `/api/features`) are computed once at startup and cached in `AppState` - **POI transform validation**: Fails if any OSM category is unmapped — guarantees exhaustive coverage - **Fuzzy join**: Groups by postcode, uses `thefuzz.token_sort_ratio` with numeric token compatibility, greedy assignment from highest score - **Filter bounds format**: `south,west,north,east` (not standard bbox order) - **POI proximity**: Uses 0.05° grid (~5km cells) to reduce candidates before haversine distance check +- **OG tag injection**: Uses a distinctive placeholder (e.g. `__NARROWIT_OG_TAGS__`) in HTML, replaced at runtime by middleware diff --git a/Dockerfile b/Dockerfile index c690687..133564c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,4 +22,4 @@ COPY --from=frontend /app/frontend/dist ./dist/ EXPOSE 8001 ENTRYPOINT ["./property-map-server"] -CMD ["--data", "/data/wide.parquet", "--pois", "/data/filtered_uk_pois.parquet"] +CMD ["--data", "/data/wide.parquet", "--pois", "/data/filtered_uk_pois.parquet", "--tiles", "/data/uk.pmtiles", "--postcodes", "/data/postcodes"] diff --git a/Taskfile.data.yml b/Taskfile.data.yml index
76c6ec6..a7b340b 100644 --- a/Taskfile.data.yml +++ b/Taskfile.data.yml @@ -2,6 +2,7 @@ version: '3' vars: DATA_DIR: /bulk/property-data + TILES_OUTPUT: "{{.DATA_DIR}}/uk.pmtiles" ARCGIS_OUTPUT: "{{.DATA_DIR}}/arcgis_data.parquet" PRICE_PAID_OUTPUT: "{{.DATA_DIR}}/price-paid-complete.parquet" IOD_OUTPUT: "{{.DATA_DIR}}/IoD2025_Scores.parquet" @@ -21,8 +22,31 @@ vars: NAPTAN_OUTPUT: "{{.DATA_DIR}}/naptan.parquet" BROADBAND_OUTPUT: "{{.DATA_DIR}}/broadband.parquet" SCHOOL_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/school_proximity.parquet" + POSTCODES_OUTPUT: "{{.DATA_DIR}}/postcodes" tasks: + download:tiles: + desc: Download UK map tiles (PMTiles format from Protomaps) + status: + - test -f {{.TILES_OUTPUT}} + vars: + PMTILES_VERSION: "1.22.3" + PMTILES_BIN: "{{.DATA_DIR}}/pmtiles" + cmds: + - | + echo "Downloading UK PMTiles (~1.5GB)..." + echo "This extracts UK tiles from the Protomaps planet file." + echo "" + # Download pmtiles CLI if not present + if [ ! -f "{{.PMTILES_BIN}}" ]; then + echo "Downloading pmtiles CLI v{{.PMTILES_VERSION}}..." 
+ curl -sL "https://github.com/protomaps/go-pmtiles/releases/download/v{{.PMTILES_VERSION}}/go-pmtiles_{{.PMTILES_VERSION}}_Linux_x86_64.tar.gz" | tar -xz -C "{{.DATA_DIR}}" pmtiles + chmod +x "{{.PMTILES_BIN}}" + fi + # Extract UK region (bbox: -10.5,49.5,2.5,61) + # Using a recent daily build from Protomaps + "{{.PMTILES_BIN}}" extract https://build.protomaps.com/20260201.pmtiles {{.TILES_OUTPUT}} --bbox=-10.5,49.5,2.5,61 + prompt:epc: desc: Prompt user to download EPC dataset (requires registration) status: @@ -110,6 +134,12 @@ tasks: cmds: - uv run python -m pipeline.download.broadband --output {{.BROADBAND_OUTPUT}} + download:postcodes: + desc: Download GB postcodes data from MapIt + status: + - test -f {{.POSTCODES_OUTPUT}} + cmds: + - uv run python -m pipeline.download.postcodes --output {{.POSTCODES_OUTPUT}} download:noise: desc: Download Defra noise data (road, rail, airport) sampled at postcode centroids diff --git a/Taskfile.yml b/Taskfile.yml index fbbb156..677d189 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -5,10 +5,6 @@ includes: taskfile: ./Taskfile.data.yml flatten: true -vars: - DATA_DIR: /bulk/property-data - WIDE_OUTPUT: "{{.DATA_DIR}}/wide.parquet" - POIS_FILTERED_OUTPUT: "{{.DATA_DIR}}/filtered_uk_pois.parquet" tasks: install: @@ -33,13 +29,13 @@ tasks: desc: Run Rust backend on port 8001 (debug build, fast compile) dir: server-rs cmds: - - cargo run -- --data {{.WIDE_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} + - cargo run -- --data {{.WIDE_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --tiles {{.TILES_OUTPUT}} --postcodes {{.POSTCODES_OUTPUT}} dev:server:release: desc: Run Rust backend on port 8001 (release build) dir: server-rs cmds: - - cargo run --release -- --data {{.WIDE_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} + - cargo run --release -- --data {{.WIDE_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --tiles {{.TILES_OUTPUT}} --postcodes {{.POSTCODES_OUTPUT}} dev:og: desc: Run OG screenshot sidecar on port 8002 diff --git 
a/frontend/src/lib/map-utils.ts b/frontend/src/lib/map-utils.ts index 2ece3c8..e5c238f 100644 --- a/frontend/src/lib/map-utils.ts +++ b/frontend/src/lib/map-utils.ts @@ -1,7 +1,8 @@ import type { ViewState, Bounds } from '../types'; -export const MAP_STYLE_LIGHT = 'https://basemaps.cartocdn.com/gl/voyager-gl-style/style.json'; -export const MAP_STYLE_DARK = 'https://basemaps.cartocdn.com/gl/dark-matter-gl-style/style.json'; +// Self-hosted tile styles from server +export const MAP_STYLE_LIGHT = '/api/tiles/style.json?theme=light'; +export const MAP_STYLE_DARK = '/api/tiles/style.json?theme=dark'; export const GRADIENT: { t: number; color: [number, number, number] }[] = [ { t: 0, color: [46, 204, 113] }, @@ -54,6 +55,9 @@ export function countToColor(t: number): [number, number, number] { return DENSITY_GRADIENT[DENSITY_GRADIENT.length - 1].color; } +/** Zoom threshold at which we switch from hexagons to postcode polygons */ +export const POSTCODE_ZOOM_THRESHOLD = 15; + export function zoomToResolution(zoom: number): number { if (zoom < 6) return 5; if (zoom < 7) return 6; diff --git a/server-rs/src/routes/tiles.rs b/server-rs/src/routes/tiles.rs new file mode 100644 index 0000000..ef868e3 --- /dev/null +++ b/server-rs/src/routes/tiles.rs @@ -0,0 +1,257 @@ +use std::sync::Arc; + +use axum::extract::{Path, Query, State}; +use axum::http::{header, HeaderMap, StatusCode}; +use axum::response::{IntoResponse, Response}; +use pmtiles::async_reader::AsyncPmTilesReader; +use pmtiles::MmapBackend; +use serde::Deserialize; +use tracing::warn; + +pub type TileReader = AsyncPmTilesReader; + +pub async fn get_tile( + State(reader): State>, + Path((z, x, y)): Path<(u8, u32, u32)>, +) -> Response { + match reader.get_tile(z, x as u64, y as u64).await { + Ok(Some(tile_bytes)) => ( + StatusCode::OK, + [ + (header::CONTENT_TYPE, "application/x-protobuf"), + (header::CONTENT_ENCODING, "gzip"), + (header::CACHE_CONTROL, "public, max-age=86400"), + ], + tile_bytes.to_vec(), + ) + 
.into_response(), + Ok(None) => StatusCode::NO_CONTENT.into_response(), + Err(err) => { + warn!(z, x, y, error = %err, "Failed to get tile"); + StatusCode::INTERNAL_SERVER_ERROR.into_response() + } + } +} + +#[derive(Deserialize)] +pub struct StyleParams { + #[serde(default)] + theme: Option, +} + +pub async fn get_style( + State(reader): State>, + headers: HeaderMap, + Query(params): Query, +) -> Response { + let is_dark = params.theme.as_deref() == Some("dark"); + + // Metadata is returned as a JSON string + let metadata_str = match reader.get_metadata().await { + Ok(meta) => meta, + Err(err) => { + warn!(error = %err, "Failed to get PMTiles metadata"); + return StatusCode::INTERNAL_SERVER_ERROR.into_response(); + } + }; + + // Parse the JSON string + let metadata: serde_json::Value = match serde_json::from_str(&metadata_str) { + Ok(v) => v, + Err(err) => { + warn!(error = %err, "Failed to parse PMTiles metadata JSON"); + serde_json::Value::Object(serde_json::Map::new()) + } + }; + + // Extract tilestats for layer info if available + let layers: Vec = metadata + .get("vector_layers") + .and_then(|v| v.as_array()) + .cloned() + .unwrap_or_default(); + + // Build absolute tile URL using the request host + let host = headers + .get(header::HOST) + .and_then(|h| h.to_str().ok()) + .unwrap_or("localhost:8001"); + let tile_url = format!("http://{}/api/tiles/{{z}}/{{x}}/{{y}}", host); + let style = build_style(is_dark, &layers, &tile_url); + + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/json")], + serde_json::to_string(&style).unwrap(), + ) + .into_response() +} + +fn build_style(is_dark: bool, layers: &[serde_json::Value], tile_url: &str) -> serde_json::Value { + let (bg_color, water_color, land_color, road_color, text_color, text_halo) = if is_dark { + ("#1a1a1a", "#193447", "#1a1a1a", "#2a2a2a", "#888888", "#000000") + } else { + ("#f8f4f0", "#aad3df", "#f8f4f0", "#ffffff", "#333333", "#ffffff") + }; + + // Build layer list from metadata + let 
layer_ids: Vec<&str> = layers + .iter() + .filter_map(|l| l.get("id").and_then(|v| v.as_str())) + .collect(); + + let mut style_layers = vec![serde_json::json!({ + "id": "background", + "type": "background", + "paint": { "background-color": bg_color } + })]; + + // Water layer + if layer_ids.contains(&"water") { + style_layers.push(serde_json::json!({ + "id": "water", + "type": "fill", + "source": "protomaps", + "source-layer": "water", + "paint": { "fill-color": water_color } + })); + } + + // Land/earth layer + if layer_ids.contains(&"earth") { + style_layers.push(serde_json::json!({ + "id": "earth", + "type": "fill", + "source": "protomaps", + "source-layer": "earth", + "paint": { "fill-color": land_color } + })); + } + + // Landuse + if layer_ids.contains(&"landuse") { + let landuse_color = if is_dark { "#1f2d1f" } else { "#d8e8c8" }; + style_layers.push(serde_json::json!({ + "id": "landuse-park", + "type": "fill", + "source": "protomaps", + "source-layer": "landuse", + "filter": ["any", + ["==", ["get", "pmap:kind"], "park"], + ["==", ["get", "pmap:kind"], "nature_reserve"], + ["==", ["get", "pmap:kind"], "forest"] + ], + "paint": { "fill-color": landuse_color, "fill-opacity": 0.5 } + })); + } + + // Roads + if layer_ids.contains(&"roads") { + let road_casing = if is_dark { "#111111" } else { "#cccccc" }; + style_layers.extend(vec![ + serde_json::json!({ + "id": "roads-casing", + "type": "line", + "source": "protomaps", + "source-layer": "roads", + "filter": ["!=", ["get", "pmap:kind"], "path"], + "paint": { + "line-color": road_casing, + "line-width": ["interpolate", ["linear"], ["zoom"], 10, 1, 18, 12] + } + }), + serde_json::json!({ + "id": "roads", + "type": "line", + "source": "protomaps", + "source-layer": "roads", + "filter": ["!=", ["get", "pmap:kind"], "path"], + "paint": { + "line-color": road_color, + "line-width": ["interpolate", ["linear"], ["zoom"], 10, 0.5, 18, 8] + } + }), + ]); + } + + // Buildings + if layer_ids.contains(&"buildings") { + let 
building_color = if is_dark { "#252525" } else { "#e8e4e0" }; + style_layers.push(serde_json::json!({ + "id": "buildings", + "type": "fill", + "source": "protomaps", + "source-layer": "buildings", + "minzoom": 14, + "paint": { "fill-color": building_color, "fill-opacity": 0.8 } + })); + } + + // Waterway labels - this layer ID is used by deck.gl as an insertion point + // for interleaved layers (beforeId: 'waterway_label') + if layer_ids.contains(&"water") { + style_layers.push(serde_json::json!({ + "id": "waterway_label", + "type": "symbol", + "source": "protomaps", + "source-layer": "water", + "filter": ["all", ["has", "name"], ["==", ["geometry-type"], "LineString"]], + "layout": { + "text-field": ["get", "name"], + "text-font": ["Noto Sans Regular"], + "text-size": 10, + "symbol-placement": "line" + }, + "paint": { + "text-color": water_color, + "text-halo-color": text_halo, + "text-halo-width": 1 + } + })); + } + + // Place labels + if layer_ids.contains(&"places") { + style_layers.push(serde_json::json!({ + "id": "place-labels", + "type": "symbol", + "source": "protomaps", + "source-layer": "places", + "filter": ["has", "name"], + "layout": { + "text-field": ["get", "name"], + "text-font": ["Noto Sans Regular"], + "text-size": ["interpolate", ["linear"], ["zoom"], + 6, ["match", ["get", "pmap:kind"], "city", 12, "town", 10, 8], + 14, ["match", ["get", "pmap:kind"], "city", 24, "town", 18, 14] + ], + "text-max-width": 8 + }, + "paint": { + "text-color": text_color, + "text-halo-color": text_halo, + "text-halo-width": 1.5 + } + })); + } + + serde_json::json!({ + "version": 8, + "name": if is_dark { "Dark" } else { "Light" }, + "glyphs": "https://protomaps.github.io/basemaps-assets/fonts/{fontstack}/{range}.pbf", + "sources": { + "protomaps": { + "type": "vector", + "tiles": [tile_url], + "maxzoom": 15 + } + }, + "layers": style_layers + }) +} + +pub async fn init_tile_reader(path: &std::path::Path) -> anyhow::Result { + let backend = 
MmapBackend::try_from(path).await?; + let reader = AsyncPmTilesReader::try_from_source(backend).await?; + Ok(reader) +}