From 02712f41e8888835415aac4a9833ed84a7d8f2fa Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sun, 15 Mar 2026 14:03:38 +0000 Subject: [PATCH] England only --- Makefile.data | 23 +-- frontend/src/components/map/AiFilterInput.tsx | 142 ++++++++++++++---- pipeline/download/england_boundary.py | 45 ++++++ pipeline/download/map_assets.py | 10 ++ pipeline/download/places.py | 30 +++- pipeline/download/pois.py | 39 +++-- pipeline/transform/transform_poi.py | 32 +++- pipeline/utils/england_geometry.py | 33 ++++ 8 files changed, 294 insertions(+), 60 deletions(-) create mode 100644 pipeline/download/england_boundary.py create mode 100644 pipeline/utils/england_geometry.py diff --git a/Makefile.data b/Makefile.data index bf5f683..21d7b6e 100644 --- a/Makefile.data +++ b/Makefile.data @@ -46,11 +46,12 @@ PC_BOUNDARIES := $(MANUAL_DATA)/postcode_boundaries TRANSIT_DIR := $(DATA_DIR)/transit TRANSIT_STAMP := $(TRANSIT_DIR)/.done GREENSPACE := $(DATA_DIR)/greenspace_water.parquet -PBF := $(DATA_DIR)/great-britain-latest.osm.pbf +PBF := $(DATA_DIR)/england-latest.osm.pbf PLACES := $(DATA_DIR)/places.parquet LISTINGS_BUY := $(DATA_DIR)/online_listings_buy.parquet LISTINGS_RENT := $(DATA_DIR)/online_listings_rent.parquet LSOA_POP := $(DATA_DIR)/lsoa_population.parquet +ENGLAND_BOUNDARY := $(DATA_DIR)/england_boundary.geojson RM_OUTCODES := frontend/src/lib/rightmove-outcodes.json # Sentinel files for directory targets (Make doesn't track directories well) @@ -65,7 +66,7 @@ PMTILES_VERSION := 1.22.3 download-arcgis download-price-paid download-deprivation download-ethnicity \ download-naptan download-pois download-ofsted download-broadband download-rental-prices \ download-postcodes download-geosure download-noise download-inspire \ - download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places download-lsoa-population download-rightmove-outcodes \ + download-oa-boundaries download-uprn-lookup download-transit-network 
download-greenspace download-pbf download-places download-lsoa-population download-england-boundary download-rightmove-outcodes \ transform-pois transform-epc-pp transform-crime transform-poi-proximity \ transform-school-proximity transform-geosure transform-postcode-boundaries \ generate-postcode-boundaries @@ -93,6 +94,7 @@ download-greenspace: $(GREENSPACE) download-pbf: $(PBF) download-places: $(PLACES) download-lsoa-population: $(LSOA_POP) +download-england-boundary: $(ENGLAND_BOUNDARY) download-rightmove-outcodes: $(RM_OUTCODES) transform-pois: $(POIS_FILTERED) transform-epc-pp: $(EPC_PP) @@ -141,11 +143,11 @@ $(NAPTAN): $(PBF): @mkdir -p $(DATA_DIR) - curl -L -o $@.tmp https://download.geofabrik.de/europe/great-britain-latest.osm.pbf + curl -L -o $@.tmp https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf mv $@.tmp $@ -$(POIS_RAW): $(PBF) - uv run python -m pipeline.download.pois --output $@ --pbf $(PBF) +$(POIS_RAW): $(PBF) $(ENGLAND_BOUNDARY) + uv run python -m pipeline.download.pois --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY) $(OFSTED): uv run python -m pipeline.download.ofsted --output $@ @@ -183,19 +185,22 @@ $(RENTAL): $(GREENSPACE): $(PBF) uv run python -m pipeline.download.greenspace_water --output $@ --pbf $(PBF) -$(PLACES): $(PBF) - uv run python -m pipeline.download.places --output $@ --pbf $(PBF) +$(PLACES): $(PBF) $(ENGLAND_BOUNDARY) + uv run python -m pipeline.download.places --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY) $(LSOA_POP): uv run python -m pipeline.download.lsoa_population --output $@ +$(ENGLAND_BOUNDARY): + uv run python -m pipeline.download.england_boundary --output $@ + $(RM_OUTCODES): $(MERGE_STAMP) uv run python -m pipeline.download.rightmove_outcodes --postcodes $(POSTCODES_PQ) --output $@ # ── Transforms ──────────────────────────────────────────────────────────────── -$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN) - uv run python -m pipeline.transform.transform_poi --input $(POIS_RAW) 
--naptan $(NAPTAN) --output $@ +$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN) $(ENGLAND_BOUNDARY) + uv run python -m pipeline.transform.transform_poi --input $(POIS_RAW) --naptan $(NAPTAN) --boundary $(ENGLAND_BOUNDARY) --output $@ $(EPC_PP): $(PRICE_PAID) $(EPC) uv run python -m pipeline.transform.join_epc_pp --epc $(EPC) --price-paid $(PRICE_PAID) --output $@ diff --git a/frontend/src/components/map/AiFilterInput.tsx b/frontend/src/components/map/AiFilterInput.tsx index e7e5c55..371e5c0 100644 --- a/frontend/src/components/map/AiFilterInput.tsx +++ b/frontend/src/components/map/AiFilterInput.tsx @@ -1,8 +1,42 @@ -import { memo, useState, useCallback } from 'react'; +import { memo, useState, useCallback, useEffect, useRef } from 'react'; import { SpinnerIcon } from '../ui/icons/SpinnerIcon'; import { SparklesIcon } from '../ui/icons/SparklesIcon'; import type { AiFilterErrorType } from '../../hooks/useAiFilters'; +const EXAMPLE_QUERIES = [ + 'Safe area near good schools', + '30 min commute to Kings Cross, under 500k', + 'Quiet village, 3 bed, fast broadband', +]; + +const LOADING_MESSAGES = [ + 'Analysing your query...', + 'Searching for destinations...', + 'Generating filters...', +]; + +/** Cycle through loading messages to show progress. 
*/ +function useLoadingMessage(loading: boolean): string { + const [index, setIndex] = useState(0); + const timerRef = useRef>(); + + useEffect(() => { + if (!loading) { + setIndex(0); + return; + } + // Advance to the second message after 1.5s and to the third after 3.5s + timerRef.current = setTimeout(() => setIndex(1), 1500); + const t2 = setTimeout(() => setIndex(2), 3500); + return () => { + clearTimeout(timerRef.current); + clearTimeout(t2); + }; + }, [loading]); + + return LOADING_MESSAGES[index]; +} + interface AiFilterInputProps { loading: boolean; error: string | null; @@ -23,6 +57,8 @@ export default memo(function AiFilterInput({ onLoginRequired, }: AiFilterInputProps) { const [query, setQuery] = useState(''); + const [expanded, setExpanded] = useState(false); + const loadingMessage = useLoadingMessage(loading); const handleSubmit = useCallback( (e: React.FormEvent) => { @@ -38,36 +74,90 @@ export default memo(function AiFilterInput({ [query, loading, isLoggedIn, onLoginRequired, onSubmit] ); + const handleExampleClick = useCallback( + (example: string) => { + if (loading) return; + setQuery(example); + if (!isLoggedIn) { + onLoginRequired(); + return; + } + onSubmit(example); + }, + [loading, isLoggedIn, onLoginRequired, onSubmit] + ); + const hasContent = query.trim().length > 0; + const showExamples = expanded && !hasContent && !loading && !error && !notes; + + if (!expanded) { + return ( +
+ +
+ ); + } return (
+
+ + AI Search + — describe what you're looking for +
-
- - setQuery(e.target.value)} - placeholder="Describe your ideal area..." - className="w-full pl-7 pr-2.5 py-1.5 text-sm rounded-lg border border-warm-200 dark:border-warm-700 bg-warm-50 dark:bg-warm-800 text-warm-700 dark:text-warm-200 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-2 focus:ring-teal-400 focus:bg-white dark:focus:bg-warm-800" - disabled={loading} - /> -
- {(hasContent || loading) && ( - - )} + setQuery(e.target.value)} + placeholder="e.g. quiet area, under 400k, near good schools..." + className="flex-1 px-2.5 py-1.5 text-sm rounded-lg border border-warm-200 dark:border-warm-700 bg-warm-50 dark:bg-warm-800 text-warm-700 dark:text-warm-200 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-2 focus:ring-teal-400 focus:bg-white dark:focus:bg-warm-800" + disabled={loading} + autoFocus + /> +
+ {loading && ( +

+ {loadingMessage} +

+ )} + {showExamples && ( +
+ {EXAMPLE_QUERIES.map((example) => ( + + ))} +
+ )} {error && errorType === 'verification' && (

Please verify your email address to use AI-powered search. Check your inbox for a verification link. @@ -83,7 +173,7 @@ export default memo(function AiFilterInput({ {error}

)} - {notes && !error && ( + {notes && !error && !loading && (

{notes}

diff --git a/pipeline/download/england_boundary.py b/pipeline/download/england_boundary.py new file mode 100644 index 0000000..f4f8897 --- /dev/null +++ b/pipeline/download/england_boundary.py @@ -0,0 +1,45 @@ +"""Download England country boundary GeoJSON from ONS Open Geography Portal. + +Source: ONS Countries (December 2024) Boundaries UK BGC (Generalised Clipped) +Licence: OGL v3 +""" + +import argparse +from pathlib import Path + +import httpx + +# ArcGIS REST API — query for England only, generalised (BGC) resolution +URL = ( + "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/" + "Countries_December_2024_Boundaries_UK_BGC/FeatureServer/0/query" + "?where=CTRY24NM%3D%27England%27&outFields=CTRY24NM&f=geojson" +) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Download England country boundary GeoJSON" + ) + parser.add_argument( + "--output", type=Path, required=True, help="Output GeoJSON file path" + ) + args = parser.parse_args() + args.output.parent.mkdir(parents=True, exist_ok=True) + + print("Downloading England boundary from ONS...") + response = httpx.get(URL, follow_redirects=True, timeout=60) + response.raise_for_status() + + data = response.json() + features = data.get("features", []) + if len(features) != 1: + raise ValueError(f"Expected 1 feature for England, got {len(features)}") + + args.output.write_text(response.text) + size_kb = args.output.stat().st_size / 1024 + print(f"Saved to {args.output} ({size_kb:.0f} KB)") + + +if __name__ == "__main__": + main() diff --git a/pipeline/download/map_assets.py b/pipeline/download/map_assets.py index ec47c8d..5bc8dae 100644 --- a/pipeline/download/map_assets.py +++ b/pipeline/download/map_assets.py @@ -7,6 +7,7 @@ from pathlib import Path from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts" +SPRITES_BASE = "https://protomaps.github.io/basemaps-assets/sprites/v4" TWEMOJI_BASE = 
"https://cdn.jsdelivr.net/gh/twitter/twemoji@14.0.2/assets/72x72" # Font stacks used by @protomaps/basemaps with lang='en' @@ -77,6 +78,15 @@ def main(): url = f"{GLYPHS_BASE}/{font_encoded}/{name}" tasks.append((url, font_dir / name)) + # Sprite sheets (light/dark, 1x and 2x) + sprites_dir = out / "sprites" + for theme in ("light", "dark"): + for suffix in ("json", "png"): + url = f"{SPRITES_BASE}/{theme}.{suffix}" + tasks.append((url, sprites_dir / f"{theme}.{suffix}")) + url_2x = f"{SPRITES_BASE}/{theme}@2x.{suffix}" + tasks.append((url_2x, sprites_dir / f"{theme}@2x.{suffix}")) + # Twemoji PNGs twemoji_dir = out / "twemoji" for code in twemoji_codes: diff --git a/pipeline/download/places.py b/pipeline/download/places.py index 5e487b7..a441c1e 100644 --- a/pipeline/download/places.py +++ b/pipeline/download/places.py @@ -2,7 +2,7 @@ Extracts named place nodes (cities, towns, suburbs, etc.) and railway stations (tube, national rail, DLR, etc.) for typeahead search. -Reuses the same great-britain-latest.osm.pbf as pois.py. +Reuses the same england-latest.osm.pbf as pois.py. 
""" import argparse @@ -10,9 +10,16 @@ from pathlib import Path import osmium import polars as pl +from shapely.geometry import Point from tqdm import tqdm -from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST +from pipeline.download.pois import ( + ENGLAND_BBOX_EAST, + ENGLAND_BBOX_NORTH, + ENGLAND_BBOX_SOUTH, + ENGLAND_BBOX_WEST, +) +from pipeline.utils.england_geometry import load_england_polygon PLACE_TYPES = {"city"} @@ -57,10 +64,11 @@ def _station_display_name(name: str, tags: dict[str, str]) -> str: class PlaceHandler(osmium.SimpleHandler): - def __init__(self, progress: tqdm) -> None: + def __init__(self, progress: tqdm, england_polygon) -> None: super().__init__() self._progress = progress self.places: list[dict] = [] + self._england = england_polygon def _add( self, name: str, place_type: str, lat: float, lon: float, population: int @@ -82,10 +90,12 @@ class PlaceHandler(osmium.SimpleHandler): return lat, lon = n.location.lat, n.location.lon if not ( - UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH - and UK_BBOX_WEST <= lon <= UK_BBOX_EAST + ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH + and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST ): return + if not self._england.contains(Point(lon, lat)): + return name = n.tags.get("name:en", n.tags.get("name", "")) if not name: @@ -124,9 +134,17 @@ def main() -> None: parser.add_argument( "--pbf", type=Path, required=True, help="Path to OSM PBF file" ) + parser.add_argument( + "--boundary", + type=Path, + required=True, + help="England boundary GeoJSON file", + ) args = parser.parse_args() pbf_file = args.pbf + england_polygon = load_england_polygon(args.boundary) + print("Extracting place nodes: cities + railway stations") with tqdm( unit=" elements", @@ -135,7 +153,7 @@ def main() -> None: smoothing=0.05, mininterval=1.0, ) as progress: - handler = PlaceHandler(progress) + handler = PlaceHandler(progress, england_polygon) handler.apply_file(str(pbf_file), locations=True) print(f"Extracted 
{len(handler.places):,} place nodes") diff --git a/pipeline/download/pois.py b/pipeline/download/pois.py index 74de703..0b39f8b 100644 --- a/pipeline/download/pois.py +++ b/pipeline/download/pois.py @@ -4,17 +4,20 @@ from tempfile import mkdtemp import osmium import polars as pl +from shapely.geometry import Point from tqdm import tqdm +from pipeline.utils.england_geometry import load_england_polygon BATCH_SIZE = 50_000 MIN_OCCURENCE_COUNT = 20 -UK_BBOX_WEST = -7.57 -UK_BBOX_SOUTH = 49.96 -UK_BBOX_EAST = 1.68 -UK_BBOX_NORTH = 58.64 +# Bounding box for fast pre-filtering before the precise polygon check +ENGLAND_BBOX_WEST = -6.45 +ENGLAND_BBOX_SOUTH = 49.85 +ENGLAND_BBOX_EAST = 1.77 +ENGLAND_BBOX_NORTH = 55.82 POI_TAG_KEYS: list[str] = [ "amenity", @@ -31,19 +34,23 @@ POI_TAG_KEYS: list[str] = [ class POIHandler(osmium.SimpleHandler): - def __init__(self, progress: tqdm, tmp_dir: Path) -> None: + def __init__(self, progress: tqdm, tmp_dir: Path, england_polygon) -> None: super().__init__() self._batch: list[dict] = [] self._tmp_dir = tmp_dir self._batch_num = 0 self.poi_count = 0 self._progress = progress + self._england = england_polygon - def _in_uk(self, lat: float, lon: float) -> bool: - return ( - UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH - and UK_BBOX_WEST <= lon <= UK_BBOX_EAST - ) + def _in_england(self, lat: float, lon: float) -> bool: + # Fast bbox pre-filter, then precise polygon check + if not ( + ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH + and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST + ): + return False + return self._england.contains(Point(lon, lat)) def _match_tags(self, tags: osmium.osm.TagList) -> list[str]: return [f"{key}/{tags[key]}" for key in POI_TAG_KEYS if key in tags] @@ -90,7 +97,7 @@ class POIHandler(osmium.SimpleHandler): if not n.location.valid: return lat, lon = n.location.lat, n.location.lon - if not self._in_uk(lat, lon): + if not self._in_england(lat, lon): return categories = self._match_tags(n.tags) for category in 
categories: @@ -107,11 +114,19 @@ def main() -> None: parser.add_argument( "--pbf", type=Path, required=True, help="Path to OSM PBF file" ) + parser.add_argument( + "--boundary", + type=Path, + required=True, + help="England boundary GeoJSON file", + ) args = parser.parse_args() pbf_file = args.pbf print(f"Tag keys: {POI_TAG_KEYS}") + england_polygon = load_england_polygon(args.boundary) + tmp_dir = Path(mkdtemp(prefix="pois_")) with tqdm( unit=" elements", @@ -120,7 +135,7 @@ def main() -> None: smoothing=0.05, mininterval=1.0, ) as progress: - handler = POIHandler(progress, tmp_dir) + handler = POIHandler(progress, tmp_dir, england_polygon) handler.apply_file(str(pbf_file), locations=True) handler._flush_batch() # write any remaining POIs diff --git a/pipeline/transform/transform_poi.py b/pipeline/transform/transform_poi.py index 777d2d4..47dfdfc 100644 --- a/pipeline/transform/transform_poi.py +++ b/pipeline/transform/transform_poi.py @@ -3,6 +3,8 @@ from pathlib import Path import polars as pl +from pipeline.utils.england_geometry import in_england_mask + DROP_CATEGORIES = { # Street furniture & infrastructure @@ -1056,7 +1058,11 @@ NAPTAN_EMOJIS: dict[str, str] = { } -def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame: +def transform( + input_path: Path, + naptan_path: Path | None = None, + boundary_path: Path | None = None, +) -> pl.LazyFrame: lf = pl.scan_parquet(input_path) # Get all unique categories present in the data @@ -1072,16 +1078,14 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame if unmapped: raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}") - # Verify every CATEGORY_MAP key actually exists in the data (catch typos) + # Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts) mapped_but_absent = [] all_set = set(all_categories) for cat in CATEGORY_MAP: if cat not in all_set: mapped_but_absent.append(cat) if mapped_but_absent: - raise 
ValueError( - f"CATEGORY_MAP contains categories not in data: {sorted(mapped_but_absent)}" - ) + print(f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}") # Drop unwanted categories lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES))) @@ -1105,7 +1109,15 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame pl.col("category").replace_strict(emoji_mapping).alias("emoji"), ) - naptan = pl.scan_parquet(naptan_path).with_columns( + naptan_df = pl.scan_parquet(naptan_path).collect() + if boundary_path is not None: + mask = in_england_mask( + boundary_path, + naptan_df["lat"].to_numpy(), + naptan_df["lng"].to_numpy(), + ) + naptan_df = naptan_df.filter(pl.Series(mask)) + naptan = naptan_df.lazy().with_columns( pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"), pl.lit("Public Transport").alias("group"), ) @@ -1122,12 +1134,18 @@ def main(): parser.add_argument( "--naptan", type=Path, required=True, help="NaPTAN stations parquet file" ) + parser.add_argument( + "--boundary", + type=Path, + required=True, + help="England boundary GeoJSON file", + ) parser.add_argument( "--output", type=Path, required=True, help="Output filtered POIs parquet file" ) args = parser.parse_args() - df = transform(args.input, args.naptan).collect(engine="streaming") + df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming") df.write_parquet(args.output) diff --git a/pipeline/utils/england_geometry.py b/pipeline/utils/england_geometry.py new file mode 100644 index 0000000..b545363 --- /dev/null +++ b/pipeline/utils/england_geometry.py @@ -0,0 +1,33 @@ +"""England boundary polygon for accurate point-in-country filtering. + +Uses shapely prepared geometry for fast single-point checks (osmium handlers) +and vectorized shapely.contains for batch checks (Polars DataFrames). 
+""" + +import json +from pathlib import Path + +import numpy as np +import shapely +from shapely.geometry import shape +from shapely.prepared import PreparedGeometry, prep + + +def load_england_polygon(geojson_path: Path) -> PreparedGeometry: + """Load England boundary as a prepared shapely polygon for fast contains checks.""" + with open(geojson_path) as f: + data = json.load(f) + geometry = shape(data["features"][0]["geometry"]) + return prep(geometry) + + +def in_england_mask(geojson_path: Path, lats: np.ndarray, lngs: np.ndarray) -> np.ndarray: + """Vectorized check: which (lat, lng) points are within England. + + Returns a boolean numpy array. + """ + with open(geojson_path) as f: + data = json.load(f) + polygon = shape(data["features"][0]["geometry"]) + pts = shapely.points(lngs, lats) + return shapely.contains(polygon, pts)