England only

This commit is contained in:
Andras Schmelczer 2026-03-15 14:03:38 +00:00
parent 4d08f5d08d
commit 02712f41e8
8 changed files with 294 additions and 60 deletions

View file

@ -46,11 +46,12 @@ PC_BOUNDARIES := $(MANUAL_DATA)/postcode_boundaries
TRANSIT_DIR := $(DATA_DIR)/transit
TRANSIT_STAMP := $(TRANSIT_DIR)/.done
GREENSPACE := $(DATA_DIR)/greenspace_water.parquet
PBF := $(DATA_DIR)/great-britain-latest.osm.pbf
PBF := $(DATA_DIR)/england-latest.osm.pbf
PLACES := $(DATA_DIR)/places.parquet
LISTINGS_BUY := $(DATA_DIR)/online_listings_buy.parquet
LISTINGS_RENT := $(DATA_DIR)/online_listings_rent.parquet
LSOA_POP := $(DATA_DIR)/lsoa_population.parquet
ENGLAND_BOUNDARY := $(DATA_DIR)/england_boundary.geojson
RM_OUTCODES := frontend/src/lib/rightmove-outcodes.json
# Sentinel files for directory targets (Make doesn't track directories well)
@ -65,7 +66,7 @@ PMTILES_VERSION := 1.22.3
download-arcgis download-price-paid download-deprivation download-ethnicity \
download-naptan download-pois download-ofsted download-broadband download-rental-prices \
download-postcodes download-geosure download-noise download-inspire \
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places download-lsoa-population download-rightmove-outcodes \
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places download-lsoa-population download-england-boundary download-rightmove-outcodes \
transform-pois transform-epc-pp transform-crime transform-poi-proximity \
transform-school-proximity transform-geosure transform-postcode-boundaries \
generate-postcode-boundaries
@ -93,6 +94,7 @@ download-greenspace: $(GREENSPACE)
download-pbf: $(PBF)
download-places: $(PLACES)
download-lsoa-population: $(LSOA_POP)
download-england-boundary: $(ENGLAND_BOUNDARY)
download-rightmove-outcodes: $(RM_OUTCODES)
transform-pois: $(POIS_FILTERED)
transform-epc-pp: $(EPC_PP)
@ -141,11 +143,11 @@ $(NAPTAN):
$(PBF):
@mkdir -p $(DATA_DIR)
curl -L -o $@.tmp https://download.geofabrik.de/europe/great-britain-latest.osm.pbf
curl -L -o $@.tmp https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf
mv $@.tmp $@
$(POIS_RAW): $(PBF)
uv run python -m pipeline.download.pois --output $@ --pbf $(PBF)
$(POIS_RAW): $(PBF) $(ENGLAND_BOUNDARY)
uv run python -m pipeline.download.pois --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY)
$(OFSTED):
uv run python -m pipeline.download.ofsted --output $@
@ -183,19 +185,22 @@ $(RENTAL):
$(GREENSPACE): $(PBF)
uv run python -m pipeline.download.greenspace_water --output $@ --pbf $(PBF)
$(PLACES): $(PBF)
uv run python -m pipeline.download.places --output $@ --pbf $(PBF)
$(PLACES): $(PBF) $(ENGLAND_BOUNDARY)
uv run python -m pipeline.download.places --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY)
$(LSOA_POP):
uv run python -m pipeline.download.lsoa_population --output $@
# England boundary GeoJSON, produced by pipeline.download.england_boundary.
# No prerequisites: the rule only runs when the file does not exist yet.
$(ENGLAND_BOUNDARY):
uv run python -m pipeline.download.england_boundary --output $@
$(RM_OUTCODES): $(MERGE_STAMP)
uv run python -m pipeline.download.rightmove_outcodes --postcodes $(POSTCODES_PQ) --output $@
# ── Transforms ────────────────────────────────────────────────────────────────
$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN)
uv run python -m pipeline.transform.transform_poi --input $(POIS_RAW) --naptan $(NAPTAN) --output $@
$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN) $(ENGLAND_BOUNDARY)
uv run python -m pipeline.transform.transform_poi --input $(POIS_RAW) --naptan $(NAPTAN) --boundary $(ENGLAND_BOUNDARY) --output $@
$(EPC_PP): $(PRICE_PAID) $(EPC)
uv run python -m pipeline.transform.join_epc_pp --epc $(EPC) --price-paid $(PRICE_PAID) --output $@

View file

@ -1,8 +1,42 @@
import { memo, useState, useCallback } from 'react';
import { memo, useState, useCallback, useEffect, useRef } from 'react';
import { SpinnerIcon } from '../ui/icons/SpinnerIcon';
import { SparklesIcon } from '../ui/icons/SparklesIcon';
import type { AiFilterErrorType } from '../../hooks/useAiFilters';
const EXAMPLE_QUERIES = [
'Safe area near good schools',
'30 min commute to Kings Cross, under 500k',
'Quiet village, 3 bed, fast broadband',
];
const LOADING_MESSAGES = [
'Analysing your query...',
'Searching for destinations...',
'Generating filters...',
];
/**
 * Pick the progress message to show while an AI filter request is in flight.
 *
 * Starts at the first entry of LOADING_MESSAGES, switches to the second after
 * 1.5s and to the third after 3.5s (both measured from the moment `loading`
 * becomes true). Resets to the first entry whenever `loading` turns false.
 *
 * @param loading whether a request is currently running
 * @returns the message for the current stage
 */
function useLoadingMessage(loading: boolean): string {
  const [index, setIndex] = useState(0);
  // Explicit initial value: the zero-argument useRef() overload is not accepted
  // by React 19's typings, while this form type-checks on older versions too.
  const timerRef = useRef<ReturnType<typeof setTimeout> | undefined>(undefined);

  useEffect(() => {
    if (!loading) {
      setIndex(0);
      return;
    }

    // Stage timers are not evenly spaced: 1.5s for stage two, 3.5s for stage three.
    const t1 = setTimeout(() => setIndex(1), 1500);
    const t2 = setTimeout(() => setIndex(2), 3500);
    timerRef.current = t1;

    return () => {
      // Clear the handles captured by this effect run rather than re-reading
      // the ref, which could have been reassigned by a later run.
      clearTimeout(t1);
      clearTimeout(t2);
    };
  }, [loading]);

  return LOADING_MESSAGES[index];
}
interface AiFilterInputProps {
loading: boolean;
error: string | null;
@ -23,6 +57,8 @@ export default memo(function AiFilterInput({
onLoginRequired,
}: AiFilterInputProps) {
const [query, setQuery] = useState('');
const [expanded, setExpanded] = useState(false);
const loadingMessage = useLoadingMessage(loading);
const handleSubmit = useCallback(
(e: React.FormEvent) => {
@ -38,36 +74,90 @@ export default memo(function AiFilterInput({
[query, loading, isLoggedIn, onLoginRequired, onSubmit]
);
const handleExampleClick = useCallback(
(example: string) => {
if (loading) return;
setQuery(example);
if (!isLoggedIn) {
onLoginRequired();
return;
}
onSubmit(example);
},
[loading, isLoggedIn, onLoginRequired, onSubmit]
);
const hasContent = query.trim().length > 0;
const showExamples = expanded && !hasContent && !loading && !error && !notes;
if (!expanded) {
return (
<div className="px-3 py-2" data-tutorial="ai-filters">
<button
type="button"
onClick={() => setExpanded(true)}
className="w-full flex items-center gap-2 px-3 py-2 rounded-lg border border-dashed border-teal-300 dark:border-teal-700 bg-teal-50/50 dark:bg-teal-900/20 hover:bg-teal-50 dark:hover:bg-teal-900/30 cursor-pointer group"
>
<SparklesIcon className="w-4 h-4 text-teal-500 dark:text-teal-400 shrink-0" />
<span className="text-sm text-teal-700 dark:text-teal-300 group-hover:text-teal-800 dark:group-hover:text-teal-200">
Describe your ideal area with AI
</span>
</button>
</div>
);
}
return (
<div className="px-3 py-2" data-tutorial="ai-filters">
<div className="flex items-center gap-1.5 mb-1.5">
<SparklesIcon className="w-3.5 h-3.5 text-teal-500 dark:text-teal-400 shrink-0" />
<span className="text-xs font-medium text-teal-700 dark:text-teal-300">AI Search</span>
<span className="text-xs text-warm-400 dark:text-warm-500"> describe what you're looking for</span>
</div>
<form onSubmit={handleSubmit} className="flex items-center gap-1.5">
<div className="relative flex-1">
<SparklesIcon className="absolute left-2 top-1/2 -translate-y-1/2 w-3.5 h-3.5 text-teal-500 dark:text-teal-400 pointer-events-none" />
<input
type="text"
value={query}
onChange={(e) => setQuery(e.target.value)}
placeholder="Describe your ideal area..."
className="w-full pl-7 pr-2.5 py-1.5 text-sm rounded-lg border border-warm-200 dark:border-warm-700 bg-warm-50 dark:bg-warm-800 text-warm-700 dark:text-warm-200 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-2 focus:ring-teal-400 focus:bg-white dark:focus:bg-warm-800"
disabled={loading}
/>
</div>
{(hasContent || loading) && (
<button
type="submit"
disabled={loading || !hasContent}
className="shrink-0 px-2.5 py-1.5 rounded-lg bg-teal-600 hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed text-white text-sm font-medium flex items-center justify-center"
>
{loading ? (
<SpinnerIcon className="w-4 h-4 animate-spin" />
) : (
<SparklesIcon className="w-4 h-4" />
)}
</button>
)}
<input
type="text"
value={query}
onChange={(e) => setQuery(e.target.value)}
placeholder="e.g. quiet area, under 400k, near good schools..."
className="flex-1 px-2.5 py-1.5 text-sm rounded-lg border border-warm-200 dark:border-warm-700 bg-warm-50 dark:bg-warm-800 text-warm-700 dark:text-warm-200 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-2 focus:ring-teal-400 focus:bg-white dark:focus:bg-warm-800"
disabled={loading}
autoFocus
/>
<button
type="submit"
disabled={loading || !hasContent}
className="shrink-0 px-3 py-1.5 rounded-lg bg-teal-600 hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed text-white text-sm font-medium flex items-center gap-1.5"
>
{loading ? (
<SpinnerIcon className="w-3.5 h-3.5 animate-spin" />
) : (
<>
<SparklesIcon className="w-3.5 h-3.5" />
<span>Search</span>
</>
)}
</button>
</form>
{loading && (
<p className="mt-1 text-xs text-teal-600 dark:text-teal-400">
{loadingMessage}
</p>
)}
{showExamples && (
<div className="mt-1.5 flex flex-wrap gap-1">
{EXAMPLE_QUERIES.map((example) => (
<button
key={example}
type="button"
onClick={() => handleExampleClick(example)}
className="text-xs px-2 py-0.5 rounded-full border border-warm-200 dark:border-warm-700 text-warm-500 dark:text-warm-400 hover:border-teal-400 hover:text-teal-600 dark:hover:text-teal-400 cursor-pointer"
>
{example}
</button>
))}
</div>
)}
{error && errorType === 'verification' && (
<p className="mt-1.5 text-xs text-amber-600 dark:text-amber-400">
Please verify your email address to use AI-powered search. Check your inbox for a verification link.
@ -83,7 +173,7 @@ export default memo(function AiFilterInput({
{error}
</p>
)}
{notes && !error && (
{notes && !error && !loading && (
<p className="mt-1 text-xs text-warm-500 dark:text-warm-400 italic">
{notes}
</p>

View file

@ -0,0 +1,45 @@
"""Download England country boundary GeoJSON from ONS Open Geography Portal.
Source: ONS Countries (December 2024) Boundaries UK BGC (Generalised Clipped)
Licence: OGL v3
"""
import argparse
from pathlib import Path
import httpx
# ArcGIS REST API — query for England only, generalised (BGC) resolution
URL = (
"https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
"Countries_December_2024_Boundaries_UK_BGC/FeatureServer/0/query"
"?where=CTRY24NM%3D%27England%27&outFields=CTRY24NM&f=geojson"
)
def main() -> None:
    """Download the England boundary from the ONS service and save it as GeoJSON.

    Command-line arguments:
        --output: path of the GeoJSON file to write (parent directories are
            created if missing).

    Raises:
        httpx.HTTPStatusError: if the server answers with an error status.
        ValueError: if the response carries an error payload or does not
            contain exactly one feature.
    """
    parser = argparse.ArgumentParser(
        description="Download England country boundary GeoJSON"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output GeoJSON file path"
    )
    args = parser.parse_args()
    args.output.parent.mkdir(parents=True, exist_ok=True)

    print("Downloading England boundary from ONS...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()
    data = response.json()

    # NOTE(review): ArcGIS REST services are known to report query failures as
    # an {"error": ...} object in an otherwise successful response; surface it
    # instead of the less helpful "got 0" message below.
    if isinstance(data, dict) and "error" in data:
        raise ValueError(f"ONS boundary query failed: {data['error']}")

    # `or []` also covers an explicit `"features": null`
    features = data.get("features") or []
    if len(features) != 1:
        raise ValueError(f"Expected 1 feature for England, got {len(features)}")

    # Write to a sibling temp file and rename, so an interrupted run never
    # leaves a truncated file at the final path. GeoJSON is UTF-8 by spec.
    tmp_path = args.output.with_name(args.output.name + ".tmp")
    tmp_path.write_text(response.text, encoding="utf-8")
    tmp_path.replace(args.output)

    size_kb = args.output.stat().st_size / 1024
    print(f"Saved to {args.output} ({size_kb:.0f} KB)")
if __name__ == "__main__":
main()

View file

@ -7,6 +7,7 @@ from pathlib import Path
from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
SPRITES_BASE = "https://protomaps.github.io/basemaps-assets/sprites/v4"
TWEMOJI_BASE = "https://cdn.jsdelivr.net/gh/twitter/twemoji@14.0.2/assets/72x72"
# Font stacks used by @protomaps/basemaps with lang='en'
@ -77,6 +78,15 @@ def main():
url = f"{GLYPHS_BASE}/{font_encoded}/{name}"
tasks.append((url, font_dir / name))
# Sprite sheets (light/dark, 1x and 2x)
sprites_dir = out / "sprites"
for theme in ("light", "dark"):
for suffix in ("json", "png"):
url = f"{SPRITES_BASE}/{theme}.{suffix}"
tasks.append((url, sprites_dir / f"{theme}.{suffix}"))
url_2x = f"{SPRITES_BASE}/{theme}@2x.{suffix}"
tasks.append((url_2x, sprites_dir / f"{theme}@2x.{suffix}"))
# Twemoji PNGs
twemoji_dir = out / "twemoji"
for code in twemoji_codes:

View file

@ -2,7 +2,7 @@
Extracts named place nodes (cities, towns, suburbs, etc.) and railway stations
(tube, national rail, DLR, etc.) for typeahead search.
Reuses the same great-britain-latest.osm.pbf as pois.py.
Reuses the same england-latest.osm.pbf as pois.py.
"""
import argparse
@ -10,9 +10,16 @@ from pathlib import Path
import osmium
import polars as pl
from shapely.geometry import Point
from tqdm import tqdm
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST
from pipeline.download.pois import (
ENGLAND_BBOX_EAST,
ENGLAND_BBOX_NORTH,
ENGLAND_BBOX_SOUTH,
ENGLAND_BBOX_WEST,
)
from pipeline.utils.england_geometry import load_england_polygon
PLACE_TYPES = {"city"}
@ -57,10 +64,11 @@ def _station_display_name(name: str, tags: dict[str, str]) -> str:
class PlaceHandler(osmium.SimpleHandler):
def __init__(self, progress: tqdm) -> None:
def __init__(self, progress: tqdm, england_polygon) -> None:
super().__init__()
self._progress = progress
self.places: list[dict] = []
self._england = england_polygon
def _add(
self, name: str, place_type: str, lat: float, lon: float, population: int
@ -82,10 +90,12 @@ class PlaceHandler(osmium.SimpleHandler):
return
lat, lon = n.location.lat, n.location.lon
if not (
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
):
return
if not self._england.contains(Point(lon, lat)):
return
name = n.tags.get("name:en", n.tags.get("name", ""))
if not name:
@ -124,9 +134,17 @@ def main() -> None:
parser.add_argument(
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
)
parser.add_argument(
"--boundary",
type=Path,
required=True,
help="England boundary GeoJSON file",
)
args = parser.parse_args()
pbf_file = args.pbf
england_polygon = load_england_polygon(args.boundary)
print("Extracting place nodes: cities + railway stations")
with tqdm(
unit=" elements",
@ -135,7 +153,7 @@ def main() -> None:
smoothing=0.05,
mininterval=1.0,
) as progress:
handler = PlaceHandler(progress)
handler = PlaceHandler(progress, england_polygon)
handler.apply_file(str(pbf_file), locations=True)
print(f"Extracted {len(handler.places):,} place nodes")

View file

@ -4,17 +4,20 @@ from tempfile import mkdtemp
import osmium
import polars as pl
from shapely.geometry import Point
from tqdm import tqdm
from pipeline.utils.england_geometry import load_england_polygon
BATCH_SIZE = 50_000
MIN_OCCURENCE_COUNT = 20
UK_BBOX_WEST = -7.57
UK_BBOX_SOUTH = 49.96
UK_BBOX_EAST = 1.68
UK_BBOX_NORTH = 58.64
# Bounding box for fast pre-filtering before the precise polygon check
ENGLAND_BBOX_WEST = -6.45
ENGLAND_BBOX_SOUTH = 49.85
ENGLAND_BBOX_EAST = 1.77
ENGLAND_BBOX_NORTH = 55.82
POI_TAG_KEYS: list[str] = [
"amenity",
@ -31,19 +34,23 @@ POI_TAG_KEYS: list[str] = [
class POIHandler(osmium.SimpleHandler):
def __init__(self, progress: tqdm, tmp_dir: Path) -> None:
def __init__(self, progress: tqdm, tmp_dir: Path, england_polygon) -> None:
super().__init__()
self._batch: list[dict] = []
self._tmp_dir = tmp_dir
self._batch_num = 0
self.poi_count = 0
self._progress = progress
self._england = england_polygon
def _in_uk(self, lat: float, lon: float) -> bool:
return (
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
)
def _in_england(self, lat: float, lon: float) -> bool:
    """Return whether the point (lat, lon) is contained in the England polygon.

    The cheap bounding-box comparison runs first; the polygon test on
    ``self._england`` is only evaluated for points inside the box.
    """
    inside_bbox = (
        ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
        and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
    )
    # Short-circuit: yields False without touching the polygon when outside the box
    return inside_bbox and self._england.contains(Point(lon, lat))
def _match_tags(self, tags: osmium.osm.TagList) -> list[str]:
return [f"{key}/{tags[key]}" for key in POI_TAG_KEYS if key in tags]
@ -90,7 +97,7 @@ class POIHandler(osmium.SimpleHandler):
if not n.location.valid:
return
lat, lon = n.location.lat, n.location.lon
if not self._in_uk(lat, lon):
if not self._in_england(lat, lon):
return
categories = self._match_tags(n.tags)
for category in categories:
@ -107,11 +114,19 @@ def main() -> None:
parser.add_argument(
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
)
parser.add_argument(
"--boundary",
type=Path,
required=True,
help="England boundary GeoJSON file",
)
args = parser.parse_args()
pbf_file = args.pbf
print(f"Tag keys: {POI_TAG_KEYS}")
england_polygon = load_england_polygon(args.boundary)
tmp_dir = Path(mkdtemp(prefix="pois_"))
with tqdm(
unit=" elements",
@ -120,7 +135,7 @@ def main() -> None:
smoothing=0.05,
mininterval=1.0,
) as progress:
handler = POIHandler(progress, tmp_dir)
handler = POIHandler(progress, tmp_dir, england_polygon)
handler.apply_file(str(pbf_file), locations=True)
handler._flush_batch() # write any remaining POIs

View file

@ -3,6 +3,8 @@ from pathlib import Path
import polars as pl
from pipeline.utils.england_geometry import in_england_mask
DROP_CATEGORIES = {
# Street furniture & infrastructure
@ -1056,7 +1058,11 @@ NAPTAN_EMOJIS: dict[str, str] = {
}
def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame:
def transform(
input_path: Path,
naptan_path: Path | None = None,
boundary_path: Path | None = None,
) -> pl.LazyFrame:
lf = pl.scan_parquet(input_path)
# Get all unique categories present in the data
@ -1072,16 +1078,14 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
if unmapped:
raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")
# Verify every CATEGORY_MAP key actually exists in the data (catch typos)
# Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts)
mapped_but_absent = []
all_set = set(all_categories)
for cat in CATEGORY_MAP:
if cat not in all_set:
mapped_but_absent.append(cat)
if mapped_but_absent:
raise ValueError(
f"CATEGORY_MAP contains categories not in data: {sorted(mapped_but_absent)}"
)
print(f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}")
# Drop unwanted categories
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
@ -1105,7 +1109,15 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
)
naptan = pl.scan_parquet(naptan_path).with_columns(
naptan_df = pl.scan_parquet(naptan_path).collect()
if boundary_path is not None:
mask = in_england_mask(
boundary_path,
naptan_df["lat"].to_numpy(),
naptan_df["lng"].to_numpy(),
)
naptan_df = naptan_df.filter(pl.Series(mask))
naptan = naptan_df.lazy().with_columns(
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
pl.lit("Public Transport").alias("group"),
)
@ -1122,12 +1134,18 @@ def main():
parser.add_argument(
"--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
)
parser.add_argument(
"--boundary",
type=Path,
required=True,
help="England boundary GeoJSON file",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
args = parser.parse_args()
df = transform(args.input, args.naptan).collect(engine="streaming")
df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
df.write_parquet(args.output)

View file

@ -0,0 +1,33 @@
"""England boundary polygon for accurate point-in-country filtering.
Uses shapely prepared geometry for fast single-point checks (osmium handlers)
and vectorized shapely.contains for batch checks (Polars DataFrames).
"""
import json
from pathlib import Path
import numpy as np
import shapely
from shapely.geometry import shape
from shapely.prepared import PreparedGeometry, prep
def _load_england_geometry(geojson_path: Path):
    """Read the geometry of the first feature in a GeoJSON FeatureCollection.

    Raises:
        ValueError: if the file contains no features.
    """
    # GeoJSON is UTF-8 by specification; do not rely on the locale default
    with open(geojson_path, encoding="utf-8") as f:
        data = json.load(f)
    features = data.get("features") or []
    if not features:
        raise ValueError(f"No features found in boundary file {geojson_path}")
    return shape(features[0]["geometry"])


def load_england_polygon(geojson_path: Path) -> PreparedGeometry:
    """Load England boundary as a prepared shapely polygon for fast contains checks.

    Args:
        geojson_path: GeoJSON file whose first feature is the boundary.

    Returns:
        A prepared geometry suitable for repeated single-point ``contains`` calls.

    Raises:
        ValueError: if the file contains no features.
    """
    return prep(_load_england_geometry(geojson_path))


def in_england_mask(geojson_path: Path, lats: np.ndarray, lngs: np.ndarray) -> np.ndarray:
    """Vectorized check: which (lat, lng) points are within England.

    Args:
        geojson_path: GeoJSON file whose first feature is the boundary.
        lats: latitudes of the points to test.
        lngs: longitudes of the points to test, aligned with ``lats``.

    Returns:
        A boolean numpy array, one entry per input point.

    Raises:
        ValueError: if the file contains no features.
    """
    polygon = _load_england_geometry(geojson_path)
    # Prepare in place so the vectorized predicate gets the same speed-up as
    # the prepared geometry used for single-point checks.
    shapely.prepare(polygon)
    # Points are built as (x, y) = (lng, lat)
    pts = shapely.points(lngs, lats)
    return shapely.contains(polygon, pts)