England only
This commit is contained in:
parent
4d08f5d08d
commit
02712f41e8
8 changed files with 294 additions and 60 deletions
|
|
@ -46,11 +46,12 @@ PC_BOUNDARIES := $(MANUAL_DATA)/postcode_boundaries
|
||||||
TRANSIT_DIR := $(DATA_DIR)/transit
|
TRANSIT_DIR := $(DATA_DIR)/transit
|
||||||
TRANSIT_STAMP := $(TRANSIT_DIR)/.done
|
TRANSIT_STAMP := $(TRANSIT_DIR)/.done
|
||||||
GREENSPACE := $(DATA_DIR)/greenspace_water.parquet
|
GREENSPACE := $(DATA_DIR)/greenspace_water.parquet
|
||||||
PBF := $(DATA_DIR)/great-britain-latest.osm.pbf
|
PBF := $(DATA_DIR)/england-latest.osm.pbf
|
||||||
PLACES := $(DATA_DIR)/places.parquet
|
PLACES := $(DATA_DIR)/places.parquet
|
||||||
LISTINGS_BUY := $(DATA_DIR)/online_listings_buy.parquet
|
LISTINGS_BUY := $(DATA_DIR)/online_listings_buy.parquet
|
||||||
LISTINGS_RENT := $(DATA_DIR)/online_listings_rent.parquet
|
LISTINGS_RENT := $(DATA_DIR)/online_listings_rent.parquet
|
||||||
LSOA_POP := $(DATA_DIR)/lsoa_population.parquet
|
LSOA_POP := $(DATA_DIR)/lsoa_population.parquet
|
||||||
|
ENGLAND_BOUNDARY := $(DATA_DIR)/england_boundary.geojson
|
||||||
RM_OUTCODES := frontend/src/lib/rightmove-outcodes.json
|
RM_OUTCODES := frontend/src/lib/rightmove-outcodes.json
|
||||||
|
|
||||||
# Sentinel files for directory targets (Make doesn't track directories well)
|
# Sentinel files for directory targets (Make doesn't track directories well)
|
||||||
|
|
@ -65,7 +66,7 @@ PMTILES_VERSION := 1.22.3
|
||||||
download-arcgis download-price-paid download-deprivation download-ethnicity \
|
download-arcgis download-price-paid download-deprivation download-ethnicity \
|
||||||
download-naptan download-pois download-ofsted download-broadband download-rental-prices \
|
download-naptan download-pois download-ofsted download-broadband download-rental-prices \
|
||||||
download-postcodes download-geosure download-noise download-inspire \
|
download-postcodes download-geosure download-noise download-inspire \
|
||||||
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places download-lsoa-population download-rightmove-outcodes \
|
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places download-lsoa-population download-england-boundary download-rightmove-outcodes \
|
||||||
transform-pois transform-epc-pp transform-crime transform-poi-proximity \
|
transform-pois transform-epc-pp transform-crime transform-poi-proximity \
|
||||||
transform-school-proximity transform-geosure transform-postcode-boundaries \
|
transform-school-proximity transform-geosure transform-postcode-boundaries \
|
||||||
generate-postcode-boundaries
|
generate-postcode-boundaries
|
||||||
|
|
@ -93,6 +94,7 @@ download-greenspace: $(GREENSPACE)
|
||||||
download-pbf: $(PBF)
|
download-pbf: $(PBF)
|
||||||
download-places: $(PLACES)
|
download-places: $(PLACES)
|
||||||
download-lsoa-population: $(LSOA_POP)
|
download-lsoa-population: $(LSOA_POP)
|
||||||
|
download-england-boundary: $(ENGLAND_BOUNDARY)
|
||||||
download-rightmove-outcodes: $(RM_OUTCODES)
|
download-rightmove-outcodes: $(RM_OUTCODES)
|
||||||
transform-pois: $(POIS_FILTERED)
|
transform-pois: $(POIS_FILTERED)
|
||||||
transform-epc-pp: $(EPC_PP)
|
transform-epc-pp: $(EPC_PP)
|
||||||
|
|
@ -141,11 +143,11 @@ $(NAPTAN):
|
||||||
|
|
||||||
$(PBF):
|
$(PBF):
|
||||||
@mkdir -p $(DATA_DIR)
|
@mkdir -p $(DATA_DIR)
|
||||||
curl -L -o $@.tmp https://download.geofabrik.de/europe/great-britain-latest.osm.pbf
|
curl -L -o $@.tmp https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf
|
||||||
mv $@.tmp $@
|
mv $@.tmp $@
|
||||||
|
|
||||||
$(POIS_RAW): $(PBF)
|
$(POIS_RAW): $(PBF) $(ENGLAND_BOUNDARY)
|
||||||
uv run python -m pipeline.download.pois --output $@ --pbf $(PBF)
|
uv run python -m pipeline.download.pois --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY)
|
||||||
|
|
||||||
$(OFSTED):
|
$(OFSTED):
|
||||||
uv run python -m pipeline.download.ofsted --output $@
|
uv run python -m pipeline.download.ofsted --output $@
|
||||||
|
|
@ -183,19 +185,22 @@ $(RENTAL):
|
||||||
$(GREENSPACE): $(PBF)
|
$(GREENSPACE): $(PBF)
|
||||||
uv run python -m pipeline.download.greenspace_water --output $@ --pbf $(PBF)
|
uv run python -m pipeline.download.greenspace_water --output $@ --pbf $(PBF)
|
||||||
|
|
||||||
$(PLACES): $(PBF)
|
$(PLACES): $(PBF) $(ENGLAND_BOUNDARY)
|
||||||
uv run python -m pipeline.download.places --output $@ --pbf $(PBF)
|
uv run python -m pipeline.download.places --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY)
|
||||||
|
|
||||||
$(LSOA_POP):
|
$(LSOA_POP):
|
||||||
uv run python -m pipeline.download.lsoa_population --output $@
|
uv run python -m pipeline.download.lsoa_population --output $@
|
||||||
|
|
||||||
|
$(ENGLAND_BOUNDARY):
|
||||||
|
uv run python -m pipeline.download.england_boundary --output $@
|
||||||
|
|
||||||
$(RM_OUTCODES): $(MERGE_STAMP)
|
$(RM_OUTCODES): $(MERGE_STAMP)
|
||||||
uv run python -m pipeline.download.rightmove_outcodes --postcodes $(POSTCODES_PQ) --output $@
|
uv run python -m pipeline.download.rightmove_outcodes --postcodes $(POSTCODES_PQ) --output $@
|
||||||
|
|
||||||
# ── Transforms ────────────────────────────────────────────────────────────────
|
# ── Transforms ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN)
|
$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN) $(ENGLAND_BOUNDARY)
|
||||||
uv run python -m pipeline.transform.transform_poi --input $(POIS_RAW) --naptan $(NAPTAN) --output $@
|
uv run python -m pipeline.transform.transform_poi --input $(POIS_RAW) --naptan $(NAPTAN) --boundary $(ENGLAND_BOUNDARY) --output $@
|
||||||
|
|
||||||
$(EPC_PP): $(PRICE_PAID) $(EPC)
|
$(EPC_PP): $(PRICE_PAID) $(EPC)
|
||||||
uv run python -m pipeline.transform.join_epc_pp --epc $(EPC) --price-paid $(PRICE_PAID) --output $@
|
uv run python -m pipeline.transform.join_epc_pp --epc $(EPC) --price-paid $(PRICE_PAID) --output $@
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,42 @@
|
||||||
import { memo, useState, useCallback } from 'react';
|
import { memo, useState, useCallback, useEffect, useRef } from 'react';
|
||||||
import { SpinnerIcon } from '../ui/icons/SpinnerIcon';
|
import { SpinnerIcon } from '../ui/icons/SpinnerIcon';
|
||||||
import { SparklesIcon } from '../ui/icons/SparklesIcon';
|
import { SparklesIcon } from '../ui/icons/SparklesIcon';
|
||||||
import type { AiFilterErrorType } from '../../hooks/useAiFilters';
|
import type { AiFilterErrorType } from '../../hooks/useAiFilters';
|
||||||
|
|
||||||
|
const EXAMPLE_QUERIES = [
|
||||||
|
'Safe area near good schools',
|
||||||
|
'30 min commute to Kings Cross, under 500k',
|
||||||
|
'Quiet village, 3 bed, fast broadband',
|
||||||
|
];
|
||||||
|
|
||||||
|
const LOADING_MESSAGES = [
|
||||||
|
'Analysing your query...',
|
||||||
|
'Searching for destinations...',
|
||||||
|
'Generating filters...',
|
||||||
|
];
|
||||||
|
|
||||||
|
/**
 * Cycle through LOADING_MESSAGES while a request is in flight.
 *
 * The message advances 1.5s and 3.5s after `loading` becomes true, and resets
 * to the first message as soon as `loading` becomes false.
 *
 * @param loading Whether a request is currently in progress.
 * @returns The message to display for the current loading phase.
 */
function useLoadingMessage(loading: boolean): string {
  const [index, setIndex] = useState(0);
  // Handles of all pending timers. The explicit initial value keeps this call
  // valid for typings that require an argument to useRef.
  const timersRef = useRef<ReturnType<typeof setTimeout>[]>([]);

  useEffect(() => {
    if (!loading) {
      setIndex(0);
      return;
    }
    // Milliseconds after loading starts at which each subsequent message appears
    const delays = [1500, 3500];
    timersRef.current = delays.map((delay, i) =>
      setTimeout(() => setIndex(i + 1), delay)
    );
    return () => {
      // Cancel anything still pending so a finished/restarted request never
      // receives a stale message update.
      timersRef.current.forEach((timer) => clearTimeout(timer));
      timersRef.current = [];
    };
  }, [loading]);

  return LOADING_MESSAGES[index];
}
|
||||||
|
|
||||||
interface AiFilterInputProps {
|
interface AiFilterInputProps {
|
||||||
loading: boolean;
|
loading: boolean;
|
||||||
error: string | null;
|
error: string | null;
|
||||||
|
|
@ -23,6 +57,8 @@ export default memo(function AiFilterInput({
|
||||||
onLoginRequired,
|
onLoginRequired,
|
||||||
}: AiFilterInputProps) {
|
}: AiFilterInputProps) {
|
||||||
const [query, setQuery] = useState('');
|
const [query, setQuery] = useState('');
|
||||||
|
const [expanded, setExpanded] = useState(false);
|
||||||
|
const loadingMessage = useLoadingMessage(loading);
|
||||||
|
|
||||||
const handleSubmit = useCallback(
|
const handleSubmit = useCallback(
|
||||||
(e: React.FormEvent) => {
|
(e: React.FormEvent) => {
|
||||||
|
|
@ -38,36 +74,90 @@ export default memo(function AiFilterInput({
|
||||||
[query, loading, isLoggedIn, onLoginRequired, onSubmit]
|
[query, loading, isLoggedIn, onLoginRequired, onSubmit]
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const handleExampleClick = useCallback(
|
||||||
|
(example: string) => {
|
||||||
|
if (loading) return;
|
||||||
|
setQuery(example);
|
||||||
|
if (!isLoggedIn) {
|
||||||
|
onLoginRequired();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
onSubmit(example);
|
||||||
|
},
|
||||||
|
[loading, isLoggedIn, onLoginRequired, onSubmit]
|
||||||
|
);
|
||||||
|
|
||||||
const hasContent = query.trim().length > 0;
|
const hasContent = query.trim().length > 0;
|
||||||
|
const showExamples = expanded && !hasContent && !loading && !error && !notes;
|
||||||
|
|
||||||
|
if (!expanded) {
|
||||||
|
return (
|
||||||
|
<div className="px-3 py-2" data-tutorial="ai-filters">
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
onClick={() => setExpanded(true)}
|
||||||
|
className="w-full flex items-center gap-2 px-3 py-2 rounded-lg border border-dashed border-teal-300 dark:border-teal-700 bg-teal-50/50 dark:bg-teal-900/20 hover:bg-teal-50 dark:hover:bg-teal-900/30 cursor-pointer group"
|
||||||
|
>
|
||||||
|
<SparklesIcon className="w-4 h-4 text-teal-500 dark:text-teal-400 shrink-0" />
|
||||||
|
<span className="text-sm text-teal-700 dark:text-teal-300 group-hover:text-teal-800 dark:group-hover:text-teal-200">
|
||||||
|
Describe your ideal area with AI
|
||||||
|
</span>
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="px-3 py-2" data-tutorial="ai-filters">
|
<div className="px-3 py-2" data-tutorial="ai-filters">
|
||||||
|
<div className="flex items-center gap-1.5 mb-1.5">
|
||||||
|
<SparklesIcon className="w-3.5 h-3.5 text-teal-500 dark:text-teal-400 shrink-0" />
|
||||||
|
<span className="text-xs font-medium text-teal-700 dark:text-teal-300">AI Search</span>
|
||||||
|
<span className="text-xs text-warm-400 dark:text-warm-500">— describe what you're looking for</span>
|
||||||
|
</div>
|
||||||
<form onSubmit={handleSubmit} className="flex items-center gap-1.5">
|
<form onSubmit={handleSubmit} className="flex items-center gap-1.5">
|
||||||
<div className="relative flex-1">
|
<input
|
||||||
<SparklesIcon className="absolute left-2 top-1/2 -translate-y-1/2 w-3.5 h-3.5 text-teal-500 dark:text-teal-400 pointer-events-none" />
|
type="text"
|
||||||
<input
|
value={query}
|
||||||
type="text"
|
onChange={(e) => setQuery(e.target.value)}
|
||||||
value={query}
|
placeholder="e.g. quiet area, under 400k, near good schools..."
|
||||||
onChange={(e) => setQuery(e.target.value)}
|
className="flex-1 px-2.5 py-1.5 text-sm rounded-lg border border-warm-200 dark:border-warm-700 bg-warm-50 dark:bg-warm-800 text-warm-700 dark:text-warm-200 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-2 focus:ring-teal-400 focus:bg-white dark:focus:bg-warm-800"
|
||||||
placeholder="Describe your ideal area..."
|
disabled={loading}
|
||||||
className="w-full pl-7 pr-2.5 py-1.5 text-sm rounded-lg border border-warm-200 dark:border-warm-700 bg-warm-50 dark:bg-warm-800 text-warm-700 dark:text-warm-200 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-2 focus:ring-teal-400 focus:bg-white dark:focus:bg-warm-800"
|
autoFocus
|
||||||
disabled={loading}
|
/>
|
||||||
/>
|
<button
|
||||||
</div>
|
type="submit"
|
||||||
{(hasContent || loading) && (
|
disabled={loading || !hasContent}
|
||||||
<button
|
className="shrink-0 px-3 py-1.5 rounded-lg bg-teal-600 hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed text-white text-sm font-medium flex items-center gap-1.5"
|
||||||
type="submit"
|
>
|
||||||
disabled={loading || !hasContent}
|
{loading ? (
|
||||||
className="shrink-0 px-2.5 py-1.5 rounded-lg bg-teal-600 hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed text-white text-sm font-medium flex items-center justify-center"
|
<SpinnerIcon className="w-3.5 h-3.5 animate-spin" />
|
||||||
>
|
) : (
|
||||||
{loading ? (
|
<>
|
||||||
<SpinnerIcon className="w-4 h-4 animate-spin" />
|
<SparklesIcon className="w-3.5 h-3.5" />
|
||||||
) : (
|
<span>Search</span>
|
||||||
<SparklesIcon className="w-4 h-4" />
|
</>
|
||||||
)}
|
)}
|
||||||
</button>
|
</button>
|
||||||
)}
|
|
||||||
</form>
|
</form>
|
||||||
|
{loading && (
|
||||||
|
<p className="mt-1 text-xs text-teal-600 dark:text-teal-400">
|
||||||
|
{loadingMessage}
|
||||||
|
</p>
|
||||||
|
)}
|
||||||
|
{showExamples && (
|
||||||
|
<div className="mt-1.5 flex flex-wrap gap-1">
|
||||||
|
{EXAMPLE_QUERIES.map((example) => (
|
||||||
|
<button
|
||||||
|
key={example}
|
||||||
|
type="button"
|
||||||
|
onClick={() => handleExampleClick(example)}
|
||||||
|
className="text-xs px-2 py-0.5 rounded-full border border-warm-200 dark:border-warm-700 text-warm-500 dark:text-warm-400 hover:border-teal-400 hover:text-teal-600 dark:hover:text-teal-400 cursor-pointer"
|
||||||
|
>
|
||||||
|
{example}
|
||||||
|
</button>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
{error && errorType === 'verification' && (
|
{error && errorType === 'verification' && (
|
||||||
<p className="mt-1.5 text-xs text-amber-600 dark:text-amber-400">
|
<p className="mt-1.5 text-xs text-amber-600 dark:text-amber-400">
|
||||||
Please verify your email address to use AI-powered search. Check your inbox for a verification link.
|
Please verify your email address to use AI-powered search. Check your inbox for a verification link.
|
||||||
|
|
@ -83,7 +173,7 @@ export default memo(function AiFilterInput({
|
||||||
{error}
|
{error}
|
||||||
</p>
|
</p>
|
||||||
)}
|
)}
|
||||||
{notes && !error && (
|
{notes && !error && !loading && (
|
||||||
<p className="mt-1 text-xs text-warm-500 dark:text-warm-400 italic">
|
<p className="mt-1 text-xs text-warm-500 dark:text-warm-400 italic">
|
||||||
{notes}
|
{notes}
|
||||||
</p>
|
</p>
|
||||||
|
|
|
||||||
45
pipeline/download/england_boundary.py
Normal file
45
pipeline/download/england_boundary.py
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
"""Download England country boundary GeoJSON from ONS Open Geography Portal.
|
||||||
|
|
||||||
|
Source: ONS Countries (December 2024) Boundaries UK BGC (Generalised Clipped)
|
||||||
|
Licence: OGL v3
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
# ArcGIS REST API — query for England only, generalised (BGC) resolution
|
||||||
|
URL = (
|
||||||
|
"https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
|
||||||
|
"Countries_December_2024_Boundaries_UK_BGC/FeatureServer/0/query"
|
||||||
|
"?where=CTRY24NM%3D%27England%27&outFields=CTRY24NM&f=geojson"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Download the England boundary GeoJSON and save it to ``--output``.

    The response is validated before anything is written: it must contain
    exactly one feature. The file is written to a temporary sibling and then
    renamed, so an interrupted run never leaves a truncated file at the
    output path (which Make would otherwise treat as up to date).

    Raises:
        httpx.HTTPStatusError: If the server responds with an error status.
        ValueError: If the response carries an error payload or does not
            contain exactly one feature.
    """
    parser = argparse.ArgumentParser(
        description="Download England country boundary GeoJSON"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output GeoJSON file path"
    )
    args = parser.parse_args()
    args.output.parent.mkdir(parents=True, exist_ok=True)

    print("Downloading England boundary from ONS...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()

    data = response.json()
    # A body carrying an "error" object is a failed query even when the HTTP
    # status was OK; surface its details instead of a bare feature count.
    if "error" in data:
        raise ValueError(f"ONS boundary query failed: {data['error']}")
    features = data.get("features", [])
    if len(features) != 1:
        raise ValueError(f"Expected 1 feature for England, got {len(features)}")

    # GeoJSON is UTF-8; write it explicitly rather than with the locale default,
    # and publish the file atomically via rename.
    tmp_path = args.output.with_name(args.output.name + ".tmp")
    tmp_path.write_text(response.text, encoding="utf-8")
    tmp_path.replace(args.output)

    size_kb = args.output.stat().st_size / 1024
    print(f"Saved to {args.output} ({size_kb:.0f} KB)")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -7,6 +7,7 @@ from pathlib import Path
|
||||||
from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
|
from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
|
||||||
|
|
||||||
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
|
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
|
||||||
|
SPRITES_BASE = "https://protomaps.github.io/basemaps-assets/sprites/v4"
|
||||||
TWEMOJI_BASE = "https://cdn.jsdelivr.net/gh/twitter/twemoji@14.0.2/assets/72x72"
|
TWEMOJI_BASE = "https://cdn.jsdelivr.net/gh/twitter/twemoji@14.0.2/assets/72x72"
|
||||||
|
|
||||||
# Font stacks used by @protomaps/basemaps with lang='en'
|
# Font stacks used by @protomaps/basemaps with lang='en'
|
||||||
|
|
@ -77,6 +78,15 @@ def main():
|
||||||
url = f"{GLYPHS_BASE}/{font_encoded}/{name}"
|
url = f"{GLYPHS_BASE}/{font_encoded}/{name}"
|
||||||
tasks.append((url, font_dir / name))
|
tasks.append((url, font_dir / name))
|
||||||
|
|
||||||
|
# Sprite sheets (light/dark, 1x and 2x)
|
||||||
|
sprites_dir = out / "sprites"
|
||||||
|
for theme in ("light", "dark"):
|
||||||
|
for suffix in ("json", "png"):
|
||||||
|
url = f"{SPRITES_BASE}/{theme}.{suffix}"
|
||||||
|
tasks.append((url, sprites_dir / f"{theme}.{suffix}"))
|
||||||
|
url_2x = f"{SPRITES_BASE}/{theme}@2x.{suffix}"
|
||||||
|
tasks.append((url_2x, sprites_dir / f"{theme}@2x.{suffix}"))
|
||||||
|
|
||||||
# Twemoji PNGs
|
# Twemoji PNGs
|
||||||
twemoji_dir = out / "twemoji"
|
twemoji_dir = out / "twemoji"
|
||||||
for code in twemoji_codes:
|
for code in twemoji_codes:
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
Extracts named place nodes (cities, towns, suburbs, etc.) and railway stations
|
Extracts named place nodes (cities, towns, suburbs, etc.) and railway stations
|
||||||
(tube, national rail, DLR, etc.) for typeahead search.
|
(tube, national rail, DLR, etc.) for typeahead search.
|
||||||
Reuses the same great-britain-latest.osm.pbf as pois.py.
|
Reuses the same england-latest.osm.pbf as pois.py.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|
@ -10,9 +10,16 @@ from pathlib import Path
|
||||||
|
|
||||||
import osmium
|
import osmium
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
from shapely.geometry import Point
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST
|
from pipeline.download.pois import (
|
||||||
|
ENGLAND_BBOX_EAST,
|
||||||
|
ENGLAND_BBOX_NORTH,
|
||||||
|
ENGLAND_BBOX_SOUTH,
|
||||||
|
ENGLAND_BBOX_WEST,
|
||||||
|
)
|
||||||
|
from pipeline.utils.england_geometry import load_england_polygon
|
||||||
|
|
||||||
PLACE_TYPES = {"city"}
|
PLACE_TYPES = {"city"}
|
||||||
|
|
||||||
|
|
@ -57,10 +64,11 @@ def _station_display_name(name: str, tags: dict[str, str]) -> str:
|
||||||
|
|
||||||
|
|
||||||
class PlaceHandler(osmium.SimpleHandler):
|
class PlaceHandler(osmium.SimpleHandler):
|
||||||
def __init__(self, progress: tqdm) -> None:
|
def __init__(self, progress: tqdm, england_polygon) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._progress = progress
|
self._progress = progress
|
||||||
self.places: list[dict] = []
|
self.places: list[dict] = []
|
||||||
|
self._england = england_polygon
|
||||||
|
|
||||||
def _add(
|
def _add(
|
||||||
self, name: str, place_type: str, lat: float, lon: float, population: int
|
self, name: str, place_type: str, lat: float, lon: float, population: int
|
||||||
|
|
@ -82,10 +90,12 @@ class PlaceHandler(osmium.SimpleHandler):
|
||||||
return
|
return
|
||||||
lat, lon = n.location.lat, n.location.lon
|
lat, lon = n.location.lat, n.location.lon
|
||||||
if not (
|
if not (
|
||||||
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
|
ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
|
||||||
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
|
and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
|
||||||
):
|
):
|
||||||
return
|
return
|
||||||
|
if not self._england.contains(Point(lon, lat)):
|
||||||
|
return
|
||||||
|
|
||||||
name = n.tags.get("name:en", n.tags.get("name", ""))
|
name = n.tags.get("name:en", n.tags.get("name", ""))
|
||||||
if not name:
|
if not name:
|
||||||
|
|
@ -124,9 +134,17 @@ def main() -> None:
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--boundary",
|
||||||
|
type=Path,
|
||||||
|
required=True,
|
||||||
|
help="England boundary GeoJSON file",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
pbf_file = args.pbf
|
pbf_file = args.pbf
|
||||||
|
england_polygon = load_england_polygon(args.boundary)
|
||||||
|
|
||||||
print("Extracting place nodes: cities + railway stations")
|
print("Extracting place nodes: cities + railway stations")
|
||||||
with tqdm(
|
with tqdm(
|
||||||
unit=" elements",
|
unit=" elements",
|
||||||
|
|
@ -135,7 +153,7 @@ def main() -> None:
|
||||||
smoothing=0.05,
|
smoothing=0.05,
|
||||||
mininterval=1.0,
|
mininterval=1.0,
|
||||||
) as progress:
|
) as progress:
|
||||||
handler = PlaceHandler(progress)
|
handler = PlaceHandler(progress, england_polygon)
|
||||||
handler.apply_file(str(pbf_file), locations=True)
|
handler.apply_file(str(pbf_file), locations=True)
|
||||||
|
|
||||||
print(f"Extracted {len(handler.places):,} place nodes")
|
print(f"Extracted {len(handler.places):,} place nodes")
|
||||||
|
|
|
||||||
|
|
@ -4,17 +4,20 @@ from tempfile import mkdtemp
|
||||||
|
|
||||||
import osmium
|
import osmium
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
from shapely.geometry import Point
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from pipeline.utils.england_geometry import load_england_polygon
|
||||||
|
|
||||||
BATCH_SIZE = 50_000
|
BATCH_SIZE = 50_000
|
||||||
|
|
||||||
MIN_OCCURENCE_COUNT = 20
|
MIN_OCCURENCE_COUNT = 20
|
||||||
|
|
||||||
UK_BBOX_WEST = -7.57
|
# Bounding box for fast pre-filtering before the precise polygon check
|
||||||
UK_BBOX_SOUTH = 49.96
|
ENGLAND_BBOX_WEST = -6.45
|
||||||
UK_BBOX_EAST = 1.68
|
ENGLAND_BBOX_SOUTH = 49.85
|
||||||
UK_BBOX_NORTH = 58.64
|
ENGLAND_BBOX_EAST = 1.77
|
||||||
|
ENGLAND_BBOX_NORTH = 55.82
|
||||||
|
|
||||||
POI_TAG_KEYS: list[str] = [
|
POI_TAG_KEYS: list[str] = [
|
||||||
"amenity",
|
"amenity",
|
||||||
|
|
@ -31,19 +34,23 @@ POI_TAG_KEYS: list[str] = [
|
||||||
|
|
||||||
|
|
||||||
class POIHandler(osmium.SimpleHandler):
|
class POIHandler(osmium.SimpleHandler):
|
||||||
def __init__(self, progress: tqdm, tmp_dir: Path) -> None:
|
def __init__(self, progress: tqdm, tmp_dir: Path, england_polygon) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._batch: list[dict] = []
|
self._batch: list[dict] = []
|
||||||
self._tmp_dir = tmp_dir
|
self._tmp_dir = tmp_dir
|
||||||
self._batch_num = 0
|
self._batch_num = 0
|
||||||
self.poi_count = 0
|
self.poi_count = 0
|
||||||
self._progress = progress
|
self._progress = progress
|
||||||
|
self._england = england_polygon
|
||||||
|
|
||||||
def _in_uk(self, lat: float, lon: float) -> bool:
|
def _in_england(self, lat: float, lon: float) -> bool:
|
||||||
return (
|
# Fast bbox pre-filter, then precise polygon check
|
||||||
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
|
if not (
|
||||||
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
|
ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
|
||||||
)
|
and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
|
||||||
|
):
|
||||||
|
return False
|
||||||
|
return self._england.contains(Point(lon, lat))
|
||||||
|
|
||||||
def _match_tags(self, tags: osmium.osm.TagList) -> list[str]:
|
def _match_tags(self, tags: osmium.osm.TagList) -> list[str]:
|
||||||
return [f"{key}/{tags[key]}" for key in POI_TAG_KEYS if key in tags]
|
return [f"{key}/{tags[key]}" for key in POI_TAG_KEYS if key in tags]
|
||||||
|
|
@ -90,7 +97,7 @@ class POIHandler(osmium.SimpleHandler):
|
||||||
if not n.location.valid:
|
if not n.location.valid:
|
||||||
return
|
return
|
||||||
lat, lon = n.location.lat, n.location.lon
|
lat, lon = n.location.lat, n.location.lon
|
||||||
if not self._in_uk(lat, lon):
|
if not self._in_england(lat, lon):
|
||||||
return
|
return
|
||||||
categories = self._match_tags(n.tags)
|
categories = self._match_tags(n.tags)
|
||||||
for category in categories:
|
for category in categories:
|
||||||
|
|
@ -107,11 +114,19 @@ def main() -> None:
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--boundary",
|
||||||
|
type=Path,
|
||||||
|
required=True,
|
||||||
|
help="England boundary GeoJSON file",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
pbf_file = args.pbf
|
pbf_file = args.pbf
|
||||||
print(f"Tag keys: {POI_TAG_KEYS}")
|
print(f"Tag keys: {POI_TAG_KEYS}")
|
||||||
|
|
||||||
|
england_polygon = load_england_polygon(args.boundary)
|
||||||
|
|
||||||
tmp_dir = Path(mkdtemp(prefix="pois_"))
|
tmp_dir = Path(mkdtemp(prefix="pois_"))
|
||||||
with tqdm(
|
with tqdm(
|
||||||
unit=" elements",
|
unit=" elements",
|
||||||
|
|
@ -120,7 +135,7 @@ def main() -> None:
|
||||||
smoothing=0.05,
|
smoothing=0.05,
|
||||||
mininterval=1.0,
|
mininterval=1.0,
|
||||||
) as progress:
|
) as progress:
|
||||||
handler = POIHandler(progress, tmp_dir)
|
handler = POIHandler(progress, tmp_dir, england_polygon)
|
||||||
handler.apply_file(str(pbf_file), locations=True)
|
handler.apply_file(str(pbf_file), locations=True)
|
||||||
handler._flush_batch() # write any remaining POIs
|
handler._flush_batch() # write any remaining POIs
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@ from pathlib import Path
|
||||||
|
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
|
from pipeline.utils.england_geometry import in_england_mask
|
||||||
|
|
||||||
|
|
||||||
DROP_CATEGORIES = {
|
DROP_CATEGORIES = {
|
||||||
# Street furniture & infrastructure
|
# Street furniture & infrastructure
|
||||||
|
|
@ -1056,7 +1058,11 @@ NAPTAN_EMOJIS: dict[str, str] = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame:
|
def transform(
|
||||||
|
input_path: Path,
|
||||||
|
naptan_path: Path | None = None,
|
||||||
|
boundary_path: Path | None = None,
|
||||||
|
) -> pl.LazyFrame:
|
||||||
lf = pl.scan_parquet(input_path)
|
lf = pl.scan_parquet(input_path)
|
||||||
|
|
||||||
# Get all unique categories present in the data
|
# Get all unique categories present in the data
|
||||||
|
|
@ -1072,16 +1078,14 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
|
||||||
if unmapped:
|
if unmapped:
|
||||||
raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")
|
raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")
|
||||||
|
|
||||||
# Verify every CATEGORY_MAP key actually exists in the data (catch typos)
|
# Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts)
|
||||||
mapped_but_absent = []
|
mapped_but_absent = []
|
||||||
all_set = set(all_categories)
|
all_set = set(all_categories)
|
||||||
for cat in CATEGORY_MAP:
|
for cat in CATEGORY_MAP:
|
||||||
if cat not in all_set:
|
if cat not in all_set:
|
||||||
mapped_but_absent.append(cat)
|
mapped_but_absent.append(cat)
|
||||||
if mapped_but_absent:
|
if mapped_but_absent:
|
||||||
raise ValueError(
|
print(f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}")
|
||||||
f"CATEGORY_MAP contains categories not in data: {sorted(mapped_but_absent)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Drop unwanted categories
|
# Drop unwanted categories
|
||||||
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
|
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
|
||||||
|
|
@ -1105,7 +1109,15 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
|
||||||
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
|
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
|
||||||
)
|
)
|
||||||
|
|
||||||
naptan = pl.scan_parquet(naptan_path).with_columns(
|
naptan_df = pl.scan_parquet(naptan_path).collect()
|
||||||
|
if boundary_path is not None:
|
||||||
|
mask = in_england_mask(
|
||||||
|
boundary_path,
|
||||||
|
naptan_df["lat"].to_numpy(),
|
||||||
|
naptan_df["lng"].to_numpy(),
|
||||||
|
)
|
||||||
|
naptan_df = naptan_df.filter(pl.Series(mask))
|
||||||
|
naptan = naptan_df.lazy().with_columns(
|
||||||
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
|
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
|
||||||
pl.lit("Public Transport").alias("group"),
|
pl.lit("Public Transport").alias("group"),
|
||||||
)
|
)
|
||||||
|
|
@ -1122,12 +1134,18 @@ def main():
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
|
"--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--boundary",
|
||||||
|
type=Path,
|
||||||
|
required=True,
|
||||||
|
help="England boundary GeoJSON file",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
|
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
df = transform(args.input, args.naptan).collect(engine="streaming")
|
df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
|
||||||
|
|
||||||
df.write_parquet(args.output)
|
df.write_parquet(args.output)
|
||||||
|
|
||||||
|
|
|
||||||
33
pipeline/utils/england_geometry.py
Normal file
33
pipeline/utils/england_geometry.py
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
"""England boundary polygon for accurate point-in-country filtering.
|
||||||
|
|
||||||
|
Uses shapely prepared geometry for fast single-point checks (osmium handlers)
|
||||||
|
and vectorized shapely.contains for batch checks (Polars DataFrames).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import shapely
|
||||||
|
from shapely.geometry import shape
|
||||||
|
from shapely.prepared import PreparedGeometry, prep
|
||||||
|
|
||||||
|
|
||||||
|
def load_england_polygon(geojson_path: Path) -> PreparedGeometry:
    """Load England boundary as a prepared shapely polygon for fast contains checks.

    Args:
        geojson_path: Path to a GeoJSON FeatureCollection. The geometry of its
            first feature is used as the boundary.

    Returns:
        The boundary wrapped with ``shapely.prepared.prep`` for repeated
        ``contains`` calls.

    Raises:
        ValueError: If the file contains no features.
    """
    # GeoJSON is UTF-8 by specification; don't depend on the platform default.
    with open(geojson_path, encoding="utf-8") as f:
        data = json.load(f)
    features = data.get("features") or []
    if not features:
        raise ValueError(f"No features found in boundary file {geojson_path}")
    geometry = shape(features[0]["geometry"])
    return prep(geometry)
|
||||||
|
|
||||||
|
|
||||||
|
def in_england_mask(geojson_path: Path, lats: np.ndarray, lngs: np.ndarray) -> np.ndarray:
    """Vectorized check: which (lat, lng) points are within England.

    Args:
        geojson_path: Path to a GeoJSON FeatureCollection. The geometry of its
            first feature is used as the boundary.
        lats: Latitudes of the points to test.
        lngs: Longitudes of the points to test, aligned with ``lats``.

    Returns:
        A boolean numpy array, one entry per input point.

    Raises:
        ValueError: If the file contains no features.
    """
    # GeoJSON is UTF-8 by specification; don't depend on the platform default.
    with open(geojson_path, encoding="utf-8") as f:
        data = json.load(f)
    features = data.get("features") or []
    if not features:
        raise ValueError(f"No features found in boundary file {geojson_path}")
    polygon = shape(features[0]["geometry"])
    # Prepare the boundary once (in place) so the vectorized predicate below
    # reuses it for every point instead of testing the raw geometry each time.
    shapely.prepare(polygon)
    # Points are built as (x, y) = (lng, lat) to match GeoJSON coordinate order.
    pts = shapely.points(lngs, lats)
    return shapely.contains(polygon, pts)
|
||||||
Loading…
Add table
Add a link
Reference in a new issue