England only
This commit is contained in:
parent
4d08f5d08d
commit
02712f41e8
8 changed files with 294 additions and 60 deletions
|
|
@ -46,11 +46,12 @@ PC_BOUNDARIES := $(MANUAL_DATA)/postcode_boundaries
|
|||
TRANSIT_DIR := $(DATA_DIR)/transit
|
||||
TRANSIT_STAMP := $(TRANSIT_DIR)/.done
|
||||
GREENSPACE := $(DATA_DIR)/greenspace_water.parquet
|
||||
PBF := $(DATA_DIR)/great-britain-latest.osm.pbf
|
||||
PBF := $(DATA_DIR)/england-latest.osm.pbf
|
||||
PLACES := $(DATA_DIR)/places.parquet
|
||||
LISTINGS_BUY := $(DATA_DIR)/online_listings_buy.parquet
|
||||
LISTINGS_RENT := $(DATA_DIR)/online_listings_rent.parquet
|
||||
LSOA_POP := $(DATA_DIR)/lsoa_population.parquet
|
||||
ENGLAND_BOUNDARY := $(DATA_DIR)/england_boundary.geojson
|
||||
RM_OUTCODES := frontend/src/lib/rightmove-outcodes.json
|
||||
|
||||
# Sentinel files for directory targets (Make doesn't track directories well)
|
||||
|
|
@ -65,7 +66,7 @@ PMTILES_VERSION := 1.22.3
|
|||
download-arcgis download-price-paid download-deprivation download-ethnicity \
|
||||
download-naptan download-pois download-ofsted download-broadband download-rental-prices \
|
||||
download-postcodes download-geosure download-noise download-inspire \
|
||||
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places download-lsoa-population download-rightmove-outcodes \
|
||||
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places download-lsoa-population download-england-boundary download-rightmove-outcodes \
|
||||
transform-pois transform-epc-pp transform-crime transform-poi-proximity \
|
||||
transform-school-proximity transform-geosure transform-postcode-boundaries \
|
||||
generate-postcode-boundaries
|
||||
|
|
@ -93,6 +94,7 @@ download-greenspace: $(GREENSPACE)
|
|||
download-pbf: $(PBF)
|
||||
download-places: $(PLACES)
|
||||
download-lsoa-population: $(LSOA_POP)
|
||||
download-england-boundary: $(ENGLAND_BOUNDARY)
|
||||
download-rightmove-outcodes: $(RM_OUTCODES)
|
||||
transform-pois: $(POIS_FILTERED)
|
||||
transform-epc-pp: $(EPC_PP)
|
||||
|
|
@ -141,11 +143,11 @@ $(NAPTAN):
|
|||
|
||||
$(PBF):
|
||||
@mkdir -p $(DATA_DIR)
|
||||
curl -L -o $@.tmp https://download.geofabrik.de/europe/great-britain-latest.osm.pbf
|
||||
curl -L -o $@.tmp https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf
|
||||
mv $@.tmp $@
|
||||
|
||||
$(POIS_RAW): $(PBF)
|
||||
uv run python -m pipeline.download.pois --output $@ --pbf $(PBF)
|
||||
$(POIS_RAW): $(PBF) $(ENGLAND_BOUNDARY)
|
||||
uv run python -m pipeline.download.pois --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY)
|
||||
|
||||
$(OFSTED):
|
||||
uv run python -m pipeline.download.ofsted --output $@
|
||||
|
|
@ -183,19 +185,22 @@ $(RENTAL):
|
|||
$(GREENSPACE): $(PBF)
|
||||
uv run python -m pipeline.download.greenspace_water --output $@ --pbf $(PBF)
|
||||
|
||||
$(PLACES): $(PBF)
|
||||
uv run python -m pipeline.download.places --output $@ --pbf $(PBF)
|
||||
$(PLACES): $(PBF) $(ENGLAND_BOUNDARY)
|
||||
uv run python -m pipeline.download.places --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY)
|
||||
|
||||
$(LSOA_POP):
|
||||
uv run python -m pipeline.download.lsoa_population --output $@
|
||||
|
||||
$(ENGLAND_BOUNDARY):
|
||||
uv run python -m pipeline.download.england_boundary --output $@
|
||||
|
||||
$(RM_OUTCODES): $(MERGE_STAMP)
|
||||
uv run python -m pipeline.download.rightmove_outcodes --postcodes $(POSTCODES_PQ) --output $@
|
||||
|
||||
# ── Transforms ────────────────────────────────────────────────────────────────
|
||||
|
||||
$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN)
|
||||
uv run python -m pipeline.transform.transform_poi --input $(POIS_RAW) --naptan $(NAPTAN) --output $@
|
||||
$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN) $(ENGLAND_BOUNDARY)
|
||||
uv run python -m pipeline.transform.transform_poi --input $(POIS_RAW) --naptan $(NAPTAN) --boundary $(ENGLAND_BOUNDARY) --output $@
|
||||
|
||||
$(EPC_PP): $(PRICE_PAID) $(EPC)
|
||||
uv run python -m pipeline.transform.join_epc_pp --epc $(EPC) --price-paid $(PRICE_PAID) --output $@
|
||||
|
|
|
|||
|
|
@ -1,8 +1,42 @@
|
|||
import { memo, useState, useCallback } from 'react';
|
||||
import { memo, useState, useCallback, useEffect, useRef } from 'react';
|
||||
import { SpinnerIcon } from '../ui/icons/SpinnerIcon';
|
||||
import { SparklesIcon } from '../ui/icons/SparklesIcon';
|
||||
import type { AiFilterErrorType } from '../../hooks/useAiFilters';
|
||||
|
||||
const EXAMPLE_QUERIES = [
|
||||
'Safe area near good schools',
|
||||
'30 min commute to Kings Cross, under 500k',
|
||||
'Quiet village, 3 bed, fast broadband',
|
||||
];
|
||||
|
||||
const LOADING_MESSAGES = [
|
||||
'Analysing your query...',
|
||||
'Searching for destinations...',
|
||||
'Generating filters...',
|
||||
];
|
||||
|
||||
/**
 * Step through LOADING_MESSAGES while `loading` is true.
 *
 * The first message is shown immediately, the second after 1.5s and the third
 * after 3.5s (both measured from the moment loading starts); the hook then
 * stays on the last message. Whenever `loading` is false the index is reset
 * so the next request starts from the first message again.
 */
function useLoadingMessage(loading: boolean): string {
  const [index, setIndex] = useState(0);
  // Handles of the step timers scheduled by the current effect run.
  // An explicit initial value is passed because newer React typings require one.
  const timersRef = useRef<ReturnType<typeof setTimeout>[]>([]);

  useEffect(() => {
    if (!loading) {
      setIndex(0);
      return;
    }
    timersRef.current = [
      setTimeout(() => setIndex(1), 1500),
      setTimeout(() => setIndex(2), 3500),
    ];
    return () => {
      // Cancel steps that have not fired yet (loading ended early or unmount).
      timersRef.current.forEach((timer) => clearTimeout(timer));
      timersRef.current = [];
    };
  }, [loading]);

  return LOADING_MESSAGES[index];
}
|
||||
|
||||
interface AiFilterInputProps {
|
||||
loading: boolean;
|
||||
error: string | null;
|
||||
|
|
@ -23,6 +57,8 @@ export default memo(function AiFilterInput({
|
|||
onLoginRequired,
|
||||
}: AiFilterInputProps) {
|
||||
const [query, setQuery] = useState('');
|
||||
const [expanded, setExpanded] = useState(false);
|
||||
const loadingMessage = useLoadingMessage(loading);
|
||||
|
||||
const handleSubmit = useCallback(
|
||||
(e: React.FormEvent) => {
|
||||
|
|
@ -38,36 +74,90 @@ export default memo(function AiFilterInput({
|
|||
[query, loading, isLoggedIn, onLoginRequired, onSubmit]
|
||||
);
|
||||
|
||||
const handleExampleClick = useCallback(
|
||||
(example: string) => {
|
||||
if (loading) return;
|
||||
setQuery(example);
|
||||
if (!isLoggedIn) {
|
||||
onLoginRequired();
|
||||
return;
|
||||
}
|
||||
onSubmit(example);
|
||||
},
|
||||
[loading, isLoggedIn, onLoginRequired, onSubmit]
|
||||
);
|
||||
|
||||
const hasContent = query.trim().length > 0;
|
||||
const showExamples = expanded && !hasContent && !loading && !error && !notes;
|
||||
|
||||
if (!expanded) {
|
||||
return (
|
||||
<div className="px-3 py-2" data-tutorial="ai-filters">
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => setExpanded(true)}
|
||||
className="w-full flex items-center gap-2 px-3 py-2 rounded-lg border border-dashed border-teal-300 dark:border-teal-700 bg-teal-50/50 dark:bg-teal-900/20 hover:bg-teal-50 dark:hover:bg-teal-900/30 cursor-pointer group"
|
||||
>
|
||||
<SparklesIcon className="w-4 h-4 text-teal-500 dark:text-teal-400 shrink-0" />
|
||||
<span className="text-sm text-teal-700 dark:text-teal-300 group-hover:text-teal-800 dark:group-hover:text-teal-200">
|
||||
Describe your ideal area with AI
|
||||
</span>
|
||||
</button>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="px-3 py-2" data-tutorial="ai-filters">
|
||||
<div className="flex items-center gap-1.5 mb-1.5">
|
||||
<SparklesIcon className="w-3.5 h-3.5 text-teal-500 dark:text-teal-400 shrink-0" />
|
||||
<span className="text-xs font-medium text-teal-700 dark:text-teal-300">AI Search</span>
|
||||
<span className="text-xs text-warm-400 dark:text-warm-500">— describe what you're looking for</span>
|
||||
</div>
|
||||
<form onSubmit={handleSubmit} className="flex items-center gap-1.5">
|
||||
<div className="relative flex-1">
|
||||
<SparklesIcon className="absolute left-2 top-1/2 -translate-y-1/2 w-3.5 h-3.5 text-teal-500 dark:text-teal-400 pointer-events-none" />
|
||||
<input
|
||||
type="text"
|
||||
value={query}
|
||||
onChange={(e) => setQuery(e.target.value)}
|
||||
placeholder="Describe your ideal area..."
|
||||
className="w-full pl-7 pr-2.5 py-1.5 text-sm rounded-lg border border-warm-200 dark:border-warm-700 bg-warm-50 dark:bg-warm-800 text-warm-700 dark:text-warm-200 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-2 focus:ring-teal-400 focus:bg-white dark:focus:bg-warm-800"
|
||||
disabled={loading}
|
||||
/>
|
||||
</div>
|
||||
{(hasContent || loading) && (
|
||||
<button
|
||||
type="submit"
|
||||
disabled={loading || !hasContent}
|
||||
className="shrink-0 px-2.5 py-1.5 rounded-lg bg-teal-600 hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed text-white text-sm font-medium flex items-center justify-center"
|
||||
>
|
||||
{loading ? (
|
||||
<SpinnerIcon className="w-4 h-4 animate-spin" />
|
||||
) : (
|
||||
<SparklesIcon className="w-4 h-4" />
|
||||
)}
|
||||
</button>
|
||||
)}
|
||||
<input
|
||||
type="text"
|
||||
value={query}
|
||||
onChange={(e) => setQuery(e.target.value)}
|
||||
placeholder="e.g. quiet area, under 400k, near good schools..."
|
||||
className="flex-1 px-2.5 py-1.5 text-sm rounded-lg border border-warm-200 dark:border-warm-700 bg-warm-50 dark:bg-warm-800 text-warm-700 dark:text-warm-200 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-2 focus:ring-teal-400 focus:bg-white dark:focus:bg-warm-800"
|
||||
disabled={loading}
|
||||
autoFocus
|
||||
/>
|
||||
<button
|
||||
type="submit"
|
||||
disabled={loading || !hasContent}
|
||||
className="shrink-0 px-3 py-1.5 rounded-lg bg-teal-600 hover:bg-teal-700 disabled:opacity-50 disabled:cursor-not-allowed text-white text-sm font-medium flex items-center gap-1.5"
|
||||
>
|
||||
{loading ? (
|
||||
<SpinnerIcon className="w-3.5 h-3.5 animate-spin" />
|
||||
) : (
|
||||
<>
|
||||
<SparklesIcon className="w-3.5 h-3.5" />
|
||||
<span>Search</span>
|
||||
</>
|
||||
)}
|
||||
</button>
|
||||
</form>
|
||||
{loading && (
|
||||
<p className="mt-1 text-xs text-teal-600 dark:text-teal-400">
|
||||
{loadingMessage}
|
||||
</p>
|
||||
)}
|
||||
{showExamples && (
|
||||
<div className="mt-1.5 flex flex-wrap gap-1">
|
||||
{EXAMPLE_QUERIES.map((example) => (
|
||||
<button
|
||||
key={example}
|
||||
type="button"
|
||||
onClick={() => handleExampleClick(example)}
|
||||
className="text-xs px-2 py-0.5 rounded-full border border-warm-200 dark:border-warm-700 text-warm-500 dark:text-warm-400 hover:border-teal-400 hover:text-teal-600 dark:hover:text-teal-400 cursor-pointer"
|
||||
>
|
||||
{example}
|
||||
</button>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
{error && errorType === 'verification' && (
|
||||
<p className="mt-1.5 text-xs text-amber-600 dark:text-amber-400">
|
||||
Please verify your email address to use AI-powered search. Check your inbox for a verification link.
|
||||
|
|
@ -83,7 +173,7 @@ export default memo(function AiFilterInput({
|
|||
{error}
|
||||
</p>
|
||||
)}
|
||||
{notes && !error && (
|
||||
{notes && !error && !loading && (
|
||||
<p className="mt-1 text-xs text-warm-500 dark:text-warm-400 italic">
|
||||
{notes}
|
||||
</p>
|
||||
|
|
|
|||
45
pipeline/download/england_boundary.py
Normal file
45
pipeline/download/england_boundary.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
"""Download England country boundary GeoJSON from ONS Open Geography Portal.
|
||||
|
||||
Source: ONS Countries (December 2024) Boundaries UK BGC (Generalised Clipped)
|
||||
Licence: OGL v3
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
# ArcGIS REST API — query for England only, generalised (BGC) resolution
|
||||
URL = (
|
||||
"https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
|
||||
"Countries_December_2024_Boundaries_UK_BGC/FeatureServer/0/query"
|
||||
"?where=CTRY24NM%3D%27England%27&outFields=CTRY24NM&f=geojson"
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
    """Download the England boundary GeoJSON from URL and save it to --output.

    The response is validated before anything is written, and the file is
    written to a temporary sibling and renamed into place so an interrupted
    run never leaves a truncated file at the target path.

    Raises:
        httpx.HTTPStatusError: if the server responds with an error status.
        ValueError: if the service reports an error payload, or the response
            does not contain exactly one feature with a geometry.
    """
    parser = argparse.ArgumentParser(
        description="Download England country boundary GeoJSON"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output GeoJSON file path"
    )
    args = parser.parse_args()
    args.output.parent.mkdir(parents=True, exist_ok=True)

    print("Downloading England boundary from ONS...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()

    data = response.json()
    # ArcGIS services can report a failure as a JSON "error" object even when
    # the HTTP status is OK; surface it instead of a confusing feature count.
    if "error" in data:
        raise ValueError(f"ONS boundary service returned an error: {data['error']}")
    features = data.get("features", [])
    if len(features) != 1:
        raise ValueError(f"Expected 1 feature for England, got {len(features)}")
    if not features[0].get("geometry"):
        raise ValueError("England feature has no geometry")

    # GeoJSON is UTF-8; do not depend on the platform's default encoding.
    tmp_path = args.output.with_name(args.output.name + ".tmp")
    tmp_path.write_text(response.text, encoding="utf-8")
    tmp_path.replace(args.output)
    size_kb = args.output.stat().st_size / 1024
    print(f"Saved to {args.output} ({size_kb:.0f} KB)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -7,6 +7,7 @@ from pathlib import Path
|
|||
from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
|
||||
|
||||
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
|
||||
SPRITES_BASE = "https://protomaps.github.io/basemaps-assets/sprites/v4"
|
||||
TWEMOJI_BASE = "https://cdn.jsdelivr.net/gh/twitter/twemoji@14.0.2/assets/72x72"
|
||||
|
||||
# Font stacks used by @protomaps/basemaps with lang='en'
|
||||
|
|
@ -77,6 +78,15 @@ def main():
|
|||
url = f"{GLYPHS_BASE}/{font_encoded}/{name}"
|
||||
tasks.append((url, font_dir / name))
|
||||
|
||||
# Sprite sheets (light/dark, 1x and 2x)
|
||||
sprites_dir = out / "sprites"
|
||||
for theme in ("light", "dark"):
|
||||
for suffix in ("json", "png"):
|
||||
url = f"{SPRITES_BASE}/{theme}.{suffix}"
|
||||
tasks.append((url, sprites_dir / f"{theme}.{suffix}"))
|
||||
url_2x = f"{SPRITES_BASE}/{theme}@2x.{suffix}"
|
||||
tasks.append((url_2x, sprites_dir / f"{theme}@2x.{suffix}"))
|
||||
|
||||
# Twemoji PNGs
|
||||
twemoji_dir = out / "twemoji"
|
||||
for code in twemoji_codes:
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
Extracts named place nodes (cities, towns, suburbs, etc.) and railway stations
|
||||
(tube, national rail, DLR, etc.) for typeahead search.
|
||||
Reuses the same great-britain-latest.osm.pbf as pois.py.
|
||||
Reuses the same england-latest.osm.pbf as pois.py.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -10,9 +10,16 @@ from pathlib import Path
|
|||
|
||||
import osmium
|
||||
import polars as pl
|
||||
from shapely.geometry import Point
|
||||
from tqdm import tqdm
|
||||
|
||||
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST
|
||||
from pipeline.download.pois import (
|
||||
ENGLAND_BBOX_EAST,
|
||||
ENGLAND_BBOX_NORTH,
|
||||
ENGLAND_BBOX_SOUTH,
|
||||
ENGLAND_BBOX_WEST,
|
||||
)
|
||||
from pipeline.utils.england_geometry import load_england_polygon
|
||||
|
||||
PLACE_TYPES = {"city"}
|
||||
|
||||
|
|
@ -57,10 +64,11 @@ def _station_display_name(name: str, tags: dict[str, str]) -> str:
|
|||
|
||||
|
||||
class PlaceHandler(osmium.SimpleHandler):
|
||||
def __init__(self, progress: tqdm) -> None:
|
||||
def __init__(self, progress: tqdm, england_polygon) -> None:
|
||||
super().__init__()
|
||||
self._progress = progress
|
||||
self.places: list[dict] = []
|
||||
self._england = england_polygon
|
||||
|
||||
def _add(
|
||||
self, name: str, place_type: str, lat: float, lon: float, population: int
|
||||
|
|
@ -82,10 +90,12 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
return
|
||||
lat, lon = n.location.lat, n.location.lon
|
||||
if not (
|
||||
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
|
||||
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
|
||||
ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
|
||||
and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
|
||||
):
|
||||
return
|
||||
if not self._england.contains(Point(lon, lat)):
|
||||
return
|
||||
|
||||
name = n.tags.get("name:en", n.tags.get("name", ""))
|
||||
if not name:
|
||||
|
|
@ -124,9 +134,17 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--boundary",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="England boundary GeoJSON file",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
pbf_file = args.pbf
|
||||
england_polygon = load_england_polygon(args.boundary)
|
||||
|
||||
print("Extracting place nodes: cities + railway stations")
|
||||
with tqdm(
|
||||
unit=" elements",
|
||||
|
|
@ -135,7 +153,7 @@ def main() -> None:
|
|||
smoothing=0.05,
|
||||
mininterval=1.0,
|
||||
) as progress:
|
||||
handler = PlaceHandler(progress)
|
||||
handler = PlaceHandler(progress, england_polygon)
|
||||
handler.apply_file(str(pbf_file), locations=True)
|
||||
|
||||
print(f"Extracted {len(handler.places):,} place nodes")
|
||||
|
|
|
|||
|
|
@ -4,17 +4,20 @@ from tempfile import mkdtemp
|
|||
|
||||
import osmium
|
||||
import polars as pl
|
||||
from shapely.geometry import Point
|
||||
from tqdm import tqdm
|
||||
|
||||
from pipeline.utils.england_geometry import load_england_polygon
|
||||
|
||||
BATCH_SIZE = 50_000
|
||||
|
||||
MIN_OCCURENCE_COUNT = 20
|
||||
|
||||
UK_BBOX_WEST = -7.57
|
||||
UK_BBOX_SOUTH = 49.96
|
||||
UK_BBOX_EAST = 1.68
|
||||
UK_BBOX_NORTH = 58.64
|
||||
# Bounding box for fast pre-filtering before the precise polygon check
|
||||
ENGLAND_BBOX_WEST = -6.45
|
||||
ENGLAND_BBOX_SOUTH = 49.85
|
||||
ENGLAND_BBOX_EAST = 1.77
|
||||
ENGLAND_BBOX_NORTH = 55.82
|
||||
|
||||
POI_TAG_KEYS: list[str] = [
|
||||
"amenity",
|
||||
|
|
@ -31,19 +34,23 @@ POI_TAG_KEYS: list[str] = [
|
|||
|
||||
|
||||
class POIHandler(osmium.SimpleHandler):
|
||||
def __init__(self, progress: tqdm, tmp_dir: Path) -> None:
|
||||
def __init__(self, progress: tqdm, tmp_dir: Path, england_polygon) -> None:
|
||||
super().__init__()
|
||||
self._batch: list[dict] = []
|
||||
self._tmp_dir = tmp_dir
|
||||
self._batch_num = 0
|
||||
self.poi_count = 0
|
||||
self._progress = progress
|
||||
self._england = england_polygon
|
||||
|
||||
def _in_uk(self, lat: float, lon: float) -> bool:
|
||||
return (
|
||||
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
|
||||
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
|
||||
)
|
||||
def _in_england(self, lat: float, lon: float) -> bool:
    """Return whether the point (lat, lon) lies inside the England polygon.

    A cheap bounding-box comparison runs first; the polygon test on
    ``self._england`` is only evaluated for points that fall inside the box.
    """
    inside_bbox = (
        ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
        and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
    )
    # Short-circuit keeps the polygon check off the hot path for far-away points.
    # Point takes (x, y), i.e. (lon, lat).
    return inside_bbox and self._england.contains(Point(lon, lat))
|
||||
|
||||
def _match_tags(self, tags: osmium.osm.TagList) -> list[str]:
|
||||
return [f"{key}/{tags[key]}" for key in POI_TAG_KEYS if key in tags]
|
||||
|
|
@ -90,7 +97,7 @@ class POIHandler(osmium.SimpleHandler):
|
|||
if not n.location.valid:
|
||||
return
|
||||
lat, lon = n.location.lat, n.location.lon
|
||||
if not self._in_uk(lat, lon):
|
||||
if not self._in_england(lat, lon):
|
||||
return
|
||||
categories = self._match_tags(n.tags)
|
||||
for category in categories:
|
||||
|
|
@ -107,11 +114,19 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--boundary",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="England boundary GeoJSON file",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
pbf_file = args.pbf
|
||||
print(f"Tag keys: {POI_TAG_KEYS}")
|
||||
|
||||
england_polygon = load_england_polygon(args.boundary)
|
||||
|
||||
tmp_dir = Path(mkdtemp(prefix="pois_"))
|
||||
with tqdm(
|
||||
unit=" elements",
|
||||
|
|
@ -120,7 +135,7 @@ def main() -> None:
|
|||
smoothing=0.05,
|
||||
mininterval=1.0,
|
||||
) as progress:
|
||||
handler = POIHandler(progress, tmp_dir)
|
||||
handler = POIHandler(progress, tmp_dir, england_polygon)
|
||||
handler.apply_file(str(pbf_file), locations=True)
|
||||
handler._flush_batch() # write any remaining POIs
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ from pathlib import Path
|
|||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.utils.england_geometry import in_england_mask
|
||||
|
||||
|
||||
DROP_CATEGORIES = {
|
||||
# Street furniture & infrastructure
|
||||
|
|
@ -1056,7 +1058,11 @@ NAPTAN_EMOJIS: dict[str, str] = {
|
|||
}
|
||||
|
||||
|
||||
def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame:
|
||||
def transform(
|
||||
input_path: Path,
|
||||
naptan_path: Path | None = None,
|
||||
boundary_path: Path | None = None,
|
||||
) -> pl.LazyFrame:
|
||||
lf = pl.scan_parquet(input_path)
|
||||
|
||||
# Get all unique categories present in the data
|
||||
|
|
@ -1072,16 +1078,14 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
|
|||
if unmapped:
|
||||
raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")
|
||||
|
||||
# Verify every CATEGORY_MAP key actually exists in the data (catch typos)
|
||||
# Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts)
|
||||
mapped_but_absent = []
|
||||
all_set = set(all_categories)
|
||||
for cat in CATEGORY_MAP:
|
||||
if cat not in all_set:
|
||||
mapped_but_absent.append(cat)
|
||||
if mapped_but_absent:
|
||||
raise ValueError(
|
||||
f"CATEGORY_MAP contains categories not in data: {sorted(mapped_but_absent)}"
|
||||
)
|
||||
print(f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}")
|
||||
|
||||
# Drop unwanted categories
|
||||
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
|
||||
|
|
@ -1105,7 +1109,15 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
|
|||
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
|
||||
)
|
||||
|
||||
naptan = pl.scan_parquet(naptan_path).with_columns(
|
||||
naptan_df = pl.scan_parquet(naptan_path).collect()
|
||||
if boundary_path is not None:
|
||||
mask = in_england_mask(
|
||||
boundary_path,
|
||||
naptan_df["lat"].to_numpy(),
|
||||
naptan_df["lng"].to_numpy(),
|
||||
)
|
||||
naptan_df = naptan_df.filter(pl.Series(mask))
|
||||
naptan = naptan_df.lazy().with_columns(
|
||||
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
|
||||
pl.lit("Public Transport").alias("group"),
|
||||
)
|
||||
|
|
@ -1122,12 +1134,18 @@ def main():
|
|||
parser.add_argument(
|
||||
"--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--boundary",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="England boundary GeoJSON file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
df = transform(args.input, args.naptan).collect(engine="streaming")
|
||||
df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
|
||||
|
||||
df.write_parquet(args.output)
|
||||
|
||||
|
|
|
|||
33
pipeline/utils/england_geometry.py
Normal file
33
pipeline/utils/england_geometry.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
"""England boundary polygon for accurate point-in-country filtering.
|
||||
|
||||
Uses shapely prepared geometry for fast single-point checks (osmium handlers)
|
||||
and vectorized shapely.contains for batch checks (Polars DataFrames).
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import shapely
|
||||
from shapely.geometry import shape
|
||||
from shapely.prepared import PreparedGeometry, prep
|
||||
|
||||
|
||||
def _load_england_geometry(geojson_path: Path):
    """Read the geometry of the first feature in a GeoJSON file as a shapely geometry.

    Raises:
        ValueError: if the file contains no features.
    """
    # GeoJSON is UTF-8; do not depend on the platform's default encoding.
    with open(geojson_path, encoding="utf-8") as f:
        data = json.load(f)
    features = data.get("features") or []
    if not features:
        raise ValueError(f"No features found in boundary file {geojson_path}")
    return shape(features[0]["geometry"])


def load_england_polygon(geojson_path: Path) -> PreparedGeometry:
    """Load England boundary as a prepared shapely polygon for fast contains checks.

    Args:
        geojson_path: GeoJSON file whose first feature is the England boundary.

    Returns:
        A prepared geometry suitable for repeated single-point ``contains`` calls.
    """
    return prep(_load_england_geometry(geojson_path))


def in_england_mask(geojson_path: Path, lats: np.ndarray, lngs: np.ndarray) -> np.ndarray:
    """Vectorized check: which (lat, lng) points are within England.

    Args:
        geojson_path: GeoJSON file whose first feature is the England boundary.
        lats: Latitudes of the points to test.
        lngs: Longitudes of the points to test, aligned with ``lats``.

    Returns a boolean numpy array.
    """
    polygon = _load_england_geometry(geojson_path)
    # Prepare the polygon in place so the vectorized contains call below can
    # reuse the prepared form for every point.
    shapely.prepare(polygon)
    # shapely points are (x, y), i.e. (lng, lat).
    pts = shapely.points(lngs, lats)
    return shapely.contains(polygon, pts)
|
||||
Loading…
Add table
Add a link
Reference in a new issue