diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index b72b2b8..f4838fb 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -10,7 +10,7 @@ import type { ApiResponse, POI, POIResponse, - POICategoryGroup, + POICategoriesMap, ColorMode, } from './types'; @@ -55,23 +55,30 @@ export default function App() { // POI state const [pois, setPois] = useState([]); - const [selectedPOICategories, setSelectedPOICategories] = useState>( - new Set() - ); + const [poiCategories, setPOICategories] = useState({}); + const [selectedPOICategories, setSelectedPOICategories] = useState>(new Set()); const poiDebounceRef = useRef | null>(null); const poiAbortControllerRef = useRef(null); + // Fetch POI category definitions from server on mount + useEffect(() => { + fetch(`${getApiBaseUrl()}/api/poi-categories`) + .then((res) => res.json()) + .then((json: { categories: POICategoriesMap }) => { + setPOICategories(json.categories); + }) + .catch((err) => console.error('Failed to fetch POI categories:', err)); + }, []); + // Debounced fetch when dependencies change useEffect(() => { if (!bounds) return; - // Clear previous debounce timer if (debounceRef.current) { clearTimeout(debounceRef.current); } debounceRef.current = setTimeout(async () => { - // Cancel any in-flight request if (abortControllerRef.current) { abortControllerRef.current.abort(); } @@ -167,6 +174,7 @@ export default function App() { filters={filters} onChange={setFilters} zoom={zoom} + poiCategories={poiCategories} selectedPOICategories={selectedPOICategories} onPOICategoriesChange={setSelectedPOICategories} colorMode={colorMode} diff --git a/frontend/src/components/Filters.tsx b/frontend/src/components/Filters.tsx index 9e078f9..b956df8 100644 --- a/frontend/src/components/Filters.tsx +++ b/frontend/src/components/Filters.tsx @@ -1,32 +1,25 @@ +import { useState, useRef, useEffect } from 'react'; import { Slider } from './ui/slider'; import { Label } from './ui/label'; import { YEAR_MIN, YEAR_MAX, YEAR_STEP, PRICE_MIN, PRICE_MAX, PRICE_STEP } from '../lib/constants'; -import type { Filters as FiltersType, POICategoryGroup, ColorMode } from '../types'; -import { POI_CATEGORY_GROUPS } from '../types'; +import type { Filters as FiltersType, POICategoriesMap, ColorMode } from '../types'; interface FiltersProps { filters: FiltersType; onChange: (filters: FiltersType) => void; zoom: number; - selectedPOICategories: Set; - onPOICategoriesChange: (categories: Set) => void; + poiCategories: POICategoriesMap; + selectedPOICategories: Set; + onPOICategoriesChange: (categories: Set) => void; colorMode: ColorMode; onColorModeChange: (mode: ColorMode) => void; } -const POI_LABELS: Record = { - schools: '๐Ÿซ Schools', - healthcare: '๐Ÿฅ Healthcare', - transport: '๐Ÿš‰ Transport', - parks: '๐ŸŒณ Parks', - emergency: '๐Ÿšจ Emergency', - supermarkets: '๐Ÿ›’ Supermarkets', -}; - export default function Filters({ filters, onChange, zoom, + poiCategories, selectedPOICategories, onPOICategoriesChange, colorMode, @@ -34,16 +27,41 @@ export default function Filters({ }: FiltersProps) { const update = (key: keyof FiltersType, value: number) => onChange({ ...filters, [key]: value }); - const togglePOICategory = (category: POICategoryGroup) => { + const [dropdownOpen, setDropdownOpen] = useState(false); + const dropdownRef = useRef(null); + + // Close dropdown when clicking outside + useEffect(() => { + function handleClickOutside(event: MouseEvent) { + if (dropdownRef.current && !dropdownRef.current.contains(event.target as Node)) { + setDropdownOpen(false); + } + } + document.addEventListener('mousedown', handleClickOutside); + return () => document.removeEventListener('mousedown', handleClickOutside); + }, []); + + const toggleCategory = (key: string) => { const newSet = new Set(selectedPOICategories); - if (newSet.has(category)) { - newSet.delete(category); + if (newSet.has(key)) { + newSet.delete(key); } else { - newSet.add(category); + newSet.add(key); } onPOICategoriesChange(newSet); }; + const selectAll = () => { + onPOICategoriesChange(new Set(Object.keys(poiCategories))); + }; + + const selectNone = () => { + onPOICategoriesChange(new Set()); + }; + + const categoryKeys = Object.keys(poiCategories); + const selectedCount = selectedPOICategories.size; + return (

UK Property Prices

@@ -139,21 +157,69 @@ export default function Filters({
)} -
+
-
- {POI_CATEGORY_GROUPS.map((category) => ( - - ))} -
+ + + {dropdownOpen && ( +
+
+ + | + +
+
+ {categoryKeys.map((key) => { + const { emoji, label } = poiCategories[key]; + return ( + + ); + })} +
+
+ )}
); diff --git a/frontend/src/components/Map.tsx b/frontend/src/components/Map.tsx index 0c52f12..a3e8ec2 100644 --- a/frontend/src/components/Map.tsx +++ b/frontend/src/components/Map.tsx @@ -1,6 +1,7 @@ import { useCallback, useRef, useEffect, useState, useMemo } from 'react'; -import { Map as MapGL } from 'react-map-gl/maplibre'; -import DeckGL from '@deck.gl/react'; +import { Map as MapGL, useControl } from 'react-map-gl/maplibre'; +import type { MapRef } from 'react-map-gl/maplibre'; +import { MapboxOverlay } from '@deck.gl/mapbox'; import { H3HexagonLayer } from '@deck.gl/geo-layers'; import { IconLayer } from '@deck.gl/layers'; import type { PickingInfo } from '@deck.gl/core'; @@ -19,35 +20,119 @@ const TWEMOJI_BASE = 'https://cdn.jsdelivr.net/gh/twitter/twemoji@14.0.2/assets/ // Map category to Twemoji codepoint (emoji unicode -> hex) const POI_EMOJI_CODES: Record = { - // Schools - elementary_school: '1f3eb', // ๐Ÿซ - school: '1f3eb', - high_school: '1f393', // ๐ŸŽ“ + // Education + school: '1f3eb', // ๐Ÿซ preschool: '1f476', // ๐Ÿ‘ถ - college_university: '1f393', - private_school: '1f3eb', + college_university: '1f393', // ๐ŸŽ“ + library: '1f4da', // ๐Ÿ“š // Healthcare doctor: '1f3e5', // ๐Ÿฅ dentist: '1f9b7', // ๐Ÿฆท pharmacy: '1f48a', // ๐Ÿ’Š hospital: '1f3e5', public_health_clinic: '1f3e5', + veterinary: '1f43e', // ๐Ÿพ + nursing_home: '1f3e0', // ๐Ÿ  + social_facility: '1f91d', // ๐Ÿค // Transport train_station: '1f689', // ๐Ÿš‰ bus_station: '1f68c', // ๐ŸšŒ + bus_stop: '1f68f', // ๐Ÿš metro_station: '1f687', // ๐Ÿš‡ - light_rail_and_subway_stations: '1f687', - // Parks + light_rail_station: '1f687', + tram_stop: '1f68a', // ๐ŸšŠ + ferry_terminal: '26f4', // โ›ด + airport: '2708', // โœˆ + // Parks & Leisure park: '1f333', // ๐ŸŒณ national_park: '1f3de', // ๐Ÿž + nature_reserve: '1f33f', // ๐ŸŒฟ dog_park: '1f415', // ๐Ÿ• + playground: '1f3a0', // ๐ŸŽ  + garden: '1f33a', // ๐ŸŒบ + sports_centre: '1f3c3', // ๐Ÿƒ + swimming_pool: '1f3ca', // ๐ŸŠ + gym: '1f4aa', // ๐Ÿ’ช + golf_course: '26f3', // โ›ณ + marina: '26f5', // โ›ต // Emergency police_department: '1f694', // ๐Ÿš” fire_department: '1f692', // ๐Ÿš’ - // Supermarkets + // Supermarkets & Grocery supermarket: '1f6d2', // ๐Ÿ›’ grocery_store: '1f6d2', convenience_store: '1f3ea', // ๐Ÿช + bakery: '1f35e', // ๐Ÿž + butcher: '1f969', // ๐Ÿฅฉ + greengrocer: '1f966', // ๐Ÿฅฆ + deli: '1f9c0', // ๐Ÿง€ + // Shopping + department_store: '1f3ec', // ๐Ÿฌ + clothing_store: '1f455', // ๐Ÿ‘• + shoe_store: '1f45f', // ๐Ÿ‘Ÿ + electronics_store: '1f4f1', // ๐Ÿ“ฑ + hardware_store: '1f527', // ๐Ÿ”ง + furniture_store: '1fa91', // ๐Ÿช‘ + bookshop: '1f4d6', // ๐Ÿ“– + newsagent: '1f4f0', // ๐Ÿ“ฐ + charity_shop: '1f49c', // ๐Ÿ’œ + shopping_centre: '1f6cd', // ๐Ÿ› + optician: '1f453', // ๐Ÿ‘“ + off_licence: '1f37a', // ๐Ÿบ + // Food & Drink + restaurant: '1f37d', // ๐Ÿฝ + cafe: '2615', // โ˜• + pub: '1f37b', // ๐Ÿป + bar: '1f378', // ๐Ÿธ + fast_food: '1f354', // ๐Ÿ” + food_court: '1f372', // ๐Ÿฒ + ice_cream: '1f366', // ๐Ÿฆ + beer_garden: '1f37a', // ๐Ÿบ + // Personal Care + hairdresser: '1f487', // ๐Ÿ’‡ + beauty_salon: '1f484', // ๐Ÿ’„ + laundry: '1f9fa', // ๐Ÿงบ + dry_cleaning: '1f455', // ๐Ÿ‘• + // Finance + bank: '1f3e6', // ๐Ÿฆ + atm: '1f4b3', // ๐Ÿ’ณ + bureau_de_change: '1f4b1', // ๐Ÿ’ฑ + // Entertainment & Culture + cinema: '1f3ac', // ๐ŸŽฌ + theatre: '1f3ad', // ๐ŸŽญ + nightclub: '1f483', // ๐Ÿ’ƒ + community_centre: '1f3db', // ๐Ÿ› + arts_centre: '1f3a8', // ๐ŸŽจ + museum: '1f3db', // ๐Ÿ› + gallery: '1f5bc', // ๐Ÿ–ผ + attraction: '2b50', // โญ + zoo: '1f418', // ๐Ÿ˜ + theme_park: '1f3a2', // ๐ŸŽข + viewpoint: '1f301', // ๐ŸŒ + // Accommodation + hotel: '1f3e8', // ๐Ÿจ + hostel: '1f6cf', // ๐Ÿ› + guest_house: '1f3e1', // ๐Ÿก + campsite: '26fa', // โ›บ + caravan_site: '1f699', // ๐Ÿš™ + // Religion + place_of_worship: '1f6d0', // ๐Ÿ› + // Government & Public + town_hall: '1f3db', // ๐Ÿ› + courthouse: '2696', // โš– + post_office: '1f4ee', // ๐Ÿ“ฎ + prison: '1f513', // ๐Ÿ”“ + public_toilets: '1f6bb', // ๐Ÿšป + // Automotive + petrol_station: '26fd', // โ›ฝ + ev_charging: '1f50c', // ๐Ÿ”Œ + car_dealer: '1f697', // ๐Ÿš— + car_repair: '1f527', // ๐Ÿ”ง + parking: '1f17f', // ๐Ÿ…ฟ + bicycle_parking: '1f6b2', // ๐Ÿšฒ + // Recycling & Waste + recycling: '267b', // โ™ป + waste_disposal: '1f5d1', // ๐Ÿ—‘ }; function getPOIIconUrl(category: string): string { @@ -57,29 +142,34 @@ function getPOIIconUrl(category: string): string { // Tooltip emojis (these render fine in HTML) const TOOLTIP_EMOJIS: Record = { - elementary_school: '๐Ÿซ', - school: '๐Ÿซ', - high_school: '๐ŸŽ“', - preschool: '๐Ÿ‘ถ', - college_university: '๐ŸŽ“', - private_school: '๐Ÿซ', - doctor: '๐Ÿ‘จโ€โš•๏ธ', - dentist: '๐Ÿฆท', - pharmacy: '๐Ÿ’Š', - hospital: '๐Ÿฅ', - public_health_clinic: '๐Ÿฅ', - train_station: '๐Ÿš‰', - bus_station: '๐ŸšŒ', - metro_station: '๐Ÿš‡', - light_rail_and_subway_stations: '๐Ÿš‡', - park: '๐ŸŒณ', - national_park: '๐Ÿž๏ธ', - dog_park: '๐Ÿ•', - police_department: '๐Ÿš”', - fire_department: '๐Ÿš’', - supermarket: '๐Ÿ›’', - grocery_store: '๐Ÿ›’', - convenience_store: '๐Ÿช', + school: '๐Ÿซ', preschool: '๐Ÿ‘ถ', college_university: '๐ŸŽ“', library: '๐Ÿ“š', + doctor: '๐Ÿฅ', dentist: '๐Ÿฆท', pharmacy: '๐Ÿ’Š', hospital: '๐Ÿฅ', + public_health_clinic: '๐Ÿฅ', veterinary: '๐Ÿพ', nursing_home: '๐Ÿ ', social_facility: '๐Ÿค', + train_station: '๐Ÿš‰', bus_station: '๐ŸšŒ', bus_stop: '๐Ÿš', metro_station: '๐Ÿš‡', + light_rail_station: '๐Ÿš‡', tram_stop: '๐ŸšŠ', ferry_terminal: 'โ›ด๏ธ', airport: 'โœˆ๏ธ', + park: '๐ŸŒณ', national_park: '๐Ÿž๏ธ', nature_reserve: '๐ŸŒฟ', dog_park: '๐Ÿ•', + playground: '๐ŸŽ ', garden: '๐ŸŒบ', sports_centre: '๐Ÿƒ', swimming_pool: '๐ŸŠ', + gym: '๐Ÿ’ช', golf_course: 'โ›ณ', marina: 'โ›ต', + police_department: '๐Ÿš”', fire_department: '๐Ÿš’', + supermarket: '๐Ÿ›’', grocery_store: '๐Ÿ›’', convenience_store: '๐Ÿช', + bakery: '๐Ÿž', butcher: '๐Ÿฅฉ', greengrocer: '๐Ÿฅฆ', deli: '๐Ÿง€', + department_store: '๐Ÿฌ', clothing_store: '๐Ÿ‘•', shoe_store: '๐Ÿ‘Ÿ', + electronics_store: '๐Ÿ“ฑ', hardware_store: '๐Ÿ”ง', furniture_store: '๐Ÿช‘', + bookshop: '๐Ÿ“–', newsagent: '๐Ÿ“ฐ', charity_shop: '๐Ÿ’œ', shopping_centre: '๐Ÿ›๏ธ', + optician: '๐Ÿ‘“', off_licence: '๐Ÿบ', + restaurant: '๐Ÿฝ๏ธ', cafe: 'โ˜•', pub: '๐Ÿป', bar: '๐Ÿธ', + fast_food: '๐Ÿ”', food_court: '๐Ÿฒ', ice_cream: '๐Ÿฆ', beer_garden: '๐Ÿบ', + hairdresser: '๐Ÿ’‡', beauty_salon: '๐Ÿ’„', laundry: '๐Ÿงบ', dry_cleaning: '๐Ÿ‘•', + bank: '๐Ÿฆ', atm: '๐Ÿ’ณ', bureau_de_change: '๐Ÿ’ฑ', + cinema: '๐ŸŽฌ', theatre: '๐ŸŽญ', nightclub: '๐Ÿ’ƒ', community_centre: '๐Ÿ›๏ธ', + arts_centre: '๐ŸŽจ', museum: '๐Ÿ›๏ธ', gallery: '๐Ÿ–ผ๏ธ', attraction: 'โญ', + zoo: '๐Ÿ˜', theme_park: '๐ŸŽข', viewpoint: '๐ŸŒ', + hotel: '๐Ÿจ', hostel: '๐Ÿ›๏ธ', guest_house: '๐Ÿก', campsite: 'โ›บ', caravan_site: '๐Ÿš™', + place_of_worship: '๐Ÿ›', + town_hall: '๐Ÿ›๏ธ', courthouse: 'โš–๏ธ', post_office: '๐Ÿ“ฎ', prison: '๐Ÿ”“', public_toilets: '๐Ÿšป', + petrol_station: 'โ›ฝ', ev_charging: '๐Ÿ”Œ', car_dealer: '๐Ÿš—', car_repair: '๐Ÿ”ง', + parking: '๐Ÿ…ฟ๏ธ', bicycle_parking: '๐Ÿšฒ', + recycling: 'โ™ป๏ธ', waste_disposal: '๐Ÿ—‘๏ธ', }; function getTooltipEmoji(category: string): string { @@ -158,7 +248,7 @@ function journeyTimeToColor(minutes: number | null | undefined): [number, number } function zoomToResolution(zoom: number): number { - if (zoom < 8.5) return 7; + if (zoom < 7) return 7; if (zoom < 9.5) return 8; if (zoom < 11) return 9; if (zoom < 13) return 10; @@ -209,6 +299,22 @@ interface Dimensions { height: number; } +// First label layer in the Carto Positron style โ€” hexagons render below this +const LABEL_LAYER_ID = 'waterway_label'; + +function DeckOverlay({ + layers, + getTooltip, +}: { + layers: (H3HexagonLayer | IconLayer)[]; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + getTooltip: any; +}) { + const overlay = useControl(() => new MapboxOverlay({ interleaved: true })); + overlay.setProps({ layers, getTooltip }); + return null; +} + export default function Map({ data, pois, onViewChange, colorMode }: MapProps) { const containerRef = useRef(null); const [viewState, setViewState] = useState(INITIAL_VIEW); @@ -240,12 +346,23 @@ export default function Map({ data, pois, onViewChange, colorMode }: MapProps) { onViewChange({ resolution, bounds, zoom: viewState.zoom }); }, [viewState, dimensions, onViewChange]); - const handleViewStateChange = useCallback((params: { viewState: unknown }) => { - const newViewState = params.viewState as ViewState; - setViewState(newViewState); + const handleMove = useCallback((evt: { viewState: ViewState }) => { + setViewState(evt.viewState); }, []); - // Popup state for POI hover (using screen coordinates) + // Make place labels more legible over the colored hexagons + const handleMapLoad = useCallback((evt: { target: MapRef['getMap'] extends () => infer M ? M : never }) => { + const map = evt.target; + for (const layer of map.getStyle().layers || []) { + if (layer.type !== 'symbol') continue; + // Stronger white halo so text pops over hex fills + map.setPaintProperty(layer.id, 'text-halo-color', 'rgba(255,255,255,1)'); + map.setPaintProperty(layer.id, 'text-halo-width', 2); + map.setPaintProperty(layer.id, 'text-color', '#222'); + } + }, []); + + // Popup state for POI hover const [popupInfo, setPopupInfo] = useState<{ x: number; y: number; @@ -283,6 +400,9 @@ export default function Map({ data, pois, onViewChange, colorMode }: MapProps) { pickable: true, opacity: 0.5, highPrecision: true, + // Render below labels so road names, place names etc. stay visible + // @ts-expect-error beforeId is a MapboxOverlay interleave prop, not typed in LayerProps + beforeId: LABEL_LAYER_ID, }), new IconLayer({ id: 'poi-icons', @@ -303,7 +423,6 @@ export default function Map({ data, pois, onViewChange, colorMode }: MapProps) { [data, pois, handlePoiHover, colorMode] ); - // Tooltip for hexagons only (POIs use MapLibre popup) const getTooltip = useCallback(({ object }: { object?: HexagonData }) => { if (!object || !('h3' in object)) return null; @@ -339,15 +458,15 @@ export default function Map({ data, pois, onViewChange, colorMode }: MapProps) { return (
- - - + + {popupInfo && (
; diff --git a/pipeline/pois/__init__.py b/pipeline/pois/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pipeline/pois/__main__.py b/pipeline/pois/__main__.py new file mode 100644 index 0000000..ac6c0b1 --- /dev/null +++ b/pipeline/pois/__main__.py @@ -0,0 +1,181 @@ +"""Single-pass POI extraction from OSM PBF file using pyosmium.""" + +import json +import urllib.request + +import osmium +import polars as pl +from tqdm import tqdm + +from .config import ( + GB_PBF_FILE, + GEOFABRIK_GB_URL, + OSM_TAG_MAPPING, + OUTPUT_FILE, + TAG_KEYS_TO_CHECK, + UK_BBOX_EAST, + UK_BBOX_NORTH, + UK_BBOX_SOUTH, + UK_BBOX_WEST, +) + +# Approximate element count for the GB PBF extract (for progress estimation). +ESTIMATED_ELEMENTS = 500_000_000 + + +def download_pbf() -> None: + """Download Great Britain PBF extract from Geofabrik.""" + GB_PBF_FILE.parent.mkdir(parents=True, exist_ok=True) + tmp = GB_PBF_FILE.with_suffix(".pbf.tmp") + print(f"Downloading {GEOFABRIK_GB_URL}") + + with ( + tqdm(unit="B", unit_scale=True, desc="Downloading") as bar, + urllib.request.urlopen(GEOFABRIK_GB_URL) as resp, + open(tmp, "wb") as f, + ): + length = resp.headers.get("Content-Length") + if length: + bar.total = int(length) + while chunk := resp.read(1 << 20): + f.write(chunk) + bar.update(len(chunk)) + + tmp.rename(GB_PBF_FILE) + print(f"Saved to {GB_PBF_FILE}") + + +class POIHandler(osmium.SimpleHandler): + """Streams OSM data, filters to UK bbox, extracts matching POIs.""" + + def __init__(self, progress: tqdm) -> None: + super().__init__() + self.pois: list[dict] = [] + self._poi_count = 0 + self._progress = progress + + def _in_uk(self, lat: float, lon: float) -> bool: + return ( + UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH + and UK_BBOX_WEST <= lon <= UK_BBOX_EAST + ) + + def _match_tags(self, tags: osmium.osm.TagList) -> str | None: + for key in TAG_KEYS_TO_CHECK: + if key in tags: + value = tags[key] + if value in TAG_KEYS_TO_CHECK[key]: + return OSM_TAG_MAPPING[(key, value)] + return None + + def _get_name(self, tags: osmium.osm.TagList) -> str: + return tags.get("name:en", tags.get("name", "")) + + def _tags_to_json(self, tags: osmium.osm.TagList) -> str: + return json.dumps({tag.k: tag.v for tag in tags}) + + def _add_poi( + self, osm_id: str, tags: osmium.osm.TagList, category: str, lat: float, lng: float + ) -> None: + self.pois.append( + { + "id": osm_id, + "name": self._get_name(tags), + "category": category, + "lat": lat, + "lng": lng, + "osm_tags": self._tags_to_json(tags), + } + ) + self._poi_count += 1 + self._progress.set_postfix(pois=f"{self._poi_count:,}", refresh=False) + + def _tick(self) -> None: + self._progress.update(1) + + def node(self, n: osmium.osm.Node) -> None: + self._tick() + if not n.location.valid: + return + lat, lon = n.location.lat, n.location.lon + if not self._in_uk(lat, lon): + return + category = self._match_tags(n.tags) + if category: + self._add_poi(f"n{n.id}", n.tags, category, lat, lon) + + def way(self, w: osmium.osm.Way) -> None: + self._tick() + category = self._match_tags(w.tags) + if not category: + return + + lats = [] + lons = [] + for node in w.nodes: + try: + lats.append(node.location.lat) + lons.append(node.location.lon) + except osmium.InvalidLocationError: + continue + + if not lats: + return + + centroid_lat = sum(lats) / len(lats) + centroid_lng = sum(lons) / len(lons) + + if not self._in_uk(centroid_lat, centroid_lng): + return + + self._add_poi(f"w{w.id}", w.tags, category, centroid_lat, centroid_lng) + + +def main() -> None: + if not GB_PBF_FILE.exists(): + download_pbf() + + print(f"=== POI Extraction from {GB_PBF_FILE} ===") + print( + f"UK bbox: ({UK_BBOX_WEST}, {UK_BBOX_SOUTH}, {UK_BBOX_EAST}, {UK_BBOX_NORTH})" + ) + print(f"Categories: {len(OSM_TAG_MAPPING)}") + print() + + with tqdm( + total=ESTIMATED_ELEMENTS, + unit=" elements", + unit_scale=True, + desc="Streaming", + smoothing=0.05, + mininterval=1.0, + ) as progress: + handler = POIHandler(progress) + handler.apply_file(str(GB_PBF_FILE), locations=True) + + print(f"Extracted {len(handler.pois):,} POIs") + + if not handler.pois: + print("No POIs found.") + return + + df = pl.DataFrame(handler.pois) + + OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True) + df.write_parquet(OUTPUT_FILE) + print(f"Saved to {OUTPUT_FILE}") + + print("\n=== Summary ===") + print(f"Total POIs: {len(df):,}") + print("\nPOIs by category:") + category_counts = ( + df.group_by("category") + .agg(pl.len().alias("count")) + .sort("count", descending=True) + ) + for row in category_counts.iter_rows(named=True): + print(f" {row['category']}: {row['count']:,}") + + +if __name__ == "__main__": + main() diff --git a/pipeline/pois/config.py b/pipeline/pois/config.py new file mode 100644 index 0000000..14fb439 --- /dev/null +++ b/pipeline/pois/config.py @@ -0,0 +1,147 @@ +"""Configuration for POI extraction from OpenStreetMap.""" + +from pathlib import Path + +# File paths +DATA_DIR = Path(__file__).parent.parent.parent / "data_sources" +GB_PBF_FILE = DATA_DIR / "great-britain-latest.osm.pbf" +OUTPUT_FILE = DATA_DIR / "uk_pois.parquet" + +# Geofabrik download URL for Great Britain (~1.1GB PBF, much faster than planet XML) +GEOFABRIK_GB_URL = ( + "https://download.geofabrik.de/europe/great-britain-latest.osm.pbf" +) + +# UK bounding box (west, south, east, north) โ€” used for way centroid filtering +UK_BBOX_WEST = -7.57 +UK_BBOX_SOUTH = 49.96 +UK_BBOX_EAST = 1.68 +UK_BBOX_NORTH = 58.64 + +# OSM tag mapping to categories +# Maps (tag_key, tag_value) -> category name +OSM_TAG_MAPPING: dict[tuple[str, str], str] = { + # Education + ("amenity", "school"): "school", + ("amenity", "kindergarten"): "preschool", + ("amenity", "college"): "college_university", + ("amenity", "university"): "college_university", + ("amenity", "library"): "library", + ("amenity", "language_school"): "school", + ("amenity", "music_school"): "school", + ("amenity", "driving_school"): "school", + # Healthcare + ("amenity", "hospital"): "hospital", + ("amenity", "clinic"): "public_health_clinic", + ("amenity", "doctors"): "doctor", + ("amenity", "dentist"): "dentist", + ("amenity", "pharmacy"): "pharmacy", + ("amenity", "veterinary"): "veterinary", + ("amenity", "nursing_home"): "nursing_home", + ("amenity", "social_facility"): "social_facility", + # Transport + ("railway", "station"): "train_station", + ("railway", "halt"): "train_station", + ("railway", "tram_stop"): "tram_stop", + ("amenity", "bus_station"): "bus_station", + ("amenity", "ferry_terminal"): "ferry_terminal", + ("public_transport", "station"): "train_station", + ("public_transport", "stop_position"): "bus_stop", + ("station", "subway"): "metro_station", + ("station", "light_rail"): "light_rail_station", + ("aeroway", "aerodrome"): "airport", + ("highway", "bus_stop"): "bus_stop", + # Parks & Leisure + ("leisure", "park"): "park", + ("leisure", "nature_reserve"): "nature_reserve", + ("leisure", "dog_park"): "dog_park", + ("leisure", "playground"): "playground", + ("leisure", "sports_centre"): "sports_centre", + ("leisure", "swimming_pool"): "swimming_pool", + ("leisure", "fitness_centre"): "gym", + ("leisure", "golf_course"): "golf_course", + ("leisure", "garden"): "garden", + ("leisure", "marina"): "marina", + ("boundary", "national_park"): "national_park", + # Emergency + ("amenity", "police"): "police_department", + ("amenity", "fire_station"): "fire_department", + # Shopping + ("shop", "supermarket"): "supermarket", + ("shop", "convenience"): "convenience_store", + ("shop", "grocery"): "grocery_store", + ("shop", "bakery"): "bakery", + ("shop", "butcher"): "butcher", + ("shop", "greengrocer"): "greengrocer", + ("shop", "deli"): "deli", + ("shop", "department_store"): "department_store", + ("shop", "clothes"): "clothing_store", + ("shop", "shoes"): "shoe_store", + ("shop", "electronics"): "electronics_store", + ("shop", "hardware"): "hardware_store", + ("shop", "furniture"): "furniture_store", + ("shop", "car"): "car_dealer", + ("shop", "car_repair"): "car_repair", + ("shop", "hairdresser"): "hairdresser", + ("shop", "beauty"): "beauty_salon", + ("shop", "optician"): "optician", + ("shop", "newsagent"): "newsagent", + ("shop", "books"): "bookshop", + ("shop", "charity"): "charity_shop", + ("shop", "alcohol"): "off_licence", + ("shop", "laundry"): "laundry", + ("shop", "dry_cleaning"): "dry_cleaning", + ("shop", "mall"): "shopping_centre", + # Food & Drink + ("amenity", "restaurant"): "restaurant", + ("amenity", "cafe"): "cafe", + ("amenity", "pub"): "pub", + ("amenity", "bar"): "bar", + ("amenity", "fast_food"): "fast_food", + ("amenity", "food_court"): "food_court", + ("amenity", "ice_cream"): "ice_cream", + ("amenity", "biergarten"): "beer_garden", + # Finance + ("amenity", "bank"): "bank", + ("amenity", "atm"): "atm", + ("amenity", "bureau_de_change"): "bureau_de_change", + # Entertainment & Culture + ("amenity", "cinema"): "cinema", + ("amenity", "theatre"): "theatre", + ("amenity", "nightclub"): "nightclub", + ("amenity", "community_centre"): "community_centre", + ("amenity", "arts_centre"): "arts_centre", + ("tourism", "museum"): "museum", + ("tourism", "gallery"): "gallery", + ("tourism", "attraction"): "attraction", + ("tourism", "zoo"): "zoo", + ("tourism", "theme_park"): "theme_park", + ("tourism", "viewpoint"): "viewpoint", + # Accommodation + ("tourism", "hotel"): "hotel", + ("tourism", "hostel"): "hostel", + ("tourism", "guest_house"): "guest_house", + ("tourism", "camp_site"): "campsite", + ("tourism", "caravan_site"): "caravan_site", + # Religion + ("amenity", "place_of_worship"): "place_of_worship", + # Government & Public + ("amenity", "townhall"): "town_hall", + ("amenity", "courthouse"): "courthouse", + ("amenity", "post_office"): "post_office", + ("amenity", "prison"): "prison", + ("amenity", "recycling"): "recycling", + ("amenity", "waste_disposal"): "waste_disposal", + ("amenity", "toilets"): "public_toilets", + # Fuel + ("amenity", "fuel"): "petrol_station", + ("amenity", "charging_station"): "ev_charging", + # Parking + ("amenity", "parking"): "parking", + ("amenity", "bicycle_parking"): "bicycle_parking", +} + +# Build reverse lookup: tag_key -> set of tag_values we care about +TAG_KEYS_TO_CHECK: dict[str, set[str]] = {} +for (key, value), _ in OSM_TAG_MAPPING.items(): + TAG_KEYS_TO_CHECK.setdefault(key, set()).add(value) diff --git a/pipeline/processors/journey_times_aggregator.py b/pipeline/processors/journey_times_aggregator.py index ffb6b6f..5f6dfb9 100644 --- a/pipeline/processors/journey_times_aggregator.py +++ b/pipeline/processors/journey_times_aggregator.py @@ -6,31 +6,47 @@ import polars as pl from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS, PROCESSED_DIR +JOURNEY_COLS = [ + "public_transport_easy_minutes", + "public_transport_quick_minutes", + "cycling_minutes", +] + +AGGREGATE_COLS = [ + "median_pt_easy_minutes", + "median_pt_quick_minutes", + "median_cycling_minutes", + "median_journey_minutes", +] + def aggregate_journey_times( journey_times_path: Path | None = None, postcodes_h3_path: Path | None = None, - output_dir: Path | None = None, + aggregates_dir: Path | None = None, ) -> list[Path]: """ - Aggregate journey times by H3 cells at all resolutions. + Add journey times to existing H3 aggregate parquet files. - Joins journey_times_bank.parquet with postcodes_h3.parquet on postcode, - then groups by H3 cell to compute median journey time. + Joins journey_times_bank_checkpoint.parquet with postcodes_h3.parquet on postcode, + aggregates by H3 cell, then merges into existing res{N}.parquet files. """ - journey_times_path = journey_times_path or PROCESSED_DIR / "journey_times_bank.parquet" + journey_times_path = ( + journey_times_path + or PROCESSED_DIR / "journey_times_bank_checkpoint.parquet" + ) postcodes_h3_path = postcodes_h3_path or PROCESSED_DIR / "postcodes_h3.parquet" - output_dir = output_dir or AGGREGATES_DIR - - output_dir.mkdir(parents=True, exist_ok=True) + aggregates_dir = aggregates_dir or AGGREGATES_DIR # Load journey times data journey_df = pl.read_parquet(journey_times_path).select( - ["postcode", "public_transport_minutes"] + ["postcode"] + JOURNEY_COLS ) - # Filter out null journey times - journey_df = journey_df.filter(pl.col("public_transport_minutes").is_not_null()) + # Filter out rows where all journey time columns are null + journey_df = journey_df.filter( + pl.any_horizontal(pl.col(c).is_not_null() for c in JOURNEY_COLS) + ) if journey_df.height == 0: print("No valid journey times found") @@ -48,31 +64,63 @@ def aggregate_journey_times( print(f"Joined {joined_df.height} postcodes with journey times") - saved_paths = [] + updated_paths = [] for resolution in H3_RESOLUTIONS: h3_col = f"h3_res{resolution}" + parquet_path = aggregates_dir / f"res{resolution}.parquet" + + if not parquet_path.exists(): + print(f"Skipping resolution {resolution} - {parquet_path} not found") + continue if h3_col not in joined_df.columns: print(f"Skipping resolution {resolution} - column {h3_col} not found") continue - # Aggregate by H3 cell - compute median journey time - agg_df = ( + # Aggregate journey times by H3 cell + journey_agg = ( joined_df.group_by(h3_col) .agg( - pl.col("public_transport_minutes").median().alias("median_journey_minutes"), - pl.col("public_transport_minutes").count().alias("journey_count"), + pl.col("public_transport_easy_minutes") + .median() + .alias("median_pt_easy_minutes"), + pl.col("public_transport_quick_minutes") + .median() + .alias("median_pt_quick_minutes"), + pl.col("cycling_minutes") + .median() + .alias("median_cycling_minutes"), + pl.col("public_transport_quick_minutes") + .median() + .alias("median_journey_minutes"), ) .rename({h3_col: "h3"}) ) - output_path = output_dir / f"journey_times_res{resolution}.parquet" - agg_df.write_parquet(output_path) - saved_paths.append(output_path) - print(f"Saved {agg_df.height} cells to {output_path}") + # Load existing parquet + existing_df = pl.read_parquet(parquet_path) - return saved_paths + # Drop existing journey time columns if present + existing_df = existing_df.drop( + [c for c in AGGREGATE_COLS if c in existing_df.columns] + ) + + # Left join journey times onto existing data + updated_df = existing_df.join(journey_agg, on="h3", how="left") + + # Save back to parquet + updated_df.write_parquet(parquet_path) + updated_paths.append(parquet_path) + matched = updated_df.filter( + pl.col("median_journey_minutes").is_not_null() + ).height + print( + f"Updated {parquet_path.name}: {matched} rows with journey times " + f"(out of {updated_df.height} total)" + ) + + return updated_paths if __name__ == "__main__": diff --git a/pipeline/run.py b/pipeline/run.py index 3502ff2..630a892 100644 --- a/pipeline/run.py +++ b/pipeline/run.py @@ -5,6 +5,7 @@ import polars as pl from pipeline.sources.postcodes import save_postcodes from pipeline.sources.property_prices import PropertyPricesSource from pipeline.processors.h3_aggregator import save_aggregates +from pipeline.processors.journey_times_aggregator import aggregate_journey_times def run_pipeline(): @@ -14,22 +15,31 @@ def run_pipeline(): print("=" * 60) # Step 1: Process postcodes with H3 indices - print("\n[1/3] Processing postcodes with H3 indices...") + print("\n[1/4] Processing postcodes with H3 indices...") postcodes_path = save_postcodes() print(f" Saved: {postcodes_path}") - print("\n[2/3] Processing property prices...") + print("\n[2/4] Processing property prices...") postcodes = pl.scan_parquet(postcodes_path) property_source = PropertyPricesSource() properties = property_source.process(postcodes) print(" Joined property prices with postcodes") - print("\n[3/3] Aggregating at H3 resolutions...") + print("\n[3/4] Aggregating at H3 resolutions...") saved_paths = save_aggregates(properties) for path in saved_paths: size_mb = path.stat().st_size / (1024 * 1024) print(f" Saved: {path.name} ({size_mb:.1f} MB)") + print("\n[4/4] Adding journey times to aggregates...") + updated_paths = aggregate_journey_times() + if updated_paths: + for path in updated_paths: + size_mb = path.stat().st_size / (1024 * 1024) + print(f" Updated: {path.name} ({size_mb:.1f} MB)") + else: + print(" Skipped (no journey time data found)") + if __name__ == "__main__": run_pipeline() diff --git a/server/routes/hexagons.py b/server/routes/hexagons.py index 987c749..afe3777 100644 --- a/server/routes/hexagons.py +++ b/server/routes/hexagons.py @@ -77,14 +77,28 @@ def query_hexagons_cached( # Filter by year range df = df.filter((pl.col("year") >= min_year) & (pl.col("year") <= max_year)) + # Check which journey time columns exist + journey_cols = [ + "median_journey_minutes", + "median_pt_easy_minutes", + "median_pt_quick_minutes", + "median_cycling_minutes", + ] + available_journey_cols = [c for c in journey_cols if c in df.columns] + # Aggregate across years (weighted by count) - df = df.group_by("h3").agg( + agg_exprs = [ pl.col("count").sum().alias("count"), (pl.col("avg_price") * pl.col("count")).sum().alias("weighted_price_sum"), pl.col("median_price").median().alias("median_price"), pl.col("min_price").min().alias("min_price"), pl.col("max_price").max().alias("max_price"), - ) + ] + for jc in available_journey_cols: + # Journey time is same across years, just take first non-null + agg_exprs.append(pl.col(jc).first()) + + df = df.group_by("h3").agg(agg_exprs) # Calculate weighted average price df = df.with_columns( @@ -97,16 +111,18 @@ def query_hexagons_cached( ) # Build response efficiently using Polars - df = df.select( - [ - pl.col("h3"), - pl.col("count"), - pl.col("avg_price").round(2), - pl.col("median_price").round(2), - pl.col("min_price"), - pl.col("max_price"), - ] - ) + select_cols = [ + pl.col("h3"), + pl.col("count"), + pl.col("avg_price").round(2), + pl.col("median_price").round(2), + pl.col("min_price"), + pl.col("max_price"), + ] + for jc in available_journey_cols: + select_cols.append(pl.col(jc).round(0)) + + df = df.select(select_cols) return df.to_dicts() diff --git a/server/routes/pois.py b/server/routes/pois.py index d7e0e58..fcc225b 100644 --- a/server/routes/pois.py +++ b/server/routes/pois.py @@ -1,9 +1,5 @@ """POI (Points of Interest) API endpoint.""" -import os - -os.environ["POLARS_UNKNOWN_EXTENSION_TYPE_BEHAVIOR"] = "load_as_storage" - from pathlib import Path from fastapi import APIRouter, Query @@ -13,36 +9,190 @@ router = APIRouter() DATA_FILE = Path("data_sources/uk_pois.parquet") -# Categories useful for property buyers -POI_CATEGORIES = { - "schools": [ - "elementary_school", - "school", - "high_school", - "preschool", - "college_university", - "private_school", - ], - "healthcare": [ - "doctor", - "dentist", - "pharmacy", - "hospital", - "public_health_clinic", - ], - "transport": [ - "train_station", - "bus_station", - "metro_station", - "light_rail_and_subway_stations", - ], - "parks": ["park", "national_park", "dog_park"], - "emergency": ["police_department", "fire_department"], - "supermarkets": ["supermarket", "grocery_store", "convenience_store"], +# Category groups with emoji and member categories +POI_CATEGORY_GROUPS: dict[str, dict] = { + "schools": { + "emoji": "๐Ÿซ", + "label": "Schools", + "categories": ["school", "preschool", "college_university", "library"], + }, + "healthcare": { + "emoji": "๐Ÿฅ", + "label": "Healthcare", + "categories": [ + "doctor", + "dentist", + "pharmacy", + "hospital", + "public_health_clinic", + "veterinary", + "nursing_home", + "social_facility", + ], + }, + "transport": { + "emoji": "๐Ÿš‰", + "label": "Transport", + "categories": [ + "train_station", + "bus_station", + "bus_stop", + "metro_station", + "light_rail_station", + "tram_stop", + "ferry_terminal", + "airport", + ], + }, + "parks": { + "emoji": "๐ŸŒณ", + "label": "Parks & Leisure", + "categories": [ + "park", + "national_park", + "nature_reserve", + "dog_park", + "playground", + "garden", + "sports_centre", + "swimming_pool", + "gym", + "golf_course", + "marina", + ], + }, + "emergency": { + "emoji": "๐Ÿšจ", + "label": "Emergency", + "categories": ["police_department", "fire_department"], + }, + "supermarkets": { + "emoji": "๐Ÿ›’", + "label": "Supermarkets & Grocery", + "categories": [ + "supermarket", + "grocery_store", + "convenience_store", + "bakery", + "butcher", + "greengrocer", + "deli", + ], + }, + "shopping": { + "emoji": "๐Ÿ›๏ธ", + "label": "Shopping", + "categories": [ + "department_store", + "clothing_store", + "shoe_store", + "electronics_store", + "hardware_store", + "furniture_store", + "bookshop", + "newsagent", + "charity_shop", + "shopping_centre", + "optician", + "off_licence", + ], + }, + "food_drink": { + "emoji": "๐Ÿฝ๏ธ", + "label": "Food & Drink", + "categories": [ + "restaurant", + "cafe", + "pub", + "bar", + "fast_food", + "food_court", + "ice_cream", + "beer_garden", + ], + }, + "personal_care": { + "emoji": "๐Ÿ’‡", + "label": "Personal Care", + "categories": [ + "hairdresser", + "beauty_salon", + "laundry", + "dry_cleaning", + ], + }, + "finance": { + "emoji": "๐Ÿฆ", + "label": "Finance", + "categories": ["bank", "atm", "bureau_de_change"], + }, + "entertainment": { + "emoji": "๐ŸŽญ", + "label": "Entertainment & Culture", + "categories": [ + "cinema", + "theatre", + "nightclub", + "community_centre", + "arts_centre", + "museum", + "gallery", + "attraction", + "zoo", + "theme_park", + "viewpoint", + ], + }, + "accommodation": { + "emoji": "๐Ÿจ", + "label": "Accommodation", + "categories": [ + "hotel", + "hostel", + "guest_house", + "campsite", + "caravan_site", + ], + }, + "religion": { + "emoji": "๐Ÿ›", + "label": "Places of Worship", + "categories": ["place_of_worship"], + }, + "government": { + "emoji": "๐Ÿ›๏ธ", + "label": "Government & Public", + "categories": [ + "town_hall", + "courthouse", + "post_office", + "prison", + "public_toilets", + ], + }, + "automotive": { + "emoji": "โ›ฝ", + "label": "Automotive", + "categories": [ + "petrol_station", + "ev_charging", + "car_dealer", + "car_repair", + "parking", + "bicycle_parking", + ], + }, + "recycling": { + "emoji": "โ™ป๏ธ", + "label": "Recycling & Waste", + "categories": ["recycling", "waste_disposal"], + }, } # Flatten for quick lookup -ALL_CATEGORIES = {cat for cats in POI_CATEGORIES.values() for cat in cats} +ALL_CATEGORIES = { + cat for group in POI_CATEGORY_GROUPS.values() for cat in group["categories"] +} # Cache the dataframe _df_cache: pl.DataFrame | None = None @@ -55,14 +205,9 @@ def get_df() -> pl.DataFrame | None: if not DATA_FILE.exists(): return None df = pl.read_parquet(DATA_FILE) - # Extract fields we need and filter to relevant categories - _df_cache = df.select( - pl.col("id"), - pl.col("names").struct.field("primary").alias("name"), - pl.col("categories").struct.field("primary").alias("category"), - pl.col("bbox").struct.field("xmin").alias("lng"), - pl.col("bbox").struct.field("ymin").alias("lat"), - ).filter(pl.col("category").is_in(ALL_CATEGORIES)) + _df_cache = df.select("id", "name", "category", "lat", "lng").filter( + pl.col("category").is_in(ALL_CATEGORIES) + ) return _df_cache @@ -83,23 +228,20 @@ async def get_pois( if df is None: return {"features": []} - # Parse bounds try: south, west, north, east = map(float, bounds.split(",")) except ValueError: return {"features": []} - # Get categories to include requested_groups = [g.strip() for g in categories.split(",")] cats_to_include = set() for group in requested_groups: - if group in POI_CATEGORIES: - cats_to_include.update(POI_CATEGORIES[group]) + if group in POI_CATEGORY_GROUPS: + cats_to_include.update(POI_CATEGORY_GROUPS[group]["categories"]) if not cats_to_include: return {"features": []} - # Filter by bounds and categories filtered = df.filter( (pl.col("lat") >= south) & (pl.col("lat") <= north) @@ -108,7 +250,6 @@ async def get_pois( & (pl.col("category").is_in(cats_to_include)) ) - # Limit results to avoid overwhelming the frontend MAX_POIS = 5000 if len(filtered) > MAX_POIS: filtered = filtered.sample(n=MAX_POIS, seed=42) @@ -118,5 +259,10 @@ async def get_pois( @router.get("/poi-categories") async def get_poi_categories() -> dict: - """Get available POI category groups.""" - return {"categories": list(POI_CATEGORIES.keys())} + """Get available POI category groups with emoji and labels.""" + return { + "categories": { + key: {"emoji": group["emoji"], "label": group["label"]} + for key, group in POI_CATEGORY_GROUPS.items() + } + }