From c995f12f8b0735bd9ddb673aea1a9b147c638853 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Thu, 28 May 2026 21:48:35 +0100 Subject: [PATCH] vibes --- .gitignore | 2 + Makefile.data | 56 +- README.md | 5 +- finder/onthemarket.py | 16 +- finder/storage.py | 22 + finder/test_transform.py | 48 + finder/transform.py | 52 +- finder/zoopla.py | 35 +- frontend/src/App.tsx | 2 + frontend/src/components/learn/LearnPage.tsx | 2 +- frontend/src/components/map/AreaPane.tsx | 146 +- .../map/JourneyInstructions.test.tsx | 51 +- .../components/map/JourneyInstructions.tsx | 39 +- frontend/src/components/map/Map.tsx | 219 ++- frontend/src/components/map/MapPage.tsx | 55 +- frontend/src/components/map/OverlayPane.tsx | 89 +- .../src/components/map/TravelTimeCard.tsx | 18 +- .../filters/ElectionVoteShareFilterCard.tsx | 1 + .../map/filters/EnumFeatureFilterCard.tsx | 4 +- .../map/filters/EthnicityFilterCard.tsx | 1 + .../map/filters/NumericFeatureFilterCard.tsx | 8 +- .../map/filters/PoiDistanceFilterCard.tsx | 8 +- .../map/filters/SchoolFilterCard.tsx | 8 +- .../map/filters/SpecificCrimeFilterCard.tsx | 1 + .../map/map-page/DesktopMapPage.tsx | 10 +- .../components/map/map-page/MobileMapPage.tsx | 6 +- .../map/map-page/ScreenshotMapPage.tsx | 4 + frontend/src/components/map/map-page/types.ts | 2 + frontend/src/components/ui/FeatureLabel.tsx | 17 +- frontend/src/hooks/useDeckLayers.ts | 16 +- .../src/hooks/useHexagonSelection.test.ts | 229 ++- frontend/src/hooks/useHexagonSelection.ts | 108 +- frontend/src/hooks/useListingLayers.ts | 416 +++++- frontend/src/hooks/useMapData.test.ts | 1 + frontend/src/hooks/useMapData.ts | 4 + frontend/src/hooks/useUrlSync.ts | 8 +- frontend/src/i18n/details.ts | 10 +- frontend/src/i18n/locales/de.ts | 4 +- frontend/src/i18n/locales/en.ts | 8 +- frontend/src/i18n/locales/fr.ts | 4 +- frontend/src/i18n/locales/hi.ts | 4 +- frontend/src/i18n/locales/hu.ts | 4 +- frontend/src/i18n/locales/zh.ts | 4 +- frontend/src/lib/basemaps.ts | 19 + 
frontend/src/lib/consts.ts | 2 +- frontend/src/lib/map-utils.test.ts | 30 +- frontend/src/lib/map-utils.ts | 156 ++- frontend/src/lib/overlays.ts | 7 + frontend/src/lib/poi-distance-filter.test.ts | 19 +- frontend/src/lib/poi-distance-filter.ts | 14 + frontend/src/lib/url-state.test.ts | 24 + frontend/src/lib/url-state.ts | 15 +- frontend/src/types.ts | 1 + pipeline/download/conservation_areas.py | 54 +- pipeline/download/inspire.py | 179 ++- pipeline/download/listed_buildings.py | 26 +- pipeline/download/satellite_tiles.py | 432 ++++++ pipeline/download/test_inspire.py | 61 + pipeline/download/test_satellite_tiles.py | 97 ++ pipeline/transform/enrich_actual_listings.py | 960 ------------- pipeline/transform/merge.py | 1225 ++++++++++++++++- .../transform/postcode_boundaries/README.md | 6 +- .../transform/postcode_boundaries/__init__.py | 7 +- .../postcode_boundaries/process_oa.py | 246 +++- .../test_postcode_boundaries.py | 195 +++ .../transform/postcode_boundaries/voronoi.py | 15 +- .../transform/test_enrich_actual_listings.py | 143 -- pipeline/transform/test_merge.py | 479 ++++++- server-rs/src/data/poi.rs | 3 +- server-rs/src/features.rs | 2 +- server-rs/src/main.rs | 20 + server-rs/src/parsing.rs | 1 + server-rs/src/routes/actual_listings.rs | 113 +- server-rs/src/routes/export.rs | 22 +- server-rs/src/routes/overlays.rs | 2 + server-rs/src/routes/postcode_properties.rs | 22 +- server-rs/src/routes/properties.rs | 96 +- server-rs/src/routes/shorten.rs | 9 + 78 files changed, 4830 insertions(+), 1619 deletions(-) create mode 100644 finder/test_transform.py create mode 100644 frontend/src/lib/basemaps.ts create mode 100644 pipeline/download/satellite_tiles.py create mode 100644 pipeline/download/test_inspire.py create mode 100644 pipeline/download/test_satellite_tiles.py delete mode 100644 pipeline/transform/enrich_actual_listings.py delete mode 100644 pipeline/transform/test_enrich_actual_listings.py diff --git a/.gitignore b/.gitignore index d3f856f..4b1aeaa 
100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ video/auth.* r5-java/tmp property-data +property-data2 +property-data3 diff --git a/Makefile.data b/Makefile.data index 0c2e772..9d01f88 100644 --- a/Makefile.data +++ b/Makefile.data @@ -13,6 +13,7 @@ FINDER_DATA := ./finder/data # ── Output files ────────────────────────────────────────────────────────────── TILES := $(DATA_DIR)/uk.pmtiles +SATELLITE_TILES := $(DATA_DIR)/satellite.pmtiles ARCGIS := $(DATA_DIR)/arcgis_data.parquet PRICE_PAID := $(DATA_DIR)/price-paid-complete.parquet IOD := $(DATA_DIR)/IoD2025_Scores.parquet @@ -44,7 +45,7 @@ OFSTED := $(DATA_DIR)/ofsted.parquet GIAS := $(DATA_DIR)/gias.parquet NAPTAN := $(DATA_DIR)/naptan.parquet BROADBAND := $(DATA_DIR)/broadband.parquet -CONSERVATION_AREAS := $(DATA_DIR)/conservation_areas.gpkg +CONSERVATION_AREAS := $(DATA_DIR)/conservation_areas.geojson LISTED_BUILDINGS := $(DATA_DIR)/listed_buildings.gpkg SCHOOL_PROX := $(DATA_DIR)/school_proximity.parquet RENTAL := $(DATA_DIR)/rental_prices.parquet @@ -77,6 +78,8 @@ INSPIRE_STAMP := $(INSPIRE_DIR)/.done MAP_ASSETS_STAMP := $(MAP_ASSETS_DIR)/.done PMTILES_VERSION := 1.22.3 +PMTILES_BIN := $(DATA_DIR)/pmtiles +SATELLITE_TILE_ARGS ?= VALIDATE_OUTPUTS := uv run python -m pipeline.validate_outputs @@ -101,7 +104,7 @@ MAP_ASSETS_DEPS := pipeline/download/map_assets.py pipeline/transform/transform_ # ── Phony aliases ───────────────────────────────────────────────────────────── -.PHONY: prepare merge tiles overlay-tiles noise-overlay-tiles crime-hotspot-tiles tree-overlay-tiles \ +.PHONY: prepare merge tiles satellite-tiles overlay-tiles noise-overlay-tiles crime-hotspot-tiles tree-overlay-tiles \ download-arcgis download-price-paid download-deprivation download-ethnicity \ download-naptan download-pois download-grocery-retail-points download-ofsted download-gias download-broadband download-conservation-areas download-listed-buildings download-rental-prices \ download-postcodes download-noise 
download-inspire download-crime \ @@ -111,12 +114,13 @@ MAP_ASSETS_DEPS := pipeline/download/map_assets.py pipeline/transform/transform_ transform-school-proximity transform-tree-density \ generate-postcode-boundaries generate-travel-times enrich-actual-listings -prepare: $(PRICES_STAMP) download-places tiles overlay-tiles generate-postcode-boundaries download-map-assets generate-travel-times | $(POSTCODES_PQ) $(PROPERTIES_PQ) $(PRICE_INDEX) +prepare: $(PRICES_STAMP) download-places tiles satellite-tiles overlay-tiles generate-postcode-boundaries download-map-assets generate-travel-times | $(POSTCODES_PQ) $(PROPERTIES_PQ) $(PRICE_INDEX) $(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --parquet $(PRICE_INDEX) merge: $(MERGE_STAMP) | $(POSTCODES_PQ) $(PROPERTIES_PQ) $(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) enrich-actual-listings: $(ACTUAL_LISTINGS_ENRICHED) -tiles: $(TILES) +tiles: $(TILES) $(SATELLITE_TILES) +satellite-tiles: $(SATELLITE_TILES) overlay-tiles: noise-overlay-tiles crime-hotspot-tiles tree-overlay-tiles noise-overlay-tiles: $(NOISE_OVERLAY_TILES) crime-hotspot-tiles: $(CRIME_HOTSPOT_TILES) @@ -183,9 +187,15 @@ generate-travel-times: $(ARCGIS) $(PLACES) $(PBF) download-transit-network # ── Downloads ───────────────────────────────────────────────────────────────── -$(TILES): +$(PMTILES_BIN): pipeline/download/tiles.py + uv run python -c 'from pathlib import Path; from pipeline.download.tiles import ensure_pmtiles_cli; ensure_pmtiles_cli(Path("$(PMTILES_BIN)"), "$(PMTILES_VERSION)")' + +$(TILES): $(PMTILES_BIN) uv run -m pipeline.download.tiles --output $@ --pmtiles-version $(PMTILES_VERSION) +$(SATELLITE_TILES): $(PMTILES_BIN) pipeline/download/satellite_tiles.py pipeline/download/tiles.py + uv run python -m pipeline.download.satellite_tiles --output $@ --pmtiles-bin $(PMTILES_BIN) --pmtiles-version $(PMTILES_VERSION) $(SATELLITE_TILE_ARGS) + # EPC requires manual registration — fail with 
instructions $(EPC): @echo "" @@ -260,8 +270,8 @@ $(POSTCODES_RAW): $(NOISE): $(ARCGIS) pipeline/download/noise.py uv run python -m pipeline.download.noise --arcgis $(ARCGIS) --output $@ -$(NOISE_OVERLAY_TILES): pipeline/transform/noise_overlay_tiles.py pipeline/download/noise.py pipeline/download/tiles.py - uv run python -m pipeline.transform.noise_overlay_tiles --output $@ --raster-dir $(DATA_DIR)/noise_overlay_rasters --pmtiles-bin $(DATA_DIR)/pmtiles --pmtiles-version $(PMTILES_VERSION) +$(NOISE_OVERLAY_TILES): $(PMTILES_BIN) pipeline/transform/noise_overlay_tiles.py pipeline/download/noise.py pipeline/download/tiles.py + uv run python -m pipeline.transform.noise_overlay_tiles --output $@ --raster-dir $(DATA_DIR)/noise_overlay_rasters --pmtiles-bin $(PMTILES_BIN) --pmtiles-version $(PMTILES_VERSION) $(CRIME_HOTSPOT_TILES): $(CRIME_STAMP) pipeline/transform/crime_hotspot_tiles.py pipeline/transform/crime.py uv run python -m pipeline.transform.crime_hotspot_tiles --input $(CRIME_DIR) --output $@ @@ -409,12 +419,30 @@ $(PRICES_STAMP): $(MERGE_STAMP) $(PRICE_INDEX) $(PRICE_ESTIMATE_DEPS) | $(PROPER $(VALIDATE_OUTPUTS) --parquet $(PROPERTIES_PQ) --parquet $(POSTCODES_PQ) --parquet $(PRICE_INDEX) @touch $@ -$(ACTUAL_LISTINGS_ENRICHED): $(ACTUAL_LISTINGS_RAW) $(PRICES_STAMP) $(POSTCODES_PQ) $(ARCGIS) $(EPC) \ - pipeline/transform/enrich_actual_listings.py pipeline/transform/join_epc_pp.py pipeline/utils/fuzzy_join.py - uv run python -m pipeline.transform.enrich_actual_listings \ - --listings $(ACTUAL_LISTINGS_RAW) \ - --properties $(PROPERTIES_PQ) \ - --postcode-features $(POSTCODES_PQ) \ +$(ACTUAL_LISTINGS_ENRICHED): $(ACTUAL_LISTINGS_RAW) $(EPC) \ + $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \ + $(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) \ + $(CONSERVATION_AREAS) $(LISTED_BUILDINGS) $(RENTAL) \ + $(LSOA_POP) $(MEDIAN_AGE) $(ELECTION) $(TREE_DENSITY_PC) \ + $(MERGE_DEPS) pipeline/utils/fuzzy_join.py + uv run python -m pipeline.transform.merge \ + 
--epc-pp $(EPC_PP) \ --arcgis $(ARCGIS) \ + --iod $(IOD) \ + --poi-proximity $(POI_PROXIMITY) \ + --ethnicity $(ETHNICITY) \ + --crime $(CRIME) \ + --noise $(NOISE) \ + --school-proximity $(SCHOOL_PROX) \ + --broadband $(BROADBAND) \ + --conservation-areas $(CONSERVATION_AREAS) \ + --listed-buildings $(LISTED_BUILDINGS) \ + --rental-prices $(RENTAL) \ + --lsoa-population $(LSOA_POP) \ + --median-age $(MEDIAN_AGE) \ + --election-results $(ELECTION) \ + --tree-density-postcodes $(TREE_DENSITY_PC) \ + --actual-listings $(ACTUAL_LISTINGS_RAW) \ --epc $(EPC) \ - --output $@ + --output-listings $@ + $(VALIDATE_OUTPUTS) --parquet $@ diff --git a/README.md b/README.md index d031c3e..ba2d10f 100644 --- a/README.md +++ b/README.md @@ -86,9 +86,8 @@ make -f Makefile.data download-places make -f Makefile.data generate-postcode-boundaries ``` -`generate-postcode-boundaries` writes to `manual-data/postcode_boundaries/`. -The running server expects the same structure under -`property-data/postcode_boundaries/`; copy or symlink it if needed. +`generate-postcode-boundaries` writes to `property-data/postcode_boundaries/`, +which is the same directory the local server expects by default. 
Travel times are built separately because they are expensive: diff --git a/finder/onthemarket.py b/finder/onthemarket.py index 7f08df3..7a96e4e 100644 --- a/finder/onthemarket.py +++ b/finder/onthemarket.py @@ -29,6 +29,8 @@ from constants import ( ) from spatial import PostcodeSpatialIndex from transform import ( + clean_listing_address, + extract_full_postcode, fix_coords, map_property_type, normalize_sub_type, @@ -177,9 +179,13 @@ def transform_property( if not (49 <= lat <= 56 and -7 <= lng <= 2): return None - postcode = pc_index.nearest(lat, lng) - if not postcode: + inferred_postcode = pc_index.nearest(lat, lng) + if not inferred_postcode: return None + raw_address = raw.get("address", "") or "" + extracted_postcode = extract_full_postcode(raw_address) + postcode = extracted_postcode or inferred_postcode + postcode_source = "address" if extracted_postcode else "coordinates" raw_beds = raw.get("bedrooms") or 0 raw_baths = raw.get("bathrooms") or 0 @@ -212,7 +218,11 @@ def transform_property( "lon": lng, "lat": lat, "Postcode": postcode, - "Address per Property Register": raw.get("address", ""), + "Postcode source": postcode_source, + "Extracted postcode": extracted_postcode, + "Inferred postcode": inferred_postcode, + "Listing raw address": raw_address, + "Address per Property Register": clean_listing_address(raw_address), "Leasehold/Freehold": _extract_tenure(features), "Property type": map_property_type(sub_type), "Property sub-type": normalize_sub_type(sub_type), diff --git a/finder/storage.py b/finder/storage.py index 3d21083..6c6822e 100644 --- a/finder/storage.py +++ b/finder/storage.py @@ -105,6 +105,24 @@ def write_parquet(properties: list[dict], path: Path) -> None: "lon": [p["lon"] for p in properties], "lat": [p["lat"] for p in properties], "Postcode": [normalize_postcode(p["Postcode"]) for p in properties], + "Postcode source": [p.get("Postcode source", "") for p in properties], + "Extracted postcode": [ + normalize_postcode(p["Extracted 
postcode"]) + if p.get("Extracted postcode") + else None + for p in properties + ], + "Inferred postcode": [ + normalize_postcode(p["Inferred postcode"]) + if p.get("Inferred postcode") + else None + for p in properties + ], + "Listing raw address": [ + p.get("Listing raw address") + or p.get("Address per Property Register", "") + for p in properties + ], "Address per Property Register": [ p["Address per Property Register"] for p in properties ], @@ -126,6 +144,10 @@ def write_parquet(properties: list[dict], path: Path) -> None: "lon": pl.Float64, "lat": pl.Float64, "Postcode": pl.Utf8, + "Postcode source": pl.Utf8, + "Extracted postcode": pl.Utf8, + "Inferred postcode": pl.Utf8, + "Listing raw address": pl.Utf8, "Address per Property Register": pl.Utf8, "Leasehold/Freehold": pl.Utf8, "Property type": pl.Utf8, diff --git a/finder/test_transform.py b/finder/test_transform.py new file mode 100644 index 0000000..c90296b --- /dev/null +++ b/finder/test_transform.py @@ -0,0 +1,48 @@ +from transform import ( + clean_listing_address, + extract_full_postcode, + transform_property, +) + + +class StubPostcodeIndex: + def nearest(self, lat: float, lng: float) -> str: + return "SW1A 9ZZ" + + +def test_extract_full_postcode_normalizes_spacing() -> None: + assert extract_full_postcode("10 Downing Street SW1A2AA") == "SW1A 2AA" + assert extract_full_postcode("10 Downing Street, SW1A 2AA") == "SW1A 2AA" + assert extract_full_postcode("Downing Street, Westminster") is None + + +def test_clean_listing_address_removes_postcode_and_outcode_suffixes() -> None: + assert clean_listing_address("10 Downing Street, SW1A 2AA") == "10 Downing Street" + assert clean_listing_address("Hawthorne Road, Bromley, Kent, BR1") == ( + "Hawthorne Road, Bromley, Kent" + ) + assert clean_listing_address("Kings Avenue, Bromley") == "Kings Avenue, Bromley" + + +def test_rightmove_transform_prefers_postcode_from_display_address() -> None: + prop = { + "id": "123", + "location": {"latitude": 51.5, 
"longitude": -0.1}, + "price": {"amount": 750000, "displayPrices": []}, + "propertySubType": "Terraced", + "bedrooms": 3, + "bathrooms": 1, + "keyFeatures": [], + "propertyUrl": "/properties/123", + "displayAddress": "Flat 2, 10 Downing Street, SW1A 2AA", + } + + result = transform_property(prop, "SW1A", StubPostcodeIndex()) + + assert result is not None + assert result["Postcode"] == "SW1A 2AA" + assert result["Postcode source"] == "address" + assert result["Extracted postcode"] == "SW1A 2AA" + assert result["Inferred postcode"] == "SW1A 9ZZ" + assert result["Listing raw address"] == "Flat 2, 10 Downing Street, SW1A 2AA" + assert result["Address per Property Register"] == "Flat 2, 10 Downing Street" diff --git a/finder/transform.py b/finder/transform.py index e210bee..8c1f357 100644 --- a/finder/transform.py +++ b/finder/transform.py @@ -14,6 +14,18 @@ log = logging.getLogger("rightmove") # UK mansions. MIN_FLOOR_AREA_SQM = 5.0 MAX_FLOOR_AREA_SQM = 2000.0 +FULL_POSTCODE_RE = re.compile( + r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", + re.IGNORECASE, +) +TRAILING_FULL_POSTCODE_RE = re.compile( + r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b\s*$", + re.IGNORECASE, +) +TRAILING_OUTCODE_RE = re.compile( + r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\b\s*$", + re.IGNORECASE, +) def validate_floor_area(sqm: float | None) -> float | None: @@ -184,6 +196,32 @@ def normalize_postcode(postcode: str) -> str: return compact[:-3] + " " + compact[-3:] +def extract_full_postcode(text: str | None) -> str | None: + if not text: + return None + match = FULL_POSTCODE_RE.search(text) + if not match: + return None + return normalize_postcode(match.group(1)) + + +def clean_listing_address(address: str | None) -> str: + """Remove postcode/outcode suffixes from listing display addresses. + + Listing sites often include "..., BR1" or "..., SW1A 1AA" in their public + address. 
Those tokens add fake address numbers to the fuzzy matcher, so keep + the raw address separately and use this cleaned value for matching. + """ + if not address: + return "" + cleaned = str(address).strip() + cleaned = TRAILING_FULL_POSTCODE_RE.sub("", cleaned) + cleaned = TRAILING_OUTCODE_RE.sub("", cleaned) + cleaned = re.sub(r"\s+", " ", cleaned) + cleaned = re.sub(r"\s*,\s*", ", ", cleaned) + return cleaned.strip(" ,") + + def transform_property( prop: dict, outcode: str, pc_index: PostcodeSpatialIndex ) -> dict | None: @@ -224,10 +262,14 @@ def transform_property( if kf.get("description") ] - postcode = pc_index.nearest(lat, lng) - if not postcode: + inferred_postcode = pc_index.nearest(lat, lng) + if not inferred_postcode: log.debug("No England postcode for property at %.4f, %.4f — skipping", lat, lng) return None + raw_address = prop.get("displayAddress", "") or "" + extracted_postcode = extract_full_postcode(raw_address) + postcode = extracted_postcode or inferred_postcode + postcode_source = "address" if extracted_postcode else "coordinates" property_url = prop.get("propertyUrl") or "" if not isinstance(property_url, str): @@ -244,7 +286,11 @@ def transform_property( "lon": lng, "lat": lat, "Postcode": postcode, - "Address per Property Register": prop.get("displayAddress", ""), + "Postcode source": postcode_source, + "Extracted postcode": extracted_postcode, + "Inferred postcode": inferred_postcode, + "Listing raw address": raw_address, + "Address per Property Register": clean_listing_address(raw_address), "Leasehold/Freehold": extract_tenure(prop.get("tenure")), "Property type": map_property_type(sub_type), "Property sub-type": normalize_sub_type(sub_type), diff --git a/finder/zoopla.py b/finder/zoopla.py index 9e8bc75..d36bc21 100644 --- a/finder/zoopla.py +++ b/finder/zoopla.py @@ -37,7 +37,13 @@ from constants import ( ZOOPLA_BASE, ) from spatial import PostcodeSpatialIndex -from transform import normalize_sub_type, parse_int_value, validate_floor_area 
+from transform import ( + clean_listing_address, + extract_full_postcode, + normalize_sub_type, + parse_int_value, + validate_floor_area, +) log = logging.getLogger("zoopla") @@ -1031,19 +1037,6 @@ def _resolve_outcode_coords( return None -def _extract_postcode(text: str) -> str | None: - """Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'. - Normalizes to include a space before the 3-char incode.""" - match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE) - if match: - raw = match.group(1).upper().strip() - # Ensure space before incode (last 3 chars): "SW1A1AA" → "SW1A 1AA" - if " " not in raw and len(raw) >= 5: - return raw[:-3] + " " + raw[-3:] - return raw - return None - - def _extract_outcode(text: str) -> str | None: """Extract a UK outcode from address text like 'Whitechapel Road, London E1'.""" # Look for outcode at end of string or after last comma @@ -1123,10 +1116,12 @@ def transform_property( from postcodes extracted from the address text.""" price = parse_int_value(raw.get("price")) or 0 - address = raw.get("address", "") + address = raw.get("address", "") or "" # Resolve postcode and coordinates from address - postcode = _extract_postcode(address) + extracted_postcode = extract_full_postcode(address) + postcode = extracted_postcode + postcode_source = "address" if extracted_postcode else None lat = lng = None if postcode: @@ -1141,12 +1136,14 @@ def transform_property( result = _resolve_outcode_coords(addr_outcode, pc_coords) if result: postcode, lat, lng = result + postcode_source = "address_outcode" # Final fallback: use the outcode we know we're searching if lat is None and search_outcode: result = _resolve_outcode_coords(search_outcode, pc_coords) if result: postcode, lat, lng = result + postcode_source = "search_outcode" if lat is None or lng is None or not postcode: return None @@ -1189,7 +1186,11 @@ def transform_property( "lon": lng, "lat": lat, "Postcode": postcode, - "Address per 
Property Register": address, + "Postcode source": postcode_source or "unknown", + "Extracted postcode": extracted_postcode, + "Inferred postcode": postcode if postcode_source != "address" else None, + "Listing raw address": address, + "Address per Property Register": clean_listing_address(address), "Leasehold/Freehold": raw.get("tenure") or None, "Property type": _map_property_type(raw.get("property_type")), "Property sub-type": normalize_sub_type(raw.get("property_type")), diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index c0d9815..0b75dfe 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -556,6 +556,7 @@ export default function App() { initialViewState={initialViewState} initialPOICategories={urlState.poiCategories} initialOverlays={urlState.overlays} + initialBasemap={urlState.basemap} initialTab={urlState.tab} initialLoading={initialLoading} theme={theme} @@ -661,6 +662,7 @@ export default function App() { initialViewState={initialViewState} initialPOICategories={mapUrlState.poiCategories} initialOverlays={mapUrlState.overlays} + initialBasemap={mapUrlState.basemap} initialTab={mapUrlState.tab} initialLoading={initialLoading} theme={theme} diff --git a/frontend/src/components/learn/LearnPage.tsx b/frontend/src/components/learn/LearnPage.tsx index 5262129..bfe4720 100644 --- a/frontend/src/components/learn/LearnPage.tsx +++ b/frontend/src/components/learn/LearnPage.tsx @@ -65,7 +65,7 @@ const DATA_SOURCE_DEFS: DataSourceDef[] = [ }, { id: 'conservation-areas', - url: 'https://opendata-historicengland.hub.arcgis.com/datasets/historicengland::conservation-areas/explore', + url: 'https://www.planning.data.gov.uk/dataset/conservation-area', license: 'Open Government Licence v3.0', }, { diff --git a/frontend/src/components/map/AreaPane.tsx b/frontend/src/components/map/AreaPane.tsx index aea2a48..35b2fd1 100644 --- a/frontend/src/components/map/AreaPane.tsx +++ b/frontend/src/components/map/AreaPane.tsx @@ -19,6 +19,10 @@ import { } from 
'../../lib/format'; import { groupFeaturesByCategory } from '../../lib/features'; import { getPoiCategoryLogoUrl } from '../../lib/map-utils'; +import { + getActiveAmenityFilterFeatureNames, + isPoiFilterFeatureName, +} from '../../lib/poi-distance-filter'; import { PARTY_FEATURE_COLORS, STACKED_GROUPS, @@ -88,7 +92,7 @@ const STATION_GROUP_NAMES = new Set([STATION_GROUP_NAME, 'Public Transport']); function MetricTextLabel({ children }: { children: ReactNode }) { return ( - + {children} ); @@ -106,7 +110,7 @@ function MetricFeatureLabel({ aboutLabel: string; }) { return ( -
+
{label ?? ts(feature.name)} {feature.detail && (
-
+
{t('areaPane.statsBasis')} -
+
- - {listingPopup.listing.asking_price != null && ( -
- {formatListingPrice(listingPopup.listing.asking_price)} - {listingPopup.listing.price_qualifier ? ( - - {listingPopup.listing.price_qualifier} - - ) : null} -
- )} - {formatListingHeadline(listingPopup.listing, t) && ( -
- {formatListingHeadline(listingPopup.listing, t)} -
- )} - {listingPopup.listing.address && ( -
- {listingPopup.listing.address} -
- )} - {listingPopup.listing.postcode && ( -
- {listingPopup.listing.postcode} -
- )} - {listingPopup.listing.floor_area_sqm != null && ( -
- {Math.round(listingPopup.listing.floor_area_sqm)} sqm - {listingPopup.listing.asking_price_per_sqm != null - ? ` · £${Math.round(listingPopup.listing.asking_price_per_sqm).toLocaleString()}/sqm` - : ''} -
- )} - {listingPopup.listing.features.length > 0 && ( -
    - {listingPopup.listing.features.slice(0, 3).map((feature, idx) => ( -
  • - {feature} -
  • - ))} -
- )} -
- Open listing ↗ -
-
+ {listingPopup.mode === 'single' ? ( + + ) : ( + + )}
)} {hoverPosition && hoveredHexagonId && hoveredHexagonId !== selectedHexagonId && ( diff --git a/frontend/src/components/map/MapPage.tsx b/frontend/src/components/map/MapPage.tsx index 5e66f2a..3f84dec 100644 --- a/frontend/src/components/map/MapPage.tsx +++ b/frontend/src/components/map/MapPage.tsx @@ -27,8 +27,14 @@ import { useFilterCounts } from '../../hooks/useFilterCounts'; import { trackEvent } from '../../lib/analytics'; import { INITIAL_VIEW_STATE, POSTCODE_ZOOM_THRESHOLD } from '../../lib/consts'; import type { OverlayId } from '../../lib/overlays'; +import type { BasemapId } from '../../lib/basemaps'; import { useLicense } from '../../hooks/useLicense'; import { stateToParams } from '../../lib/url-state'; +import { groupFeaturesByCategory } from '../../lib/features'; +import { + getActiveAmenityFilterFeatureNames, + isPoiFilterFeatureName, +} from '../../lib/poi-distance-filter'; import { AreaPane, Filters, @@ -74,6 +80,7 @@ export default function MapPage({ initialViewState, initialPOICategories, initialOverlays, + initialBasemap = 'standard', initialTab, initialLoading, theme, @@ -107,6 +114,7 @@ export default function MapPage({ const [activeOverlays, setActiveOverlays] = useState>( () => new Set(initialOverlays ?? []) ); + const [basemap, setBasemap] = useState(initialBasemap); const [leftPaneWidth, leftPaneHandlers] = usePaneResize(384, 200, 0.45, 'left'); const [rightPaneWidth, rightPaneHandlers] = usePaneResize(384, 200, 0.45, 'right'); const [mobileDrawerOpen, setMobileDrawerOpen] = useState(false); @@ -229,10 +237,10 @@ export default function MapPage({ noBuses: parsed.noBuses, slug: tt.slug, label: tt.label, - timeRange: [ - tt.min ?? 0, - Math.min(tt.max ?? MAX_TRAVEL_MINUTES, MAX_TRAVEL_MINUTES), - ] as [number, number], + timeRange: [tt.min ?? 0, Math.min(tt.max ?? 
MAX_TRAVEL_MINUTES, MAX_TRAVEL_MINUTES)] as [ + number, + number, + ], useBest: false, })) ); @@ -300,6 +308,29 @@ export default function MapPage({ const filterCounts = useFilterCounts(filters, features, mapData.bounds, entries, shareCode); const license = useLicense(); + const [isAreaGroupExpanded, toggleAreaGroup] = useCollapsibleGroups(true); + const activeFilterNames = useMemo(() => new Set(Object.keys(filters)), [filters]); + const activeAmenityFeatureNames = useMemo( + () => getActiveAmenityFilterFeatureNames(filters), + [filters] + ); + const areaStatsFields = useMemo( + () => + groupFeaturesByCategory(features) + .filter((group) => isAreaGroupExpanded(group.name)) + .flatMap((group) => + group.features + .filter((feature) => { + if (group.name !== 'Amenities') return true; + if (isPoiFilterFeatureName(feature.name)) { + return activeAmenityFeatureNames.has(feature.name); + } + return activeFilterNames.has(feature.name); + }) + .map((feature) => feature.name) + ), + [activeAmenityFeatureNames, activeFilterNames, features, isAreaGroupExpanded] + ); const handleTravelTimeSetDestination = useCallback( (index: number, slug: string, label: string, _lat: number, _lon: number) => { @@ -338,6 +369,7 @@ export default function MapPage({ resolution: mapData.resolution, usePostcodeView: mapData.usePostcodeView, travelTimeEntries: entries, + areaStatsFields, shareCode, journeyDest, }); @@ -452,7 +484,7 @@ export default function MapPage({ const actualListingsTravelParam = useMemo(() => buildTravelParam(entries), [entries]); const actualListingsEnabled = !__DEV__ || devActualListingsEnabled; const { listings: actualListings } = useActualListings( - actualListingsEnabled ? mapData.bounds : null, + actualListingsEnabled ? 
mapData.visibleBounds : null, { filterParam: actualListingsFilterParam, travelParam: actualListingsTravelParam, @@ -464,7 +496,6 @@ export default function MapPage({ if (!__DEV__) return; setDevActualListingsEnabled((enabled) => !enabled); }, []); - const [isAreaGroupExpanded, toggleAreaGroup] = useCollapsibleGroups(true); useUrlSync( mapData.currentView, @@ -474,7 +505,8 @@ export default function MapPage({ rightPaneTab, entries, shareCode, - activeOverlays + activeOverlays, + basemap ); useInitialMapPageView(mapData, initialViewState, initialTab, setRightPaneTab); @@ -548,10 +580,12 @@ export default function MapPage({ rightPaneTab, entries, shareCode, - activeOverlays + activeOverlays, + basemap ).toString(), [ activeOverlays, + basemap, entries, features, filters, @@ -596,6 +630,7 @@ export default function MapPage({ ogMode={ogMode} travelTimeEntries={entries} activeOverlays={activeOverlays} + basemap={basemap} /> ); } @@ -656,6 +691,8 @@ export default function MapPage({ setOverlayPaneOpen(false)} /> @@ -790,6 +827,7 @@ export default function MapPage({ mapData={mapData} pois={pois} activeOverlays={activeOverlays} + basemap={basemap} mapViewFeature={mapViewFeature} filterRange={filterRange} viewSource={viewSource} @@ -860,6 +898,7 @@ export default function MapPage({ mapData={mapData} pois={pois} activeOverlays={activeOverlays} + basemap={basemap} mapViewFeature={mapViewFeature} filterRange={filterRange} viewSource={viewSource} diff --git a/frontend/src/components/map/OverlayPane.tsx b/frontend/src/components/map/OverlayPane.tsx index 26eb041..39eddd2 100644 --- a/frontend/src/components/map/OverlayPane.tsx +++ b/frontend/src/components/map/OverlayPane.tsx @@ -1,11 +1,16 @@ -import { OVERLAYS, type OverlayId } from '../../lib/overlays'; -import { PillGroup } from '../ui/PillGroup'; +import { useState } from 'react'; +import { BASEMAPS, type BasemapId } from '../../lib/basemaps'; +import { OVERLAYS, type OverlayDefinition, type OverlayId } from 
'../../lib/overlays'; import { PillToggle } from '../ui/PillToggle'; -import { CloseIcon } from '../ui/icons'; +import { IconButton } from '../ui/IconButton'; +import InfoPopup from '../ui/InfoPopup'; +import { CloseIcon, InfoIcon } from '../ui/icons'; interface OverlayPaneProps { selectedOverlays: Set; onOverlaysChange: (overlays: Set) => void; + basemap: BasemapId; + onBasemapChange: (basemap: BasemapId) => void; zoomedIn: boolean; onClose?: () => void; } @@ -13,9 +18,13 @@ interface OverlayPaneProps { export default function OverlayPane({ selectedOverlays, onOverlaysChange, + basemap, + onBasemapChange, zoomedIn, onClose, }: OverlayPaneProps) { + const [infoOverlay, setInfoOverlay] = useState(null); + const toggleOverlay = (overlay: OverlayId) => { const next = new Set(selectedOverlays); if (next.has(overlay)) { @@ -28,6 +37,8 @@ export default function OverlayPane({ const selectNone = () => onOverlaysChange(new Set()); + const showZoomWarning = !zoomedIn && selectedOverlays.size > 0; + return (
@@ -56,26 +67,68 @@ export default function OverlayPane({ )}
- {!zoomedIn && ( -
- Zoom in to view overlays. + {showZoomWarning && ( +
+ Zoom in further to see the selected{' '} + {selectedOverlays.size === 1 ? 'overlay' : 'overlays'}.
)}
-
- - {OVERLAYS.map((overlay) => ( - toggleOverlay(overlay.id)} - size="sm" - /> - ))} - +
+
+
+ Base map +
+
+ {BASEMAPS.map((option) => ( + onBasemapChange(option.id)} + size="sm" + /> + ))} +
+
+ +
+
+ Data overlays +
+
+ {OVERLAYS.map((overlay) => ( +
+ toggleOverlay(overlay.id)} + size="sm" + /> + setInfoOverlay(overlay)} + title={`About ${overlay.label}`} + ariaLabel={`About ${overlay.label}`} + > + + +
+ ))} +
+
+ + {infoOverlay && ( + setInfoOverlay(null)}> +

+ {infoOverlay.detail} +

+
+ )}
); } diff --git a/frontend/src/components/map/TravelTimeCard.tsx b/frontend/src/components/map/TravelTimeCard.tsx index 447c045..646a2ce 100644 --- a/frontend/src/components/map/TravelTimeCard.tsx +++ b/frontend/src/components/map/TravelTimeCard.tsx @@ -93,14 +93,14 @@ export function TravelTimeCard({ className={`space-y-2 px-2 py-2 rounded ${isActive ? 'ring-2 ring-teal-400 bg-teal-50 dark:bg-teal-900/30' : isPinned ? 'ring-2 ring-teal-400 bg-teal-50/50 dark:bg-teal-900/20' : ''}`} > {/* Header */} -
-
- - +
+
+ + {t('travel.travelTime', { mode: modes.label(mode) })}
-
+
setShowInfo(true)} title={t('filters.aboutData')} size="md"> @@ -133,8 +133,8 @@ export function TravelTimeCard({ {/* Transit-only toggles — shown when destination is set */} {slug && mode === 'transit' && ( -
-
+
+
-
+
-
+
-
- +
+
- +
- +
- + ; + basemap: BasemapId; mapViewFeature: string | null; filterRange: [number, number] | null; viewSource: 'drag' | 'eye' | null; @@ -91,6 +93,7 @@ export function DesktopMapPage({ mapData, pois, activeOverlays, + basemap, mapViewFeature, filterRange, viewSource, @@ -184,6 +187,7 @@ export function DesktopMapPage({ usePostcodeView={mapData.usePostcodeView} pois={pois} activeOverlays={activeOverlays} + basemap={basemap} onViewChange={mapData.handleViewChange} viewFeature={mapViewFeature} colorRange={mapData.colorRange} @@ -224,7 +228,9 @@ export function DesktopMapPage({ className={`flex items-center gap-2 rounded-lg bg-white px-3 py-2 shadow-lg dark:bg-warm-800 ${actualListingsEnabled ? 'text-red-600 hover:text-red-700 dark:text-red-400 dark:hover:text-red-300' : 'text-warm-500 hover:text-red-600 dark:text-warm-400 dark:hover:text-red-400'}`} > - Listings + + Listings{actualListingsEnabled ? ` (${actualListings.length})` : ''} + )}
{overlayPaneOpen && ( -
+
{overlayPane}
)} diff --git a/frontend/src/components/map/map-page/MobileMapPage.tsx b/frontend/src/components/map/map-page/MobileMapPage.tsx index e59c455..448612c 100644 --- a/frontend/src/components/map/map-page/MobileMapPage.tsx +++ b/frontend/src/components/map/map-page/MobileMapPage.tsx @@ -11,6 +11,7 @@ import type { import type { useMapData } from '../../../hooks/useMapData'; import type { TravelTimeEntry } from '../../../hooks/useTravelTime'; import type { OverlayId } from '../../../lib/overlays'; +import type { BasemapId } from '../../../lib/basemaps'; import type { SearchedLocation } from '../LocationSearch'; import MobileBottomSheet from '../MobileBottomSheet'; import { MapPinIcon } from '../../ui/icons/MapPinIcon'; @@ -30,6 +31,7 @@ interface MobileMapPageProps { mapData: MapData; pois: POI[]; activeOverlays: Set; + basemap: BasemapId; mapViewFeature: string | null; filterRange: [number, number] | null; viewSource: 'drag' | 'eye' | null; @@ -79,6 +81,7 @@ export function MobileMapPage({ mapData, pois, activeOverlays, + basemap, mapViewFeature, filterRange, viewSource, @@ -135,6 +138,7 @@ export function MobileMapPage({ usePostcodeView={mapData.usePostcodeView} pois={pois} activeOverlays={activeOverlays} + basemap={basemap} onViewChange={mapData.handleViewChange} viewFeature={mapViewFeature} colorRange={mapData.colorRange} @@ -196,7 +200,7 @@ export function MobileMapPage({
{overlayPaneOpen && ( -
+
{overlayPane}
)} diff --git a/frontend/src/components/map/map-page/ScreenshotMapPage.tsx b/frontend/src/components/map/map-page/ScreenshotMapPage.tsx index 5aa74a4..b18d2da 100644 --- a/frontend/src/components/map/map-page/ScreenshotMapPage.tsx +++ b/frontend/src/components/map/map-page/ScreenshotMapPage.tsx @@ -4,6 +4,7 @@ import type { FeatureMeta, ViewState } from '../../../types'; import type { useMapData } from '../../../hooks/useMapData'; import type { TravelTimeEntry } from '../../../hooks/useTravelTime'; import type { OverlayId } from '../../../lib/overlays'; +import type { BasemapId } from '../../../lib/basemaps'; import { MapFallback } from './Fallbacks'; import { Map } from './lazyComponents'; @@ -20,6 +21,7 @@ interface ScreenshotMapPageProps { ogMode?: boolean; travelTimeEntries: TravelTimeEntry[]; activeOverlays: Set; + basemap: BasemapId; } export function ScreenshotMapPage({ @@ -33,6 +35,7 @@ export function ScreenshotMapPage({ ogMode, travelTimeEntries, activeOverlays, + basemap, }: ScreenshotMapPageProps) { return (
@@ -43,6 +46,7 @@ export function ScreenshotMapPage({ usePostcodeView={mapData.usePostcodeView} pois={[]} activeOverlays={activeOverlays} + basemap={basemap} onViewChange={mapData.handleViewChange} viewFeature={mapViewFeature} colorRange={mapData.colorRange} diff --git a/frontend/src/components/map/map-page/types.ts b/frontend/src/components/map/map-page/types.ts index a6096b1..51d81f1 100644 --- a/frontend/src/components/map/map-page/types.ts +++ b/frontend/src/components/map/map-page/types.ts @@ -7,6 +7,7 @@ import type { } from '../../../types'; import type { TravelTimeInitial } from '../../../hooks/useTravelTime'; import type { OverlayId } from '../../../lib/overlays'; +import type { BasemapId } from '../../../lib/basemaps'; import type { Page } from '../../ui/Header'; import type { PointerEvent } from 'react'; @@ -27,6 +28,7 @@ export interface MapPageProps { initialViewState: ViewState; initialPOICategories: Set; initialOverlays?: Set; + initialBasemap?: BasemapId; initialTab: 'properties' | 'area'; initialLoading: boolean; theme: 'light' | 'dark'; diff --git a/frontend/src/components/ui/FeatureLabel.tsx b/frontend/src/components/ui/FeatureLabel.tsx index d56b371..cbdf916 100644 --- a/frontend/src/components/ui/FeatureLabel.tsx +++ b/frontend/src/components/ui/FeatureLabel.tsx @@ -13,6 +13,7 @@ interface FeatureLabelProps { description?: string; label?: string; hideIconOnMobile?: boolean; + wrap?: boolean; } export function FeatureLabel({ @@ -23,10 +24,12 @@ export function FeatureLabel({ description, label, hideIconOnMobile, + wrap = false, }: FeatureLabelProps) { const { t } = useTranslation(); const textClass = size === 'sm' ? 'text-sm' : 'text-xs'; const gapClass = size === 'sm' ? 'gap-2' : 'gap-1'; + const alignmentClass = wrap ? 'items-start' : size === 'xs' ? 'items-center' : 'items-start'; const mobileHide = hideIconOnMobile ? 
'hidden md:block ' : ''; const iconClass = `${mobileHide}w-3.5 h-3.5 text-teal-600 dark:text-teal-400 shrink-0`; const featureIcon = getFeatureIcon(feature.name, iconClass); @@ -38,7 +41,11 @@ export function FeatureLabel({ const nameContent = ( <> {translatedName} @@ -56,14 +63,14 @@ export function FeatureLabel({ ); return ( -
+
{featureIcon} {GroupIcon && } {translatedDesc ? (
-
{nameContent}
+
+ {nameContent} +
{translatedDesc}
) : ( diff --git a/frontend/src/hooks/useDeckLayers.ts b/frontend/src/hooks/useDeckLayers.ts index 33c1e24..5df8f68 100644 --- a/frontend/src/hooks/useDeckLayers.ts +++ b/frontend/src/hooks/useDeckLayers.ts @@ -46,6 +46,7 @@ interface UseDeckLayersProps { currentLocation?: { lat: number; lng: number } | null; bounds?: Bounds | null; travelTimeEntries?: TravelTimeEntry[]; + mapDataBeforeId: string; } /** Normalize a distribution count array to [0..1] ratios, padded to 10 values. */ @@ -88,6 +89,7 @@ export function useDeckLayers({ currentLocation, bounds: viewportBounds, travelTimeEntries = [], + mapDataBeforeId, }: UseDeckLayersProps) { const [hoverPosition, setHoverPosition] = useState<{ x: number; y: number } | null>(null); const [hoveredPostcode, setHoveredPostcode] = useState(null); @@ -419,10 +421,10 @@ export function useDeckLayers({ highPrecision: true, onClick: handleHexagonClick, onHover: handleHexagonHover, - beforeId: 'landuse_park', + beforeId: mapDataBeforeId, ...pieProps, }); - }, [data, colorTrigger, handleHexagonClick, handleHexagonHover]); + }, [data, colorTrigger, handleHexagonClick, handleHexagonHover, mapDataBeforeId]); const postcodeLayer = useMemo(() => { const isEnum = enumCountRef.current > 0; @@ -578,9 +580,15 @@ export function useDeckLayers({ onClick: handlePostcodeClick, onHover: handlePostcodeHoverCallback, // @ts-expect-error beforeId is a MapboxOverlay interleave prop, not typed in LayerProps - beforeId: 'landuse_park', + beforeId: mapDataBeforeId, }); - }, [postcodeData, postcodeColorTrigger, handlePostcodeClick, handlePostcodeHoverCallback]); + }, [ + postcodeData, + postcodeColorTrigger, + handlePostcodeClick, + handlePostcodeHoverCallback, + mapDataBeforeId, + ]); const labeledPostcodeData = useMemo( () => postcodeData.filter((feature) => feature.properties.count > 0), diff --git a/frontend/src/hooks/useHexagonSelection.test.ts b/frontend/src/hooks/useHexagonSelection.test.ts index a2534a4..528e392 100644 --- 
a/frontend/src/hooks/useHexagonSelection.test.ts +++ b/frontend/src/hooks/useHexagonSelection.test.ts @@ -3,6 +3,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { useHexagonSelection } from './useHexagonSelection'; import type { FeatureMeta, HexagonStatsResponse, PostcodeGeometry } from '../types'; +import type { TravelTimeEntry } from './useTravelTime'; vi.mock('../lib/pocketbase', () => ({ default: { authStore: { isValid: false, token: '' } }, @@ -41,9 +42,24 @@ function jsonResponse(body: unknown): Response { }); } +async function flushPromises() { + await Promise.resolve(); + await Promise.resolve(); +} + describe('useHexagonSelection', () => { const requests: string[] = []; - const features: FeatureMeta[] = [{ name: 'Price', type: 'numeric', min: 0, max: 100 }]; + const features: FeatureMeta[] = [ + { name: 'Price', type: 'numeric', min: 0, max: 100 }, + { name: 'Last known price', type: 'numeric', min: 0, max: 1_000_000 }, + { name: 'Estimated current price', type: 'numeric', min: 0, max: 1_000_000 }, + { name: 'Price per sqm', type: 'numeric', min: 0, max: 20_000 }, + { name: 'Est. 
price per sqm', type: 'numeric', min: 0, max: 20_000 }, + { name: 'Total floor area (sqm)', type: 'numeric', min: 0, max: 500 }, + { name: 'Number of bedrooms & living rooms', type: 'numeric', min: 0, max: 12 }, + { name: 'Construction year', type: 'numeric', min: 0, max: 2026 }, + { name: 'Date of last transaction', type: 'numeric', min: 0, max: 2026 }, + ]; beforeEach(() => { requests.length = 0; @@ -64,6 +80,18 @@ describe('useHexagonSelection', () => { return Promise.resolve(jsonResponse(stats(12))); } + if (url.pathname === '/api/postcode-properties') { + return Promise.resolve( + jsonResponse({ properties: [], total: 0, offset: 0, truncated: false }) + ); + } + + if (url.pathname === '/api/hexagon-properties') { + return Promise.resolve( + jsonResponse({ properties: [], total: 0, offset: 0, truncated: false }) + ); + } + return Promise.resolve(new Response(null, { status: 404 })); }) ); @@ -201,4 +229,203 @@ describe('useHexagonSelection', () => { expect(requests.some((url) => url.startsWith('/api/postcode/'))).toBe(false); expect(requests.some((url) => url.startsWith('/api/hexagon-stats'))).toBe(false); }); + + it('passes area stat field projections to stats requests', async () => { + const { result } = renderHook(() => + useHexagonSelection({ + filters: {}, + features, + hexagonData: [], + resolution: 9, + usePostcodeView: false, + travelTimeEntries: [], + areaStatsFields: ['Price'], + }) + ); + + act(() => { + result.current.handleHexagonClick('89195da49abffff'); + }); + + await waitFor(() => { + expect(result.current.areaStats?.count).toBe(12); + }); + + const statsRequest = requests.find((url) => url.startsWith('/api/hexagon-stats')); + expect(statsRequest).toBeDefined(); + expect(new URL(statsRequest!, 'http://localhost').searchParams.get('fields')).toBe('Price'); + }); + + it('keeps existing area stats visible while area field projections refetch', async () => { + const pendingStatsRequests: Array<{ resolve: (response: Response) => void }> = []; + 
vi.stubGlobal( + 'fetch', + vi.fn((input: string | URL | Request) => { + const url = new URL(String(input), 'http://localhost'); + requests.push(`${url.pathname}${url.search}`); + + if (url.pathname === '/api/hexagon-stats') { + return new Promise((resolve) => { + pendingStatsRequests.push({ resolve }); + }); + } + + return Promise.resolve(new Response(null, { status: 404 })); + }) + ); + + const { result, rerender } = renderHook( + ({ areaStatsFields }: { areaStatsFields: string[] }) => + useHexagonSelection({ + filters: {}, + features, + hexagonData: [], + resolution: 9, + usePostcodeView: false, + travelTimeEntries: [], + areaStatsFields, + }), + { initialProps: { areaStatsFields: [] as string[] } } + ); + + act(() => { + result.current.handleHexagonClick('89195da49abffff'); + }); + + await waitFor(() => { + expect(pendingStatsRequests).toHaveLength(1); + }); + + await act(async () => { + pendingStatsRequests[0].resolve(jsonResponse(stats(12))); + await flushPromises(); + }); + + await waitFor(() => { + expect(result.current.areaStats?.count).toBe(12); + expect(result.current.loadingAreaStats).toBe(false); + }); + + act(() => { + rerender({ areaStatsFields: ['Price'] }); + }); + + await waitFor(() => { + expect(pendingStatsRequests).toHaveLength(2); + }); + + expect(result.current.loadingAreaStats).toBe(true); + expect(result.current.areaStats?.count).toBe(12); + + const refetchRequest = requests.filter((url) => url.startsWith('/api/hexagon-stats'))[1]; + expect(new URL(refetchRequest, 'http://localhost').searchParams.get('fields')).toBe('Price'); + + await act(async () => { + pendingStatsRequests[1].resolve(jsonResponse(stats(12))); + await flushPromises(); + }); + + await waitFor(() => { + expect(result.current.loadingAreaStats).toBe(false); + expect(result.current.areaStats?.count).toBe(12); + }); + }); + + it('passes property card field projections to property requests', async () => { + const { result } = renderHook(() => + useHexagonSelection({ + filters: 
{}, + features, + hexagonData: [], + resolution: 9, + usePostcodeView: true, + travelTimeEntries: [], + }) + ); + + act(() => { + result.current.handleLocationSearch('SW1A 1AA', postcodeGeometry, 51.505, -0.115); + }); + + await waitFor(() => { + expect(result.current.areaStats?.count).toBe(4); + }); + + act(() => { + result.current.handlePropertiesTabClick(); + }); + + await waitFor(() => { + expect(requests.some((url) => url.startsWith('/api/postcode-properties'))).toBe(true); + }); + + const propertiesRequest = requests.find((url) => url.startsWith('/api/postcode-properties')); + const fieldsParam = new URL(propertiesRequest!, 'http://localhost').searchParams.get('fields'); + expect(fieldsParam).toContain('Last known price'); + expect(fieldsParam).toContain('Date of last transaction'); + expect(fieldsParam).not.toContain('Distance to nearest amenity'); + }); + + it('refetches property requests when stats basis switches to all properties', async () => { + const propertyFilters = { Price: [0, 50] as [number, number] }; + const travelTimeEntries: TravelTimeEntry[] = [ + { + mode: 'transit', + slug: 'kings-cross', + label: 'Kings Cross', + timeRange: [0, 30], + useBest: false, + }, + ]; + const { result } = renderHook(() => + useHexagonSelection({ + filters: propertyFilters, + features, + hexagonData: [], + resolution: 9, + usePostcodeView: true, + travelTimeEntries, + }) + ); + + act(() => { + result.current.handleLocationSearch('SW1A 1AA', postcodeGeometry, 51.505, -0.115); + }); + + await waitFor(() => { + expect(result.current.areaStats?.count).toBe(0); + }); + + act(() => { + result.current.handlePropertiesTabClick(); + }); + + await waitFor(() => { + expect(requests.filter((url) => url.startsWith('/api/postcode-properties')).length).toBe(1); + }); + + const filteredPropertiesRequest = requests.find((url) => + url.startsWith('/api/postcode-properties') + ); + const filteredParams = new URL(filteredPropertiesRequest!, 'http://localhost').searchParams; + 
expect(filteredParams.has('filters')).toBe(true); + expect(filteredParams.has('travel')).toBe(true); + + act(() => { + result.current.setAreaStatsUseFilters(false); + }); + + await waitFor(() => { + expect(result.current.areaStats?.count).toBe(4); + }); + await waitFor(() => { + expect(requests.filter((url) => url.startsWith('/api/postcode-properties')).length).toBe(2); + }); + + const propertyRequests = requests.filter((url) => url.startsWith('/api/postcode-properties')); + const allPropertiesRequest = propertyRequests[propertyRequests.length - 1]; + const allPropertiesParams = new URL(allPropertiesRequest, 'http://localhost').searchParams; + expect(allPropertiesParams.has('filters')).toBe(false); + expect(allPropertiesParams.has('travel')).toBe(false); + }); }); diff --git a/frontend/src/hooks/useHexagonSelection.ts b/frontend/src/hooks/useHexagonSelection.ts index 7dc4d94..425158a 100644 --- a/frontend/src/hooks/useHexagonSelection.ts +++ b/frontend/src/hooks/useHexagonSelection.ts @@ -42,11 +42,23 @@ interface UseHexagonSelectionOptions { resolution: number; usePostcodeView: boolean; travelTimeEntries: TravelTimeEntry[]; + areaStatsFields?: string[]; shareCode?: string; /** First transit destination — used to pick the best central_postcode for journey display. */ journeyDest?: JourneyDest | null; } +const PROPERTY_PANE_FIELDS = [ + 'Last known price', + 'Estimated current price', + 'Price per sqm', + 'Est. 
price per sqm', + 'Total floor area (sqm)', + 'Number of bedrooms & living rooms', + 'Construction year', + 'Date of last transaction', +]; + export function useHexagonSelection({ filters, features, @@ -54,6 +66,7 @@ export function useHexagonSelection({ resolution, usePostcodeView, travelTimeEntries, + areaStatsFields, shareCode, journeyDest, }: UseHexagonSelectionOptions) { @@ -93,6 +106,11 @@ export function useHexagonSelection({ }, []); const travelParam = useMemo(() => buildTravelParam(travelTimeEntries), [travelTimeEntries]); + const areaStatsFieldsKey = useMemo(() => areaStatsFields?.join(';;') ?? '', [areaStatsFields]); + const propertyPaneFieldsParam = useMemo(() => { + const availableFields = new Set(features.map((feature) => feature.name)); + return PROPERTY_PANE_FIELDS.filter((field) => availableFields.has(field)).join(';;'); + }, [features]); const fetchHexagonStats = useCallback( async ( @@ -110,8 +128,9 @@ export function useHexagonSelection({ if (filterStr) params.append('filters', filterStr); if (includeFilters && travelParam) params.set('travel', travelParam); if (shareCode) params.set('share', shareCode); - if (fields) { - params.set('fields', fields.join(';;')); + const requestedFields = fields ?? 
areaStatsFields; + if (requestedFields) { + params.set('fields', requestedFields.join(';;')); } if (journeyDest) { params.set('journey_mode', journeyDest.mode); @@ -121,27 +140,34 @@ export function useHexagonSelection({ assertOk(response, 'hexagon-stats'); return (await response.json()) as HexagonStatsResponse; }, - [filters, features, journeyDest, shareCode, travelParam] + [areaStatsFields, filters, features, journeyDest, shareCode, travelParam] ); const fetchPostcodeStats = useCallback( - async (postcode: string, signal?: AbortSignal, includeFilters = true) => { + async ( + postcode: string, + signal?: AbortSignal, + includeFilters = true, + fields?: string[] + ) => { const params = new URLSearchParams({ postcode }); const filterStr = includeFilters ? buildFilterString(filters, features) : ''; if (filterStr) params.append('filters', filterStr); if (includeFilters && travelParam) params.set('travel', travelParam); if (shareCode) params.set('share', shareCode); + const requestedFields = fields ?? areaStatsFields; + if (requestedFields) params.set('fields', requestedFields.join(';;')); const response = await fetch(apiUrl('postcode-stats', params), authHeaders({ signal })); assertOk(response, 'postcode-stats'); return (await response.json()) as HexagonStatsResponse; }, - [filters, features, shareCode, travelParam] + [areaStatsFields, filters, features, shareCode, travelParam] ); const filterStr = useMemo(() => buildFilterString(filters, features), [filters, features]); const hasStatsFilters = filterStr.length > 0 || travelParam.length > 0; const journeyKey = journeyDest ? `${journeyDest.mode}:${journeyDest.slug}` : ''; - const areaStatsQueryKey = useMemo( + const areaStatsDataKey = useMemo( () => [ areaStatsUseFilters ? 
'filtered' : 'all', @@ -152,6 +178,10 @@ export function useHexagonSelection({ ].join('|'), [areaStatsUseFilters, filterStr, journeyKey, shareCode, travelParam] ); + const areaStatsQueryKey = useMemo( + () => [areaStatsDataKey, areaStatsFieldsKey].join('|'), + [areaStatsDataKey, areaStatsFieldsKey] + ); const fetchUnfilteredAreaCount = useCallback( async (selection: SelectedHexagon, requestId: number, signal?: AbortSignal) => { @@ -162,8 +192,8 @@ export function useHexagonSelection({ const stats = selection.type === 'postcode' - ? await fetchPostcodeStats(selection.id, signal, false) - : await fetchHexagonStats(selection.id, selection.resolution, signal, undefined, false); + ? await fetchPostcodeStats(selection.id, signal, false, []) + : await fetchHexagonStats(selection.id, selection.resolution, signal, [], false); if (isCurrentAreaRequest(requestId)) setUnfilteredAreaCount(stats.count); }, [fetchHexagonStats, fetchPostcodeStats, hasStatsFilters, isCurrentAreaRequest] @@ -209,9 +239,10 @@ export function useHexagonSelection({ offset: offset.toString(), }); - const filterStr = buildFilterString(filters, features); + const filterStr = areaStatsUseFilters ? buildFilterString(filters, features) : ''; if (filterStr) params.append('filters', filterStr); - if (travelParam) params.set('travel', travelParam); + if (areaStatsUseFilters && travelParam) params.set('travel', travelParam); + params.set('fields', propertyPaneFieldsParam); if (shareCode) params.set('share', shareCode); const response = await fetch(apiUrl('hexagon-properties', params), authHeaders()); @@ -235,8 +266,10 @@ export function useHexagonSelection({ [ filters, features, + areaStatsUseFilters, invalidatePropertyRequests, isCurrentPropertyRequest, + propertyPaneFieldsParam, shareCode, travelParam, ] @@ -255,9 +288,10 @@ export function useHexagonSelection({ params.set('focus_address', focusAddress); } - const filterStr = buildFilterString(filters, features); + const filterStr = areaStatsUseFilters ? 
buildFilterString(filters, features) : ''; if (filterStr) params.append('filters', filterStr); - if (travelParam) params.set('travel', travelParam); + if (areaStatsUseFilters && travelParam) params.set('travel', travelParam); + params.set('fields', propertyPaneFieldsParam); if (shareCode) params.set('share', shareCode); const response = await fetch(apiUrl('postcode-properties', params), authHeaders()); @@ -281,8 +315,10 @@ export function useHexagonSelection({ [ filters, features, + areaStatsUseFilters, invalidatePropertyRequests, isCurrentPropertyRequest, + propertyPaneFieldsParam, shareCode, travelParam, ] @@ -546,25 +582,34 @@ export function useHexagonSelection({ rightPaneTab, ]); - // Re-fetch stats when filters or travel constraints change while an area is selected - const prevAreaStatsQueryKey = useRef(areaStatsQueryKey); + // Re-fetch stats when the selected stats basis or requested field projection changes. + const prevAreaStatsQueryRef = useRef({ + dataKey: areaStatsDataKey, + queryKey: areaStatsQueryKey, + }); useEffect(() => { - if (prevAreaStatsQueryKey.current === areaStatsQueryKey) return; - prevAreaStatsQueryKey.current = areaStatsQueryKey; + const previousQuery = prevAreaStatsQueryRef.current; + if (previousQuery.queryKey === areaStatsQueryKey) return; + prevAreaStatsQueryRef.current = { + dataKey: areaStatsDataKey, + queryKey: areaStatsQueryKey, + }; if (!selectedHexagon) return; + const fieldProjectionOnlyChanged = previousQuery.dataKey === areaStatsDataKey; - // Clear stale properties - setProperties([]); - setPropertiesTotal(0); - setPropertiesOffset(0); - invalidatePropertyRequests(); - setAreaStats(null); - setUnfilteredAreaCount(null); + if (!fieldProjectionOnlyChanged) { + // Clear stale properties + setProperties([]); + setPropertiesTotal(0); + setPropertiesOffset(0); + invalidatePropertyRequests(); + setAreaStats(null); + setUnfilteredAreaCount(null); + } setLoadingAreaStats(true); - let cancelled = false; const requestId = 
invalidateAreaRequests(); const fetchStats = @@ -580,11 +625,11 @@ export function useHexagonSelection({ fetchStats .then((stats) => { - if (cancelled || !isCurrentAreaRequest(requestId)) return; + if (!isCurrentAreaRequest(requestId)) return; setAreaStats(stats); refreshUnfilteredAreaCount(selectedHexagon, stats.count, areaStatsUseFilters, requestId); - // Re-fetch properties if the properties tab is active and the filtered area still has matches. - if (areaStatsUseFilters && rightPaneTab === 'properties' && stats.count > 0) { + // Re-fetch properties if the properties tab is active and the selected basis has matches. + if (!fieldProjectionOnlyChanged && rightPaneTab === 'properties' && stats.count > 0) { if (selectedHexagon.type === 'postcode') { fetchPostcodeProperties(selectedHexagon.id, 0); } else { @@ -593,17 +638,14 @@ export function useHexagonSelection({ } }) .catch((error) => { - if (cancelled || !isCurrentAreaRequest(requestId)) return; + if (!isCurrentAreaRequest(requestId)) return; logNonAbortError('Failed to refresh stats', error); }) .finally(() => { - if (!cancelled && isCurrentAreaRequest(requestId)) setLoadingAreaStats(false); + if (isCurrentAreaRequest(requestId)) setLoadingAreaStats(false); }); - - return () => { - cancelled = true; - }; }, [ + areaStatsDataKey, areaStatsQueryKey, selectedHexagon, fetchHexagonStats, diff --git a/frontend/src/hooks/useListingLayers.ts b/frontend/src/hooks/useListingLayers.ts index 7d7c97b..136fbcb 100644 --- a/frontend/src/hooks/useListingLayers.ts +++ b/frontend/src/hooks/useListingLayers.ts @@ -1,42 +1,211 @@ -import { useCallback, useMemo, useRef, useState } from 'react'; +import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; import type { Layer, PickingInfo } from '@deck.gl/core'; -import { ScatterplotLayer, TextLayer } from '@deck.gl/layers'; +import { PathLayer, ScatterplotLayer, TextLayer } from '@deck.gl/layers'; +import Supercluster from 'supercluster'; import type { ActualListing } 
from '../types'; import { trackEvent } from '../lib/analytics'; const PRICE_LABEL_MIN_ZOOM = 14; const ADDRESS_LABEL_MIN_ZOOM = 16; +const LISTING_CLUSTER_RADIUS = 18; +const LISTING_CLUSTER_MAX_ZOOM = 24; +const LISTING_CLUSTER_POPUP_LIMIT = 30; +const LISTING_SPIDERFY_LIMIT = 12; +const TILE_SIZE = 512; -export interface ListingPopupInfo { +interface SingleListingPopupInfo { + mode: 'single'; x: number; y: number; listing: ActualListing; + locked?: boolean; } +interface ListingClusterPopupInfo { + mode: 'cluster'; + x: number; + y: number; + count: number; + listings: ActualListing[]; + locked?: boolean; +} + +export type ListingPopupInfo = SingleListingPopupInfo | ListingClusterPopupInfo; + interface UseListingLayersProps { listings: ActualListing[]; zoom: number; isDark: boolean; } +interface ListingClusterPoint { + lng: number; + lat: number; + count: number; + clusterId: number; +} + +interface ExpandedListingMarker { + listing: ActualListing; + lng: number; + lat: number; + anchorLng: number; + anchorLat: number; +} + function formatShortPrice(price: number): string { if (price >= 1_000_000) return `£${(price / 1_000_000).toFixed(price >= 10_000_000 ? 0 : 1)}M`; if (price >= 1_000) return `£${Math.round(price / 1_000)}k`; return `£${price}`; } +function formatClusterCount(count: number): string { + if (count >= 1_000) return `${(count / 1_000).toFixed(count >= 10_000 ? 0 : 1)}k`; + return String(count); +} + +function compareListingsForDisplay(left: ActualListing, right: ActualListing): number { + const dateCompare = (right.listing_date_iso ?? '').localeCompare(left.listing_date_iso ?? ''); + if (dateCompare !== 0) return dateCompare; + return (right.asking_price ?? 0) - (left.asking_price ?? 
0); +} + +function getClusterListings( + index: Supercluster, + clusterId: number, + limit: number +): ActualListing[] { + return index + .getLeaves(clusterId, limit, 0) + .map((feature) => feature.properties) + .sort(compareListingsForDisplay); +} + +function offsetLngLat( + lng: number, + lat: number, + dxPixels: number, + dyPixels: number, + zoom: number +): [number, number] { + const worldSize = TILE_SIZE * Math.pow(2, zoom); + const lngPerPixel = 360 / worldSize; + const cosLat = Math.max(0.25, Math.cos((lat * Math.PI) / 180)); + const latPerPixel = lngPerPixel / cosLat; + return [lng + dxPixels * lngPerPixel, lat - dyPixels * latPerPixel]; +} + +function spiderfyPosition( + lng: number, + lat: number, + index: number, + total: number, + zoom: number +): [number, number] { + if (total <= 1) return [lng, lat]; + const radius = total <= 6 ? 24 : 32; + const angle = -Math.PI / 2 + (index / total) * Math.PI * 2; + return offsetLngLat(lng, lat, Math.cos(angle) * radius, Math.sin(angle) * radius, zoom); +} + export function useListingLayers({ listings, zoom, isDark }: UseListingLayersProps) { const [popupInfo, setPopupInfo] = useState(null); + const [selectedCluster, setSelectedCluster] = useState(null); - const handleHover = useCallback((info: PickingInfo) => { - if (info.object && info.x !== undefined && info.y !== undefined) { - setPopupInfo({ x: info.x, y: info.y, listing: info.object }); - } else { - setPopupInfo(null); + useEffect(() => { + setSelectedCluster(null); + setPopupInfo(null); + }, [listings]); + + const clusterIndex = useMemo(() => { + if (listings.length === 0) return null; + const index = new Supercluster({ + radius: LISTING_CLUSTER_RADIUS, + maxZoom: LISTING_CLUSTER_MAX_ZOOM, + }); + const features: Supercluster.PointFeature[] = listings + .filter((listing) => Number.isFinite(listing.lat) && Number.isFinite(listing.lon)) + .map((listing) => ({ + type: 'Feature', + geometry: { type: 'Point', coordinates: [listing.lon, listing.lat] }, + 
properties: listing, + })); + index.load(features); + return index; + }, [listings]); + + const clusterIndexRef = useRef(clusterIndex); + clusterIndexRef.current = clusterIndex; + + const clusterZoom = Math.min(Math.floor(zoom), LISTING_CLUSTER_MAX_ZOOM); + const { visibleListings, clusters } = useMemo(() => { + if (!clusterIndex) { + return { + visibleListings: [] as ActualListing[], + clusters: [] as ListingClusterPoint[], + }; } + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const features = clusterIndex.getClusters([-180, -85, 180, 85], clusterZoom) as any[]; + const individual: ActualListing[] = []; + const clusterPoints: ListingClusterPoint[] = []; + for (const feature of features) { + if (feature.properties.cluster) { + clusterPoints.push({ + lng: feature.geometry.coordinates[0], + lat: feature.geometry.coordinates[1], + count: feature.properties.point_count, + clusterId: feature.properties.cluster_id, + }); + } else { + individual.push(feature.properties as ActualListing); + } + } + return { visibleListings: individual, clusters: clusterPoints }; + }, [clusterIndex, clusterZoom]); + + const expandedListings = useMemo(() => { + if (!selectedCluster || !clusterIndex) return []; + const leaves = getClusterListings( + clusterIndex, + selectedCluster.clusterId, + LISTING_SPIDERFY_LIMIT + ); + return leaves.map((listing, index) => { + const [lng, lat] = spiderfyPosition( + selectedCluster.lng, + selectedCluster.lat, + index, + leaves.length, + zoom + ); + return { + listing, + lng, + lat, + anchorLng: selectedCluster.lng, + anchorLat: selectedCluster.lat, + }; + }); + }, [clusterIndex, selectedCluster, zoom]); + + const clearUnlockedPopup = useCallback(() => { + setPopupInfo((current) => (current?.locked ? 
current : null)); }, []); + const handleHover = useCallback( + (info: PickingInfo) => { + if (info.object && info.x !== undefined && info.y !== undefined) { + setPopupInfo({ mode: 'single', x: info.x, y: info.y, listing: info.object }); + } else { + clearUnlockedPopup(); + } + }, + [clearUnlockedPopup] + ); + const handleClick = useCallback((info: PickingInfo) => { const url = info.object?.listing_url; if (!url) return; @@ -58,25 +227,115 @@ export function useListingLayers({ listings, zoom, isDark }: UseListingLayersPro [] ); + const handleExpandedHover = useCallback( + (info: PickingInfo) => { + if (info.object && info.x !== undefined && info.y !== undefined) { + setPopupInfo({ mode: 'single', x: info.x, y: info.y, listing: info.object.listing }); + } else { + clearUnlockedPopup(); + } + }, + [clearUnlockedPopup] + ); + + const handleExpandedClick = useCallback((info: PickingInfo) => { + const url = info.object?.listing.listing_url; + if (!url) return; + trackEvent('Actual Listing Click', { url, source: 'cluster_expanded' }); + window.open(url, '_blank', 'noopener,noreferrer'); + }, []); + + const handleExpandedHoverRef = useRef(handleExpandedHover); + handleExpandedHoverRef.current = handleExpandedHover; + const stableExpandedHover = useCallback( + (info: PickingInfo) => handleExpandedHoverRef.current(info), + [] + ); + + const handleExpandedClickRef = useRef(handleExpandedClick); + handleExpandedClickRef.current = handleExpandedClick; + const stableExpandedClick = useCallback( + (info: PickingInfo) => handleExpandedClickRef.current(info), + [] + ); + + const handleClusterHover = useCallback( + (info: PickingInfo) => { + if (info.object && info.x !== undefined && info.y !== undefined) { + const cluster = info.object; + setPopupInfo((current) => + current?.locked + ? 
current + : { + mode: 'cluster', + x: info.x, + y: info.y, + count: cluster.count, + listings: [], + } + ); + } else { + clearUnlockedPopup(); + } + }, + [clearUnlockedPopup] + ); + + const handleClusterClick = useCallback((info: PickingInfo) => { + if (!info.object || info.x === undefined || info.y === undefined) return; + const index = clusterIndexRef.current; + if (!index) return; + const cluster = info.object; + const clusterListings = getClusterListings( + index, + cluster.clusterId, + LISTING_CLUSTER_POPUP_LIMIT + ); + setSelectedCluster(cluster); + setPopupInfo({ + mode: 'cluster', + x: info.x, + y: info.y, + count: cluster.count, + listings: clusterListings, + locked: true, + }); + trackEvent('Actual Listing Cluster Click', { count: cluster.count }); + }, []); + + const handleClusterHoverRef = useRef(handleClusterHover); + handleClusterHoverRef.current = handleClusterHover; + const stableClusterHover = useCallback( + (info: PickingInfo) => handleClusterHoverRef.current(info), + [] + ); + + const handleClusterClickRef = useRef(handleClusterClick); + handleClusterClickRef.current = handleClusterClick; + const stableClusterClick = useCallback( + (info: PickingInfo) => handleClusterClickRef.current(info), + [] + ); + const pinShadowLayer = useMemo( () => new ScatterplotLayer({ id: 'actual-listing-shadow', - data: listings, + data: visibleListings, getPosition: (d) => [d.lon, d.lat], getRadius: 8, radiusUnits: 'pixels', getFillColor: isDark ? 
[0, 0, 0, 80] : [0, 0, 0, 40], pickable: false, }), - [listings, isDark] + [visibleListings, isDark] ); const pinLayer = useMemo( () => new ScatterplotLayer({ id: 'actual-listing-pin', - data: listings, + data: visibleListings, getPosition: (d) => [d.lon, d.lat], getRadius: 7, radiusUnits: 'pixels', @@ -91,12 +350,108 @@ export function useListingLayers({ listings, zoom, isDark }: UseListingLayersPro onHover: stableHover, onClick: stableClick, }), - [listings, stableHover, stableClick] + [visibleListings, stableHover, stableClick] + ); + + const clusterShadowLayer = useMemo( + () => + new ScatterplotLayer({ + id: 'actual-listing-cluster-shadow', + data: clusters, + getPosition: (d) => [d.lng, d.lat], + getRadius: (d) => Math.min(32, 13 + Math.sqrt(d.count) * 1.8), + radiusUnits: 'pixels', + getFillColor: isDark ? [0, 0, 0, 90] : [0, 0, 0, 45], + pickable: false, + }), + [clusters, isDark] + ); + + const clusterLayer = useMemo( + () => + new ScatterplotLayer({ + id: 'actual-listing-cluster', + data: clusters, + getPosition: (d) => [d.lng, d.lat], + getRadius: (d) => Math.min(30, 12 + Math.sqrt(d.count) * 1.8), + radiusUnits: 'pixels', + getFillColor: isDark ? [185, 28, 28, 230] : [220, 38, 38, 230], + getLineColor: [255, 255, 255, isDark ? 
90 : 180], + getLineWidth: 2, + lineWidthUnits: 'pixels', + stroked: true, + pickable: true, + autoHighlight: true, + highlightColor: [29, 228, 195, 220], + onHover: stableClusterHover, + onClick: stableClusterClick, + }), + [clusters, isDark, stableClusterHover, stableClusterClick] + ); + + const clusterTextLayer = useMemo( + () => + new TextLayer({ + id: 'actual-listing-cluster-text', + data: clusters, + getPosition: (d) => [d.lng, d.lat], + getText: (d) => formatClusterCount(d.count), + getSize: 12, + getColor: [255, 255, 255, 255], + fontFamily: 'Inter, system-ui, sans-serif', + fontWeight: 800, + getTextAnchor: 'middle', + getAlignmentBaseline: 'center', + sizeUnits: 'pixels', + sizeMinPixels: 10, + sizeMaxPixels: 13, + pickable: false, + }), + [clusters] + ); + + const expandedConnectorLayer = useMemo( + () => + new PathLayer({ + id: 'actual-listing-expanded-lines', + data: expandedListings, + getPath: (d) => [ + [d.anchorLng, d.anchorLat], + [d.lng, d.lat], + ], + getColor: isDark ? 
[255, 255, 255, 80] : [80, 60, 50, 110], + getWidth: 1, + widthUnits: 'pixels', + pickable: false, + }), + [expandedListings, isDark] + ); + + const expandedPinLayer = useMemo( + () => + new ScatterplotLayer({ + id: 'actual-listing-expanded-pin', + data: expandedListings, + getPosition: (d) => [d.lng, d.lat], + getRadius: 6, + radiusUnits: 'pixels', + getFillColor: [231, 76, 60, 245], + getLineColor: [255, 255, 255, 255], + getLineWidth: 1.5, + lineWidthUnits: 'pixels', + stroked: true, + pickable: true, + autoHighlight: true, + highlightColor: [29, 228, 195, 220], + onHover: stableExpandedHover, + onClick: stableExpandedClick, + }), + [expandedListings, stableExpandedHover, stableExpandedClick] ); const priceLabelLayer = useMemo(() => { if (zoom < PRICE_LABEL_MIN_ZOOM) return null; - const labeled = listings.filter((l) => l.asking_price && l.asking_price > 0); + const labeled = visibleListings.filter((l) => l.asking_price && l.asking_price > 0); return new TextLayer({ id: 'actual-listing-price', data: labeled, @@ -117,11 +472,11 @@ export function useListingLayers({ listings, zoom, isDark }: UseListingLayersPro sizeMaxPixels: 14, pickable: false, }); - }, [listings, zoom, isDark]); + }, [visibleListings, zoom, isDark]); const detailLabelLayer = useMemo(() => { if (zoom < ADDRESS_LABEL_MIN_ZOOM) return null; - const labeled = listings.filter((l) => l.address || l.bedrooms != null); + const labeled = visibleListings.filter((l) => l.address || l.bedrooms != null); return new TextLayer({ id: 'actual-listing-detail', data: labeled, @@ -148,16 +503,39 @@ export function useListingLayers({ listings, zoom, isDark }: UseListingLayersPro sizeMaxPixels: 12, pickable: false, }); - }, [listings, zoom, isDark]); + }, [visibleListings, zoom, isDark]); const listingLayers = useMemo(() => { - const layers: Layer[] = [pinShadowLayer, pinLayer]; + const layers: Layer[] = [ + clusterShadowLayer, + clusterLayer, + clusterTextLayer, + pinShadowLayer, + pinLayer, + ]; + if 
(expandedListings.length > 0) { + layers.push(expandedConnectorLayer, expandedPinLayer); + } if (priceLabelLayer) layers.push(priceLabelLayer); if (detailLabelLayer) layers.push(detailLabelLayer); return layers; - }, [pinShadowLayer, pinLayer, priceLabelLayer, detailLabelLayer]); + }, [ + clusterShadowLayer, + clusterLayer, + clusterTextLayer, + pinShadowLayer, + pinLayer, + expandedListings.length, + expandedConnectorLayer, + expandedPinLayer, + priceLabelLayer, + detailLabelLayer, + ]); - const clearListingPopup = useCallback(() => setPopupInfo(null), []); + const clearListingPopup = useCallback(() => { + setPopupInfo(null); + setSelectedCluster(null); + }, []); return { listingLayers, listingPopup: popupInfo, clearListingPopup }; } diff --git a/frontend/src/hooks/useMapData.test.ts b/frontend/src/hooks/useMapData.test.ts index 8a721da..8c17189 100644 --- a/frontend/src/hooks/useMapData.test.ts +++ b/frontend/src/hooks/useMapData.test.ts @@ -20,6 +20,7 @@ function viewChange(bounds: Bounds): ViewChangeParams { return { resolution: 8, bounds, + visibleBounds: bounds, zoom: 10, latitude: (bounds.south + bounds.north) / 2, longitude: (bounds.west + bounds.east) / 2, diff --git a/frontend/src/hooks/useMapData.ts b/frontend/src/hooks/useMapData.ts index 7188030..17709f7 100644 --- a/frontend/src/hooks/useMapData.ts +++ b/frontend/src/hooks/useMapData.ts @@ -84,6 +84,7 @@ export function useMapData({ const [postcodeData, setPostcodeData] = useState([]); const [resolution, setResolution] = useState(8); const [bounds, setBounds] = useState(null); + const [visibleBounds, setVisibleBounds] = useState(null); const [loading, setLoading] = useState(false); const [zoom, setZoom] = useState(10); const [currentView, setCurrentView] = useState<{ @@ -685,6 +686,7 @@ export function useMapData({ ({ resolution: newRes, bounds: newBounds, + visibleBounds: newVisibleBounds, zoom: newZoom, latitude, longitude, @@ -697,6 +699,7 @@ export function useMapData({ setResolution(newRes); 
setBounds(newBounds); } + setVisibleBounds(newVisibleBounds); setZoom(newZoom); setCurrentView({ latitude, longitude, zoom: newZoom }); setCurrentVisibleView({ @@ -729,6 +732,7 @@ export function useMapData({ postcodeData: effectivePostcodeData, resolution, bounds, + visibleBounds, loading: isLoading, zoom, currentView, diff --git a/frontend/src/hooks/useUrlSync.ts b/frontend/src/hooks/useUrlSync.ts index 2c5a819..4997adb 100644 --- a/frontend/src/hooks/useUrlSync.ts +++ b/frontend/src/hooks/useUrlSync.ts @@ -2,6 +2,7 @@ import { useEffect, useRef } from 'react'; import type { FeatureMeta, FeatureFilters } from '../types'; import { stateToParams } from '../lib/url-state'; import type { OverlayId } from '../lib/overlays'; +import type { BasemapId } from '../lib/basemaps'; import type { TravelTimeEntry } from './useTravelTime'; const URL_DEBOUNCE_MS = 300; @@ -14,7 +15,8 @@ export function useUrlSync( rightPaneTab: 'properties' | 'area', travelTimeEntries?: TravelTimeEntry[], share?: string, - selectedOverlays?: Set + selectedOverlays?: Set, + basemap?: BasemapId ) { const urlDebounceRef = useRef | null>(null); @@ -31,7 +33,8 @@ export function useUrlSync( rightPaneTab, travelTimeEntries, share, - selectedOverlays + selectedOverlays, + basemap ); const search = params.toString(); const newUrl = search ? `${window.location.pathname}?${search}` : window.location.pathname; @@ -50,5 +53,6 @@ export function useUrlSync( travelTimeEntries, share, selectedOverlays, + basemap, ]); } diff --git a/frontend/src/i18n/details.ts b/frontend/src/i18n/details.ts index 2e30372..9d1544d 100644 --- a/frontend/src/i18n/details.ts +++ b/frontend/src/i18n/details.ts @@ -38,7 +38,7 @@ export const details: Record> = { 'Street tree density percentile': "Couverture arborée approximative autour du centroïde du code postal, dérivée de la carte Trees Outside Woodland 2025 de Forest Research. 
Les polygones de couvert arboré des arbres isolés et groupes d'arbres sont comptés dans un rayon de 50 m de chaque centroïde de code postal, puis convertis en percentile parmi les codes postaux anglais. Il s'agit d'une approximation fondée sur le centroïde du code postal, pas d'une mesure exacte du bien ou du segment de rue.", 'Within conservation area': - "Limites de zones de conservation de Historic England, rattachées au point représentatif du code postal. Le jeu de données national est indicatif plutôt que définitif ; les décisions sensibles aux limites doivent être vérifiées auprès de l'autorité locale de planification.", + "Limites de zones de conservation de Planning Data, rattachées au point représentatif du code postal. Le jeu de données national est en cours d'amélioration et peut contenir des doublons ou une couverture locale incomplète ; les décisions sensibles aux limites doivent être vérifiées auprès de l'autorité locale de planification.", 'Listed building': "Points de bâtiments classés de la National Heritage List for England de Historic England, associés prudemment aux adresses des biens à partir du nom de l'entrée classée et de codes postaux proches candidats. À traiter comme un signal de présélection, pas comme une décision juridique : vérifiez tout bien précis dans la NHLE et auprès de l'autorité locale de planification.", 'Good+ primary schools within 2km': @@ -188,7 +188,7 @@ export const details: Record> = { 'Street tree density percentile': 'Ungefähre Baumkronenbedeckung rund um den Postleitzahlen-Zentroiden aus der Forest-Research-Karte Trees Outside Woodland 2025. Baumkronen-Polygone für Einzelbäume und Baumgruppen werden im Umkreis von 50 m um jeden Postleitzahlen-Zentroiden gezählt und dann in ein Perzentil über englische Postleitzahlen umgerechnet. 
Dies ist ein Näherungswert auf Basis des Postleitzahlen-Zentroids, keine exakte Messung für Immobilie oder Straßenabschnitt.', 'Within conservation area': - 'Historic-England-Grenzen für Erhaltungsgebiete, dem repräsentativen Punkt der Postleitzahl zugeordnet. Der nationale Datensatz ist indikativ und nicht rechtsverbindlich; grenznahe Entscheidungen sollten bei der lokalen Planungsbehörde geprüft werden.', + 'Planning-Data-Grenzen für Erhaltungsgebiete, dem repräsentativen Punkt der Postleitzahl zugeordnet. Der nationale Datensatz wird laufend verbessert und kann Duplikate oder unvollständige lokale Abdeckung enthalten; grenznahe Entscheidungen sollten bei der lokalen Planungsbehörde geprüft werden.', 'Listed building': 'Punktdaten zu denkmalgeschützten Gebäuden aus der National Heritage List for England von Historic England, vorsichtig mit Immobilienadressen abgeglichen anhand des Namens des Denkmaleintrags und nahegelegener Postleitzahlkandidaten. Behandle dies als Vorauswahl-Hinweis, nicht als rechtliche Feststellung: Prüfe jede konkrete Immobilie in der NHLE und bei der lokalen Planungsbehörde.', 'Good+ primary schools within 2km': @@ -338,7 +338,7 @@ export const details: Record> = { 'Street tree density percentile': '基于 Forest Research 2025 年 Trees Outside Woodland 地图估算的邮编质心周边树冠覆盖率。系统会统计每个邮编质心 50 米范围内的孤立树木和树群树冠多边形,然后转换为英格兰邮编范围内的百分位。这是邮编质心近似指标,不是精确的房产或道路路段测量。', 'Within conservation area': - 'Historic England 保护区边界,与邮编代表点匹配。全国数据集是指示性而非最终权威;涉及边界的决策应向地方规划部门核实。', + 'Planning Data 保护区边界,与邮编代表点匹配。全国数据集仍在完善中,可能包含重复记录或地方覆盖不完整;涉及边界的决策应向地方规划部门核实。', 'Listed building': 'Historic England 英格兰国家遗产名录(NHLE)中的受保护建筑点位记录,会根据名录条目名称和附近候选邮编,谨慎匹配到房产地址。请把它当作初筛信号,而不是法律认定:具体房产应在 NHLE 和地方规划部门核实。', 'Good+ primary schools within 2km': @@ -480,7 +480,7 @@ export const details: Record> = { 'Street tree density percentile': 'Forest Research के 2025 Trees Outside Woodland नक्शे से निकाला गया पोस्टकोड केंद्र के आसपास का अनुमानित वृक्ष आच्छादन. 
अकेले पेड़ों और पेड़ों के समूहों के वृक्ष-शिखर बहुभुजों को हर पोस्टकोड केंद्र से 50m के भीतर गिना जाता है, फिर इंग्लैंड के पोस्टकोडों के मुकाबले प्रतिशतक में बदला जाता है. यह पोस्टकोड-केंद्र पर आधारित अनुमानक है, किसी संपत्ति या सड़क-खंड की सटीक माप नहीं.', 'Within conservation area': - 'Historic England संरक्षण क्षेत्र सीमाएं पोस्टकोड प्रतिनिधि बिंदु से मिलाई जाती हैं. राष्ट्रीय डेटासेट संकेतक है, अंतिम आधिकारिक नहीं; सीमा-संवेदनशील निर्णय स्थानीय योजना प्राधिकरण से जांचे जाने चाहिए.', + 'Planning Data संरक्षण क्षेत्र सीमाएं पोस्टकोड प्रतिनिधि बिंदु से मिलाई जाती हैं. राष्ट्रीय डेटासेट अभी बेहतर किया जा रहा है और इसमें डुप्लीकेट या अधूरी स्थानीय कवरेज हो सकती है; सीमा-संवेदनशील निर्णय स्थानीय योजना प्राधिकरण से जांचे जाने चाहिए.', 'Listed building': 'Historic England की इंग्लैंड की राष्ट्रीय धरोहर सूची (NHLE) में सूचीबद्ध भवनों के बिंदु रिकॉर्ड, जिन्हें सूचीबद्ध प्रविष्टि के नाम और पास के संभावित पोस्टकोड के आधार पर संपत्ति पते से सावधानी से मिलाया गया है. इसे केवल प्रारंभिक जांच संकेत मानें, कानूनी निर्णय नहीं: किसी भी विशिष्ट संपत्ति को NHLE और स्थानीय योजना प्राधिकरण से सत्यापित करें.', 'Good+ primary schools within 2km': @@ -630,7 +630,7 @@ export const details: Record> = { 'Street tree density percentile': 'A Forest Research 2025-os Trees Outside Woodland térképéből származó hozzávetőleges lombkorona-fedettség az irányítószám-középpont körül. A magányos fák és facsoportok lombkorona-poligonjait minden irányítószám-középpont 50 méteres körzetében számoljuk, majd az angliai irányítószámok közötti percentilissé alakítjuk. Ez az irányítószám-középponton alapuló közelítő mutató, nem pontos ingatlan- vagy utcaszakasz-mérés.', 'Within conservation area': - 'A Historic England műemléki területeinek határai az irányítószám reprezentatív pontjához rendelve. 
Az országos adatállomány tájékoztató jellegű, nem végleges; határérzékeny döntéseknél a helyi tervezési hatóság adatait kell ellenőrizni.', + 'A Planning Data műemléki területeinek határai az irányítószám reprezentatív pontjához rendelve. Az országos adatállomány fejlesztés alatt áll, és tartalmazhat duplikátumokat vagy hiányos helyi lefedettséget; határérzékeny döntéseknél a helyi tervezési hatóság adatait kell ellenőrizni.', 'Listed building': 'A Historic England National Heritage List for England műemlékiépület-pontrekordjai, amelyeket óvatosan egyeztetünk ingatlancímekhez a műemléki bejegyzés neve és a közeli irányítószám-jelöltek alapján. Előszűrési jelzésként kezelendő, nem jogi megállapításként: minden konkrét ingatlant ellenőrizni kell az NHLE-ben és a helyi tervezési hatóságnál.', 'Good+ primary schools within 2km': diff --git a/frontend/src/i18n/locales/de.ts b/frontend/src/i18n/locales/de.ts index c728295..d2e77c8 100644 --- a/frontend/src/i18n/locales/de.ts +++ b/frontend/src/i18n/locales/de.ts @@ -1139,8 +1139,8 @@ const de: Translations = { dsTowOrigin: 'Forest Research / Defra NCEA', dsTowUse: 'Baumkronen-Polygone für Einzelbäume, Baumgruppen und kleine Gehölze in England. Hier verwendet, um Baumdeckungs-Perzentile rund um Postleitzahlen-Zentroide zu schätzen.', - dsConservationAreasName: 'Historic England Conservation Areas (Denkmalschutzgebiete)', - dsConservationAreasOrigin: 'Historic England und lokale Planungsbehörden', + dsConservationAreasName: 'Planning Data Conservation Areas (Denkmalschutzgebiete)', + dsConservationAreasOrigin: 'Planning Data / lokale Planungsbehörden', dsConservationAreasUse: 'Grenzen ausgewiesener Conservation Areas in England. 
Wird genutzt, um zu kennzeichnen, ob der repräsentative Punkt einer Postleitzahl innerhalb eines solchen Denkmalschutzgebiets liegt.', dsListedBuildingsName: 'Historic England denkmalgeschützte Gebäude', diff --git a/frontend/src/i18n/locales/en.ts b/frontend/src/i18n/locales/en.ts index a58ddaf..d7bd749 100644 --- a/frontend/src/i18n/locales/en.ts +++ b/frontend/src/i18n/locales/en.ts @@ -811,8 +811,8 @@ const en = { rooms: 'Rooms:', built: 'Built:', formerCouncil: 'Ex-council:', - exCouncilBadge: 'Maybe ex-council house', - listedBuildingBadge: 'Maybe listed', + exCouncilBadge: 'Likely ex-council house', + listedBuildingBadge: 'Likely listed', epcRating: 'EPC rating:', epcPotential: 'EPC potential:', renovations: 'Renovations', @@ -1113,8 +1113,8 @@ const en = { dsTowOrigin: 'Forest Research / Defra NCEA', dsTowUse: 'Tree canopy polygons for lone trees, groups of trees, and small woodlands in England. Used here to estimate tree coverage percentiles around postcode centroids.', - dsConservationAreasName: 'Historic England Conservation Areas', - dsConservationAreasOrigin: 'Historic England and local planning authorities', + dsConservationAreasName: 'Planning Data Conservation Areas', + dsConservationAreasOrigin: 'Planning Data / local planning authorities', dsConservationAreasUse: 'Designated conservation area boundaries for England. Used to flag whether a postcode representative point falls within a conservation area.', dsListedBuildingsName: 'Historic England Listed Buildings', diff --git a/frontend/src/i18n/locales/fr.ts b/frontend/src/i18n/locales/fr.ts index 576d097..c8e2861 100644 --- a/frontend/src/i18n/locales/fr.ts +++ b/frontend/src/i18n/locales/fr.ts @@ -1148,8 +1148,8 @@ const fr: Translations = { dsTowOrigin: 'Forest Research / Defra NCEA', dsTowUse: 'Polygones de couvert arboré pour les arbres isolés, groupes d’arbres et petits bois en Angleterre. 
Utilisés ici pour estimer les percentiles de couvert arboré autour des centroïdes de codes postaux.', - dsConservationAreasName: 'Zones de conservation de Historic England', - dsConservationAreasOrigin: 'Historic England et autorités locales de planification', + dsConservationAreasName: 'Zones de conservation de Planning Data', + dsConservationAreasOrigin: 'Planning Data / autorités locales de planification', dsConservationAreasUse: 'Limites des zones de conservation désignées en Angleterre. Utilisées pour indiquer si le point représentatif d’un code postal se trouve dans une zone de conservation.', dsListedBuildingsName: 'Bâtiments classés Historic England', diff --git a/frontend/src/i18n/locales/hi.ts b/frontend/src/i18n/locales/hi.ts index 42707f5..8904241 100644 --- a/frontend/src/i18n/locales/hi.ts +++ b/frontend/src/i18n/locales/hi.ts @@ -1091,8 +1091,8 @@ const hi: Translations = { dsTowOrigin: 'Forest Research / Defra NCEA', dsTowUse: 'इंग्लैंड में अकेले पेड़ों, पेड़ों के समूहों और छोटे वन क्षेत्रों के वृक्ष आच्छादन बहुभुज. यहां पोस्टकोड केंद्रों के आसपास वृक्ष आच्छादन प्रतिशतक का अनुमान लगाने के लिए उपयोग किया गया है.', - dsConservationAreasName: 'Historic England संरक्षण क्षेत्र', - dsConservationAreasOrigin: 'Historic England और स्थानीय योजना प्राधिकरण', + dsConservationAreasName: 'Planning Data संरक्षण क्षेत्र', + dsConservationAreasOrigin: 'Planning Data / स्थानीय योजना प्राधिकरण', dsConservationAreasUse: 'इंग्लैंड में नामित संरक्षण क्षेत्रों की सीमाएं. 
इसका उपयोग यह दिखाने के लिए किया जाता है कि पोस्टकोड का प्रतिनिधि बिंदु संरक्षण क्षेत्र में आता है या नहीं.', dsListedBuildingsName: 'Historic England सूचीबद्ध भवन', diff --git a/frontend/src/i18n/locales/hu.ts b/frontend/src/i18n/locales/hu.ts index d5e51d7..30c7e43 100644 --- a/frontend/src/i18n/locales/hu.ts +++ b/frontend/src/i18n/locales/hu.ts @@ -1134,8 +1134,8 @@ const hu: Translations = { dsTowOrigin: 'Forest Research / Defra NCEA', dsTowUse: 'Fák lombkorona-poligonjai magányos fákhoz, facsoportokhoz és kisebb erdőfoltokhoz Angliában. Itt az irányítószám-középpontok körüli lombkorona-fedettségi percentilisek becslésére használjuk.', - dsConservationAreasName: 'Historic England műemlékvédelmi területek', - dsConservationAreasOrigin: 'Historic England és helyi tervezési hatóságok', + dsConservationAreasName: 'Planning Data műemlékvédelmi területek', + dsConservationAreasOrigin: 'Planning Data / helyi tervezési hatóságok', dsConservationAreasUse: 'Anglia kijelölt műemlékvédelmi területeinek határai. 
Annak jelzésére használjuk, hogy egy irányítószám reprezentatív pontja ilyen területre esik-e.', dsListedBuildingsName: 'Historic England műemlék épületek', diff --git a/frontend/src/i18n/locales/zh.ts b/frontend/src/i18n/locales/zh.ts index aea7fcf..43fce84 100644 --- a/frontend/src/i18n/locales/zh.ts +++ b/frontend/src/i18n/locales/zh.ts @@ -1064,8 +1064,8 @@ const zh: Translations = { dsTowOrigin: 'Forest Research / Defra NCEA', dsTowUse: '英格兰孤立树木、树群和小片林地的树冠多边形。此处用于估算邮编质心周围的树冠覆盖率百分位。', - dsConservationAreasName: 'Historic England 保护区', - dsConservationAreasOrigin: 'Historic England 和地方规划部门', + dsConservationAreasName: 'Planning Data 保护区', + dsConservationAreasOrigin: 'Planning Data / 地方规划部门', dsConservationAreasUse: '英格兰指定保护区边界。用于标记邮编代表点是否位于保护区内。', dsListedBuildingsName: 'Historic England 登录建筑', dsListedBuildingsOrigin: 'Historic England 英格兰国家遗产名录', diff --git a/frontend/src/lib/basemaps.ts b/frontend/src/lib/basemaps.ts new file mode 100644 index 0000000..b241af5 --- /dev/null +++ b/frontend/src/lib/basemaps.ts @@ -0,0 +1,19 @@ +export const BASEMAP_IDS = ['standard', 'satellite'] as const; + +export type BasemapId = (typeof BASEMAP_IDS)[number]; + +export interface BasemapDefinition { + id: BasemapId; + label: string; +} + +export const BASEMAPS: BasemapDefinition[] = [ + { id: 'standard', label: 'Map' }, + { id: 'satellite', label: 'Satellite' }, +]; + +const BASEMAP_ID_SET = new Set(BASEMAP_IDS); + +export function isBasemapId(value: string): value is BasemapId { + return BASEMAP_ID_SET.has(value); +} diff --git a/frontend/src/lib/consts.ts b/frontend/src/lib/consts.ts index 538a5bc..200a632 100644 --- a/frontend/src/lib/consts.ts +++ b/frontend/src/lib/consts.ts @@ -11,7 +11,7 @@ export const COLOR_RANGE_HIGH_PERCENTILE = 95; export const MAP_BOUNDS: [number, number, number, number] = [-9.5, 49, 5, 57]; export const MAP_MIN_ZOOM = 5.5; -export const BUFFER_MULTIPLIER = 1.5; +export const BUFFER_MULTIPLIER = 1; /** Demo free zone bounds (south, west, north, 
east) — must match server FREE_ZONE_BOUNDS */ export const FREE_ZONE_BOUNDS = { south: 51.44, west: -0.31, north: 51.59, east: 0.05 }; diff --git a/frontend/src/lib/map-utils.test.ts b/frontend/src/lib/map-utils.test.ts index b82e1b6..bb3b584 100644 --- a/frontend/src/lib/map-utils.test.ts +++ b/frontend/src/lib/map-utils.test.ts @@ -6,6 +6,7 @@ import { DENSITY_GRADIENT, ENUM_PALETTE, FEATURE_GRADIENT, + BUFFER_MULTIPLIER, MAP_BOUNDS, POI_CATEGORY_LOGOS, SMALLEST_VISIBLE_HEXAGON_RESOLUTION, @@ -15,6 +16,7 @@ import { enumIndexToColor, getBoundsFromViewState, getBoundsWithBottomScreenInset, + getVisibleBoundsFromViewState, getLatitudeAtVerticalPixelOffset, getFeatureFillColor, getMapCenterForTargetScreenPoint, @@ -31,17 +33,33 @@ describe('map utilities', () => { expect(SMALLEST_VISIBLE_HEXAGON_RESOLUTION).toBe(9); }); - it('computes buffered bounds around a view state', () => { - const bounds = getBoundsFromViewState( - { latitude: 51.5, longitude: -0.1, zoom: 12, pitch: 0 }, - 1200, - 800 - ); + it('computes exact viewport bounds by default', () => { + const viewState = { latitude: 51.5, longitude: -0.1, zoom: 12, pitch: 0 }; + const bounds = getBoundsFromViewState(viewState, 1200, 800); + const exactBounds = getBoundsFromViewState(viewState, 1200, 800, 1); + const bufferedBounds = getBoundsFromViewState(viewState, 1200, 800, 1.5); + expect(BUFFER_MULTIPLIER).toBe(1); + expect(bounds).toEqual(exactBounds); expect(bounds.south).toBeLessThan(51.5); expect(bounds.north).toBeGreaterThan(51.5); expect(bounds.west).toBeLessThan(-0.1); expect(bounds.east).toBeGreaterThan(-0.1); + expect(bufferedBounds.south).toBeLessThan(bounds.south); + expect(bufferedBounds.north).toBeGreaterThan(bounds.north); + expect(bufferedBounds.west).toBeLessThan(bounds.west); + expect(bufferedBounds.east).toBeGreaterThan(bounds.east); + }); + + it('excludes mobile bottom-sheet covered map area from visible bounds', () => { + const viewState = { latitude: 51.5, longitude: -0.1, zoom: 12, pitch: 
0 }; + const fullBounds = getVisibleBoundsFromViewState(viewState, 390, 800, 0); + const visibleBounds = getVisibleBoundsFromViewState(viewState, 390, 800, 352); + + expect(visibleBounds.west).toBeCloseTo(fullBounds.west, 6); + expect(visibleBounds.east).toBeCloseTo(fullBounds.east, 6); + expect(visibleBounds.north).toBeCloseTo(fullBounds.north, 6); + expect(visibleBounds.south).toBeGreaterThan(fullBounds.south); }); it('moves the map center so a target lands in the requested screen position', () => { diff --git a/frontend/src/lib/map-utils.ts b/frontend/src/lib/map-utils.ts index a893fc3..2730820 100644 --- a/frontend/src/lib/map-utils.ts +++ b/frontend/src/lib/map-utils.ts @@ -1,6 +1,7 @@ import type { ViewState, Bounds } from '../types'; import type { StyleSpecification } from 'maplibre-gl'; import { layers, namedFlavor } from '@protomaps/basemaps'; +import type { BasemapId } from './basemaps'; import { GLYPHS_URL, FEATURE_GRADIENT, @@ -9,11 +10,19 @@ import { TWEMOJI_BASE, BUFFER_MULTIPLIER, POI_CATEGORY_LOGOS, + MAP_MIN_ZOOM, type GradientStop, } from './consts'; const ROAD_OPACITY = 0.4; const TILE_SIZE = 512; const MAX_MERCATOR_LATITUDE = 85; +const SATELLITE_MAX_ZOOM = 13; +const SATELLITE_ATTRIBUTION = + 'Sentinel-2 cloudless - https://s2maps.eu by EOX IT Services GmbH (Contains modified Copernicus Sentinel data 2024)'; + +export function getMapDataBeforeId(basemap: BasemapId): string { + return basemap === 'satellite' ? 
'roads_runway' : 'landuse_park'; +} function clampLatitude(latitude: number): number { return Math.max(-MAX_MERCATOR_LATITUDE, Math.min(MAX_MERCATOR_LATITUDE, latitude)); @@ -66,10 +75,52 @@ export function getMapCenterForTargetScreenPoint( }; } -export function getMapStyle(theme: 'light' | 'dark'): StyleSpecification { +function isSatelliteReferenceLayer(layer: ReturnType[number]): boolean { + if (layer.type === 'symbol') return true; + if (layer.type !== 'line') return false; + return ( + layer.id.startsWith('roads_') || + layer.id.startsWith('boundaries') || + layer.id.startsWith('water_') + ); +} + +function satelliteReferenceLayer(layer: ReturnType[number]) { + if (layer.type === 'symbol') { + return { + ...layer, + paint: { + ...layer.paint, + 'text-color': '#f8fafc', + 'text-halo-color': '#111827', + 'text-halo-width': 1.6, + 'text-halo-blur': 0.3, + 'icon-opacity': 0.9, + }, + }; + } + + if (layer.type === 'line') { + const isCasing = layer.id.includes('casing'); + const isBoundary = layer.id.startsWith('boundaries'); + return { + ...layer, + paint: { + ...layer.paint, + 'line-color': isBoundary ? '#f8fafc' : isCasing ? '#111827' : '#f9fafb', + 'line-opacity': isBoundary ? 0.45 : isCasing ? 
0.62 : 0.78, + }, + }; + } + + return layer; +} + +export function getMapStyle(theme: 'light' | 'dark', basemap: BasemapId): StyleSpecification { const flavor = namedFlavor(theme); // Use absolute URL for tiles - required by MapLibre const tileUrl = `${window.location.origin}/api/tiles/{z}/{x}/{y}`; + const satelliteTileUrl = `${window.location.origin}/api/tiles/satellite/{z}/{x}/{y}`; const baseLayers = layers('protomaps', flavor, { lang: 'en' }); const isDark = theme === 'dark'; @@ -105,6 +156,50 @@ export function getMapStyle(theme: 'light' | 'dark'): StyleSpecification { return layer; }); + if (basemap === 'satellite') { + return { + version: 8, + sprite: `${window.location.origin}/assets/sprites/${theme}`, + glyphs: GLYPHS_URL, + sources: { + satellite: { + type: 'raster', + tiles: [satelliteTileUrl], + tileSize: 256, + minzoom: MAP_MIN_ZOOM, + maxzoom: SATELLITE_MAX_ZOOM, + attribution: SATELLITE_ATTRIBUTION, + }, + protomaps: { + type: 'vector', + tiles: [tileUrl], + maxzoom: 15, + }, + }, + layers: [ + { + id: 'satellite-background', + type: 'background', + paint: { + 'background-color': isDark ? '#111827' : '#d4cec3', + }, + }, + { + id: 'satellite-raster', + type: 'raster', + source: 'satellite', + paint: { + 'raster-fade-duration': 120, + 'raster-brightness-min': isDark ? 0.08 : 0, + 'raster-brightness-max': isDark ? 0.86 : 1, + 'raster-contrast': isDark ? 
0.08 : 0.03, + }, + }, + ...modifiedLayers.filter(isSatelliteReferenceLayer).map(satelliteReferenceLayer), + ], + } as StyleSpecification; + } + return { version: 8, sprite: `${window.location.origin}/assets/sprites/${theme}`, @@ -209,15 +304,16 @@ export function zoomToResolution(zoom: number): number { export function getBoundsFromViewState( viewState: ViewState, width: number, - height: number + height: number, + bufferMultiplier: number = BUFFER_MULTIPLIER ): Bounds { const { longitude, latitude, zoom } = viewState; const clampedLat = clampLatitude(latitude); const scale = Math.pow(2, zoom); const worldSize = TILE_SIZE * scale; - const bufferedWidth = width * BUFFER_MULTIPLIER; - const bufferedHeight = height * BUFFER_MULTIPLIER; + const bufferedWidth = width * bufferMultiplier; + const bufferedHeight = height * bufferMultiplier; const degreesPerPixelLng = 360 / worldSize; const halfWidthDeg = (bufferedWidth / 2) * degreesPerPixelLng; @@ -235,6 +331,58 @@ export function getBoundsFromViewState( return { south, west, north, east }; } +export function getBoundsFromScreenRect( + viewState: ViewState, + width: number, + height: number, + rect: { left?: number; top?: number; right?: number; bottom?: number } = {} +): Bounds { + const { longitude, latitude, zoom } = viewState; + const worldSize = TILE_SIZE * Math.pow(2, zoom); + const centerPixelX = longitudeToWorldX(longitude, worldSize); + const centerPixelY = latitudeToWorldY(clampLatitude(latitude), worldSize); + + const left = Math.min(rect.left ?? 0, rect.right ?? width); + const right = Math.max(rect.left ?? 0, rect.right ?? width); + const top = Math.min(rect.top ?? 0, rect.bottom ?? height); + const bottom = Math.max(rect.top ?? 0, rect.bottom ?? 
height); + + const longitudeAtX = (screenX: number) => { + const worldX = centerPixelX + screenX - width / 2; + const rawLongitude = (worldX / worldSize) * 360 - 180; + return Math.max(-180, Math.min(180, rawLongitude)); + }; + const latitudeAtY = (screenY: number) => { + const worldY = centerPixelY + screenY - height / 2; + return Math.max( + -MAX_MERCATOR_LATITUDE, + Math.min(MAX_MERCATOR_LATITUDE, worldYToLatitude(worldY, worldSize)) + ); + }; + + const west = longitudeAtX(left); + const east = longitudeAtX(right); + const topLatitude = latitudeAtY(top); + const bottomLatitude = latitudeAtY(bottom); + + return { + south: Math.min(topLatitude, bottomLatitude), + west: Math.min(west, east), + north: Math.max(topLatitude, bottomLatitude), + east: Math.max(west, east), + }; +} + +export function getVisibleBoundsFromViewState( + viewState: ViewState, + width: number, + height: number, + bottomScreenInset: number = 0 +): Bounds { + const visibleBottom = height - Math.max(0, Math.min(height, bottomScreenInset)); + return getBoundsFromScreenRect(viewState, width, height, { bottom: visibleBottom }); +} + export function getLatitudeAtVerticalPixelOffset( latitude: number, zoom: number, diff --git a/frontend/src/lib/overlays.ts b/frontend/src/lib/overlays.ts index c9e4dec..5f9add4 100644 --- a/frontend/src/lib/overlays.ts +++ b/frontend/src/lib/overlays.ts @@ -6,6 +6,7 @@ export interface OverlayDefinition { id: OverlayId; label: string; description: string; + detail: string; } export const OVERLAYS: OverlayDefinition[] = [ @@ -13,16 +14,22 @@ export const OVERLAYS: OverlayDefinition[] = [ id: 'noise', label: 'Noise', description: 'High-resolution Defra Lden noise raster', + detail: + 'Defra Strategic Noise Mapping Round 4 (2022), combining road, rail, and airport sources. Values are the EU-standard Lden metric (day-evening-night 24-hour weighted average), modelled on a 10m grid at 4m above ground. Brighter areas indicate higher modelled noise. 
Licensed under the Open Government Licence v3.0.', }, { id: 'crime-hotspots', label: 'Crime hotspots', description: 'Approximate police.uk street-crime heatmap', + detail: + 'Client-side heatmap of street-level crimes published by police.uk over the most recent months. Police.uk coordinates are anonymised snap-to-grid points, not exact offence locations, so the heatmap should be read as an approximation of relative density rather than a precise map of incidents.', }, { id: 'trees-outside-woodlands', label: 'Trees', description: 'Trees Outside Woodland canopy polygons', + detail: + 'Forest Research Trees Outside Woodland (TOW) v1 canopy polygons covering lone trees and groups of trees outside mapped woodland blocks. Useful for spotting tree-lined streets and green pockets that broader land-use layers miss. Polygon opacity scales with canopy area.', }, ]; diff --git a/frontend/src/lib/poi-distance-filter.test.ts b/frontend/src/lib/poi-distance-filter.test.ts index 3320142..fdd581d 100644 --- a/frontend/src/lib/poi-distance-filter.test.ts +++ b/frontend/src/lib/poi-distance-filter.test.ts @@ -1,11 +1,14 @@ import { describe, expect, it } from 'vitest'; -import type { FeatureMeta } from '../types'; +import type { FeatureFilters, FeatureMeta } from '../types'; import { POI_COUNT_2KM_FILTER_NAME, POI_DISTANCE_FILTER_NAME, TRANSPORT_DISTANCE_FILTER_NAME, clampPoiFilterRange, + createPoiFilterKey, + createPoiDistanceFilterKey, + getActiveAmenityFilterFeatureNames, getPoiFilterFeatureOptions, getPoiFilterName, } from './poi-distance-filter'; @@ -60,6 +63,20 @@ describe('poi-distance-filter', () => { expect(getPoiFilterName('Number of amenities (Bus stop) within 2km')).toBeNull(); }); + it('extracts only active non-transport amenity backend feature names', () => { + const cafeDistance = 'Distance to nearest amenity (Cafe) (km)'; + const parkCount = 'Number of amenities (Park) within 2km'; + const busStopDistance = 'Distance to nearest amenity (Bus stop) (km)'; + const 
filters: FeatureFilters = { + [createPoiDistanceFilterKey(cafeDistance, 0)]: [0, 1], + [createPoiFilterKey(POI_COUNT_2KM_FILTER_NAME, parkCount, 1)]: [2, 10], + [createPoiFilterKey(TRANSPORT_DISTANCE_FILTER_NAME, busStopDistance, 2)]: [0, 0.5], + Price: [0, 500000], + }; + + expect([...getActiveAmenityFilterFeatureNames(filters)]).toEqual([cafeDistance, parkCount]); + }); + it('clamps fixed amenity distance scales to the 0-5km slider bounds', () => { const feature = numeric('Distance to nearest amenity (Cafe) (km)', { absolute: true, diff --git a/frontend/src/lib/poi-distance-filter.ts b/frontend/src/lib/poi-distance-filter.ts index 6875d2d..edadee1 100644 --- a/frontend/src/lib/poi-distance-filter.ts +++ b/frontend/src/lib/poi-distance-filter.ts @@ -203,6 +203,20 @@ export function getPoiDistanceFeatureName(name: string): string | null { return parsePoiFilterKey(name); } +export function getActiveAmenityFilterFeatureNames(filters: FeatureFilters): Set { + const names = new Set(); + + for (const name of Object.keys(filters)) { + const filterName = getPoiFilterName(name); + if (!filterName || filterName === TRANSPORT_DISTANCE_FILTER_NAME) continue; + + const featureName = getPoiDistanceFeatureName(name); + if (featureName) names.add(featureName); + } + + return names; +} + export function replacePoiFilterKeySelection(key: string, featureName: string): string { const filterName = getPoiFilterName(key) ?? 
diff --git a/frontend/src/lib/url-state.test.ts b/frontend/src/lib/url-state.test.ts index 80ee2cf..a8219c9 100644 --- a/frontend/src/lib/url-state.test.ts +++ b/frontend/src/lib/url-state.test.ts @@ -173,6 +173,30 @@ describe('url-state', () => { expect(state.overlays).toEqual(new Set(['noise', 'crime-hotspots'])); }); + it('round-trips satellite basemap selection', () => { + const params = stateToParams( + null, + {}, + [], + new Set(), + 'area', + undefined, + undefined, + undefined, + 'satellite' + ); + + expect(params.get('basemap')).toBe('satellite'); + + window.history.replaceState({}, '', `/?${params.toString()}`); + const state = parseUrlState(); + + expect(state.basemap).toBe('satellite'); + + window.history.replaceState({}, '', '/?basemap=unknown'); + expect(parseUrlState().basemap).toBe('standard'); + }); + it('round-trips repeated school filters with dedicated URL params', () => { const schoolOne = createSchoolFilterKey('primary', 'good', 2, 1); const schoolTwo = createSchoolFilterKey('secondary', 'outstanding', 5, 2); diff --git a/frontend/src/lib/url-state.ts b/frontend/src/lib/url-state.ts index a420054..11af321 100644 --- a/frontend/src/lib/url-state.ts +++ b/frontend/src/lib/url-state.ts @@ -50,6 +50,7 @@ import { } from './poi-distance-filter'; import { dedupeTravelTimeEntries } from './travel-params'; import { isOverlayId, type OverlayId } from './overlays'; +import { isBasemapId, type BasemapId } from './basemaps'; const POI_NONE_PARAM = '__none'; @@ -58,6 +59,7 @@ export interface UrlState { filters: FeatureFilters; poiCategories: Set; overlays: Set; + basemap: BasemapId; tab: 'properties' | 'area'; travelTime?: TravelTimeInitial; postcode?: string; @@ -213,6 +215,7 @@ export function parseUrlState(): UrlState { filters: parseFilters(params), poiCategories: new Set(), overlays: new Set(), + basemap: 'standard', tab: 'area', }; @@ -253,6 +256,11 @@ export function parseUrlState(): UrlState { result.overlays = new 
Set(overlayParams.filter(isOverlayId)); } + const basemap = params.get('basemap'); + if (basemap && isBasemapId(basemap)) { + result.basemap = basemap; + } + // Tab: full name const tab = params.get('tab'); if (tab === 'properties' || tab === 'area') { @@ -320,7 +328,8 @@ export function stateToParams( rightPaneTab: 'properties' | 'area', travelTimeEntries?: TravelTimeEntry[], share?: string, - selectedOverlays?: Set + selectedOverlays?: Set, + basemap?: BasemapId ): URLSearchParams { const params = new URLSearchParams(); @@ -409,6 +418,10 @@ export function stateToParams( } } + if (basemap && basemap !== 'standard') { + params.set('basemap', basemap); + } + // Travel time: repeated `tt` params if (travelTimeEntries) { for (const entry of dedupeTravelTimeEntries(travelTimeEntries)) { diff --git a/frontend/src/types.ts b/frontend/src/types.ts index f606ab8..08aa2d7 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -85,6 +85,7 @@ export interface MapFlyToOptions { export interface ViewChangeParams { resolution: number; bounds: Bounds; + visibleBounds: Bounds; zoom: number; latitude: number; longitude: number; diff --git a/pipeline/download/conservation_areas.py b/pipeline/download/conservation_areas.py index 79c0f42..2e7873d 100644 --- a/pipeline/download/conservation_areas.py +++ b/pipeline/download/conservation_areas.py @@ -1,6 +1,6 @@ -"""Download Historic England conservation area polygons. +"""Download Planning Data conservation area polygons. 
-Source: Historic England Conservation Areas +Source: https://www.planning.data.gov.uk/dataset/conservation-area License: Open Government Licence v3.0 """ @@ -9,38 +9,60 @@ from pathlib import Path import httpx import pyogrio +from shapely import from_wkb -URL = ( - "https://opendata-historicengland.hub.arcgis.com/api/download/v1/items/" - "446bc9bf8b5b440386d0c504caa3dac5/geoPackage?layers=0" -) +URL = "https://files.planning.data.gov.uk/dataset/conservation-area.geojson" + + +def _geometry_column(metadata: dict, column_names: list[str]) -> str: + geometry_name = metadata.get("geometry_name") + if geometry_name: + return str(geometry_name) + for name in ("wkb_geometry", "geometry", "geom"): + if name in column_names: + return name + return column_names[-1] + + +def _validate_conservation_areas(path: Path) -> int: + info = pyogrio.read_info(path) + features = info.get("features", 0) + if features <= 0: + raise ValueError("Downloaded conservation areas file contains no features") + + metadata, table = pyogrio.read_arrow(path, columns=[], read_geometry=True) + geometry_name = _geometry_column(metadata, table.column_names) + geometries = from_wkb(table[geometry_name].combine_chunks().to_pylist()) + polygon_count = sum( + geom is not None + and not geom.is_empty + and geom.geom_type in {"Polygon", "MultiPolygon"} + for geom in geometries + ) + if polygon_count <= 0: + raise ValueError("Downloaded conservation areas file contains no polygons") + return int(features) def main() -> None: parser = argparse.ArgumentParser( - description="Download Historic England conservation area polygons" + description="Download Planning Data conservation area polygons" ) parser.add_argument( - "--output", type=Path, required=True, help="Output GeoPackage file path" + "--output", type=Path, required=True, help="Output GeoJSON file path" ) args = parser.parse_args() args.output.parent.mkdir(parents=True, exist_ok=True) tmp_path = 
args.output.with_name(f"{args.output.stem}.tmp{args.output.suffix}") - print("Downloading Historic England conservation areas...") + print("Downloading Planning Data conservation areas...") with httpx.stream("GET", URL, follow_redirects=True, timeout=300) as response: response.raise_for_status() with tmp_path.open("wb") as fh: for chunk in response.iter_bytes(): fh.write(chunk) - info = pyogrio.read_info(tmp_path) - features = info.get("features", 0) - geometry_type = info.get("geometry_type") - if features <= 0: - raise ValueError("Downloaded conservation areas file contains no features") - if "Polygon" not in str(geometry_type): - raise ValueError(f"Expected polygon geometry, got {geometry_type!r}") + features = _validate_conservation_areas(tmp_path) tmp_path.replace(args.output) size_mb = args.output.stat().st_size / (1024 * 1024) diff --git a/pipeline/download/inspire.py b/pipeline/download/inspire.py index 72b2e8b..c32e531 100644 --- a/pipeline/download/inspire.py +++ b/pipeline/download/inspire.py @@ -8,15 +8,61 @@ License: INSPIRE End User Licence """ import argparse -import re from concurrent.futures import ThreadPoolExecutor, as_completed +from html.parser import HTMLParser from pathlib import Path +import time +import zipfile +from urllib.parse import urljoin, urlparse import httpx from tqdm import tqdm -BASE = "https://use-land-property-data.service.gov.uk" -INDEX_URL = f"{BASE}/datasets/inspire/download" +BASE_URL = "https://use-land-property-data.service.gov.uk" +INDEX_URL = f"{BASE_URL}/datasets/inspire/download" +HEADERS = { + "User-Agent": "Mozilla/5.0 (compatible; perfect-postcode-data-pipeline/1.0)" +} +CHUNK_SIZE = 1024 * 1024 +MAX_ATTEMPTS = 5 +BACKOFF_BASE = 2.0 + + +class ZipLinkParser(HTMLParser): + """Collect links to Land Registry INSPIRE ZIP downloads.""" + + def __init__(self, base_url: str) -> None: + super().__init__() + self.base_url = base_url + self.base_netloc = urlparse(base_url).netloc + self.urls: set[str] = set() + + def 
handle_starttag( + self, tag: str, attrs: list[tuple[str, str | None]] + ) -> None: + if tag != "a": + return + + href = dict(attrs).get("href") + if not href: + return + + url = urljoin(self.base_url, href) + parsed = urlparse(url) + if ( + parsed.scheme in {"http", "https"} + and parsed.netloc == self.base_netloc + and parsed.path.startswith("/datasets/inspire/download/") + and parsed.path.endswith(".zip") + ): + self.urls.add(parsed._replace(query="", fragment="").geturl()) + + +def parse_zip_urls(html: str, base_url: str = BASE_URL) -> list[str]: + """Parse the INSPIRE download page for all council ZIP URLs.""" + parser = ZipLinkParser(base_url) + parser.feed(html) + return sorted(parser.urls) def get_zip_urls() -> list[str]: @@ -25,28 +71,76 @@ def get_zip_urls() -> list[str]: with httpx.Client( follow_redirects=True, timeout=httpx.Timeout(30.0, read=60), - headers={"User-Agent": "Mozilla/5.0", "Accept": "text/html"}, + headers={**HEADERS, "Accept": "text/html"}, ) as client: resp = client.get(INDEX_URL) resp.raise_for_status() html = resp.text - pattern = r'href="(/datasets/inspire/download/[^"]+\.zip)"' - paths = sorted(set(re.findall(pattern, html))) - return [f"{BASE}{p}" for p in paths] + urls = parse_zip_urls(html) + if not urls: + raise RuntimeError(f"No INSPIRE ZIP links found at {INDEX_URL}") + return urls -def download_one(url: str, output_dir: Path, client: httpx.Client) -> str: +def _is_valid_zip(path: Path) -> bool: + return path.exists() and zipfile.is_zipfile(path) + + +def _stream_download(url: str, output_path: Path, *, timeout: float) -> None: + with httpx.stream( + "GET", + url, + follow_redirects=True, + timeout=httpx.Timeout(30.0, read=timeout), + headers=HEADERS, + ) as response: + response.raise_for_status() + with output_path.open("wb") as out: + for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE): + out.write(chunk) + + +def download_one( + url: str, + output_dir: Path, + *, + force: bool = False, + timeout: float = 600, +) -> str: 
"""Download a single ZIP file. Returns the filename.""" - name = url.rsplit("/", 1)[-1] - dest = output_dir / name - if dest.exists(): - return f"{name} (skipped, exists)" + name = Path(urlparse(url).path).name + if not name.endswith(".zip"): + raise ValueError(f"Expected a ZIP download URL, got {url}") - resp = client.get(url) - resp.raise_for_status() - dest.write_bytes(resp.content) - return name + output_dir.mkdir(parents=True, exist_ok=True) + dest = output_dir / name + if not force and _is_valid_zip(dest): + return f"{name} (skipped, valid ZIP exists)" + + tmp = dest.with_suffix(dest.suffix + ".tmp") + last_exc: Exception | None = None + try: + for attempt in range(1, MAX_ATTEMPTS + 1): + tmp.unlink(missing_ok=True) + try: + _stream_download(url, tmp, timeout=timeout) + if not _is_valid_zip(tmp): + raise RuntimeError( + f"{name} did not download as a valid ZIP" + ) + tmp.replace(dest) + return name + except (httpx.HTTPError, OSError) as exc: + last_exc = exc + if attempt < MAX_ATTEMPTS: + time.sleep(BACKOFF_BASE ** (attempt - 1)) + finally: + tmp.unlink(missing_ok=True) + + raise RuntimeError( + f"{name} failed after {MAX_ATTEMPTS} attempts" + ) from last_exc def main() -> None: @@ -65,32 +159,61 @@ def main() -> None: default=8, help="Number of parallel downloads (default: 8)", ) + parser.add_argument( + "--force", + action="store_true", + help="Re-download files even when a valid ZIP already exists", + ) + parser.add_argument( + "--timeout", + type=float, + default=600, + help="Per-file read timeout in seconds (default: 600)", + ) args = parser.parse_args() + if args.workers < 1: + raise SystemExit("--workers must be at least 1") + args.output.mkdir(parents=True, exist_ok=True) print("Fetching download index...") urls = get_zip_urls() print(f"Found {len(urls)} files to download") - with ( - httpx.Client( - follow_redirects=True, - timeout=httpx.Timeout(30.0, read=120), - headers={"User-Agent": "Mozilla/5.0"}, - ) as client, - tqdm(total=len(urls), 
unit="file") as pbar, - ): + failures: list[tuple[str, Exception]] = [] + with tqdm(total=len(urls), unit="file") as pbar: with ThreadPoolExecutor(max_workers=args.workers) as pool: futures = { - pool.submit(download_one, url, args.output, client): url for url in urls + pool.submit( + download_one, + url, + args.output, + force=args.force, + timeout=args.timeout, + ): url + for url in urls } for future in as_completed(futures): - result = future.result() - pbar.set_postfix_str(result[:40]) + try: + result = future.result() + pbar.set_postfix_str(result[:40]) + except Exception as exc: # noqa: BLE001 + failures.append((futures[future], exc)) + pbar.set_postfix_str("FAILED") pbar.update(1) - print(f"Done. {len(urls)} files in {args.output}") + succeeded = len(urls) - len(failures) + print(f"Done. {succeeded}/{len(urls)} files in {args.output}") + if failures: + print(f"{len(failures)} file(s) failed:") + for url, exc in failures: + name = Path(urlparse(url).path).name + print(f" - {name}: {exc}") + raise SystemExit( + f"{len(failures)} INSPIRE download(s) failed; " + "re-run to retry only the missing files" + ) if __name__ == "__main__": diff --git a/pipeline/download/listed_buildings.py b/pipeline/download/listed_buildings.py index ea7589b..be39c85 100644 --- a/pipeline/download/listed_buildings.py +++ b/pipeline/download/listed_buildings.py @@ -5,6 +5,7 @@ License: Open Government Licence v3.0 """ import argparse +import time from pathlib import Path import httpx @@ -14,6 +15,8 @@ URL = ( "https://opendata-historicengland.hub.arcgis.com/api/download/v1/items/" "767f279327a24845bf47dfe5eae9862b/geoPackage?layers=0" ) +POLL_INTERVAL_S = 5 +POLL_TIMEOUT_S = 600 def main() -> None: @@ -28,11 +31,24 @@ def main() -> None: tmp_path = args.output.with_name(f"{args.output.stem}.tmp{args.output.suffix}") print("Downloading Historic England listed-building points...") - with httpx.stream("GET", URL, follow_redirects=True, timeout=300) as response: - 
response.raise_for_status() - with tmp_path.open("wb") as fh: - for chunk in response.iter_bytes(): - fh.write(chunk) + deadline = time.monotonic() + POLL_TIMEOUT_S + with httpx.Client(follow_redirects=True, timeout=300) as client: + while True: + with client.stream("GET", URL) as response: + if response.status_code == 202: + response.read() + if time.monotonic() > deadline: + raise TimeoutError( + f"Export did not finish within {POLL_TIMEOUT_S}s: " + f"{response.text}" + ) + time.sleep(POLL_INTERVAL_S) + continue + response.raise_for_status() + with tmp_path.open("wb") as fh: + for chunk in response.iter_bytes(): + fh.write(chunk) + break info = pyogrio.read_info(tmp_path) features = info.get("features", 0) diff --git a/pipeline/download/satellite_tiles.py b/pipeline/download/satellite_tiles.py new file mode 100644 index 0000000..f1d2856 --- /dev/null +++ b/pipeline/download/satellite_tiles.py @@ -0,0 +1,432 @@ +"""Download Sentinel-2 cloudless satellite tiles into a local PMTiles archive.""" + +from __future__ import annotations + +import argparse +import email.utils +import http.client +import math +import sqlite3 +import subprocess +import tempfile +import threading +import time +import urllib.error +import urllib.parse +import urllib.request +from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path + +from pipeline.download.tiles import ensure_pmtiles_cli +from pipeline.local_temp import local_tmp_dir + +DEFAULT_TILE_URL = ( + "https://tiles.maps.eox.at/wmts/1.0.0/s2cloudless_3857/default/" + "GoogleMapsCompatible/{z}/{y}/{x}.jpg" +) +DEFAULT_BBOX = (-10.5, 49.0, 5.0, 61.0) +DEFAULT_MIN_ZOOM = 5 +DEFAULT_MAX_ZOOM = 13 +DEFAULT_RETRY_COOLDOWN = 15.0 +USER_AGENT = "perfect-postcode-satellite-tiles/1.0" +RETRYABLE_HTTP_STATUS = {408, 429, 500, 502, 503, 504} +ATTRIBUTION = ( + "Sentinel-2 cloudless - https://s2maps.eu by EOX IT Services 
GmbH " + "(Contains modified Copernicus Sentinel data 2024)" +) + + +@dataclass(frozen=True) +class Tile: + zoom: int + x: int + y: int + + +class _DownloadThrottle: + def __init__(self, min_request_interval: float) -> None: + self._min_request_interval = max(0.0, min_request_interval) + self._next_request_at = 0.0 + self._lock = threading.Lock() + + def wait(self) -> None: + while True: + with self._lock: + now = time.monotonic() + wait_for = self._next_request_at - now + if wait_for <= 0: + if self._min_request_interval: + self._next_request_at = now + self._min_request_interval + return + + time.sleep(min(wait_for, 1.0)) + + def defer(self, delay: float) -> bool: + if delay <= 0: + return False + + target = time.monotonic() + delay + with self._lock: + should_announce = target > self._next_request_at + 1.0 + self._next_request_at = max(self._next_request_at, target) + return should_announce + + +def _lonlat_to_tile(lon: float, lat: float, zoom: int) -> tuple[int, int]: + lat = max(min(lat, 85.05112878), -85.05112878) + n = 1 << zoom + x = int(math.floor((lon + 180.0) / 360.0 * n)) + y = int( + math.floor((1.0 - math.asinh(math.tan(math.radians(lat))) / math.pi) / 2.0 * n) + ) + return min(max(x, 0), n - 1), min(max(y, 0), n - 1) + + +def _tile_ranges( + bbox: tuple[float, float, float, float], zoom: int +) -> tuple[range, range]: + west, south, east, north = bbox + min_x, min_y = _lonlat_to_tile(west, north, zoom) + max_x, max_y = _lonlat_to_tile(east, south, zoom) + return range(min_x, max_x + 1), range(min_y, max_y + 1) + + +def _iter_tiles( + bbox: tuple[float, float, float, float], min_zoom: int, max_zoom: int +): + for zoom in range(min_zoom, max_zoom + 1): + x_range, y_range = _tile_ranges(bbox, zoom) + for x in x_range: + for y in y_range: + yield Tile(zoom=zoom, x=x, y=y) + + +def _tile_count( + bbox: tuple[float, float, float, float], min_zoom: int, max_zoom: int +) -> int: + count = 0 + for zoom in range(min_zoom, max_zoom + 1): + x_range, y_range = 
_tile_ranges(bbox, zoom) + count += len(x_range) * len(y_range) + return count + + +def _is_eox_tile_url(url: str) -> bool: + host = urllib.parse.urlparse(url).hostname or "" + return host == "tiles.maps.eox.at" or host.endswith(".tiles.maps.eox.at") + + +def _retry_after_seconds(headers) -> float | None: + raw = None + if headers is not None: + raw = headers.get("retry-after") or headers.get("Retry-After") + if not raw: + return None + + try: + return max(0.0, float(raw)) + except ValueError: + pass + + try: + retry_at = email.utils.parsedate_to_datetime(raw) + except (TypeError, ValueError): + return None + + if retry_at.tzinfo is None: + retry_at = retry_at.replace(tzinfo=timezone.utc) + return max(0.0, (retry_at - datetime.now(timezone.utc)).total_seconds()) + + +def _http_retry_delay( + err: urllib.error.HTTPError, + url: str, + attempt: int, + retry_cooldown: float, +) -> float | None: + if err.code in {204, 404}: + return None + + retry_after = _retry_after_seconds(err.headers) + if retry_after is not None: + return retry_after + + if err.code == 403 and _is_eox_tile_url(url): + return retry_cooldown + + if err.code in RETRYABLE_HTTP_STATUS: + return min(2.0, 0.25 * (2**attempt)) + + return None + + +def _fetch_tile( + tile: Tile, + source_url: str, + timeout: float, + retries: int, + throttle: _DownloadThrottle, + retry_cooldown: float, +) -> tuple[Tile, bytes | None]: + url = source_url.format(z=tile.zoom, x=tile.x, y=tile.y) + last_error: Exception | None = None + + for attempt in range(retries + 1): + try: + throttle.wait() + req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) + with urllib.request.urlopen(req, timeout=timeout) as response: + content_type = response.headers.get("content-type", "") + data = response.read() + if not data: + return tile, None + if not content_type.lower().startswith("image/"): + raise RuntimeError( + f"Unexpected content type for {url}: {content_type or 'unknown'}" + ) + return tile, data + except 
urllib.error.HTTPError as err: + if err.code in {204, 404}: + return tile, None + retry_delay = _http_retry_delay(err, url, attempt, retry_cooldown) + if retry_delay is None: + raise RuntimeError( + f"Failed to download satellite tile {url}: {err}" + ) from err + last_error = err + except ( + TimeoutError, + urllib.error.URLError, + ConnectionError, + http.client.HTTPException, + RuntimeError, + ) as err: + last_error = err + retry_delay = min(2.0, 0.25 * (2**attempt)) + + if attempt < retries: + if throttle.defer(retry_delay) and retry_delay >= 5.0: + print( + f"Satellite tile source returned {last_error}; " + f"pausing downloads for {retry_delay:.0f}s before retrying", + flush=True, + ) + + assert last_error is not None + raise RuntimeError(f"Failed to download satellite tile {url}: {last_error}") from last_error + + +def _create_mbtiles( + mbtiles_path: Path, + bbox: tuple[float, float, float, float], + min_zoom: int, + max_zoom: int, + source_url: str, + max_workers: int, + timeout: float, + retries: int, + retry_cooldown: float, + min_request_interval: float, +) -> int: + if mbtiles_path.exists(): + mbtiles_path.unlink() + + conn = sqlite3.connect(mbtiles_path) + conn.execute("PRAGMA journal_mode = WAL") + conn.execute("PRAGMA synchronous = NORMAL") + conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)") + conn.execute( + "CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, " + "tile_row INTEGER, tile_data BLOB)" + ) + conn.execute( + "CREATE UNIQUE INDEX tile_index ON tiles (zoom_level, tile_column, tile_row)" + ) + conn.executemany( + "INSERT INTO metadata (name, value) VALUES (?, ?)", + [ + ("name", "Sentinel-2 cloudless satellite basemap"), + ("type", "baselayer"), + ("version", "1"), + ("description", "Sentinel-2 cloudless Web Mercator satellite imagery"), + ("format", "jpg"), + ("attribution", ATTRIBUTION), + ("bounds", ",".join(f"{value:.6f}" for value in bbox)), + ("minzoom", str(min_zoom)), + ("maxzoom", str(max_zoom)), + ], + ) + + 
total = _tile_count(bbox, min_zoom, max_zoom) + inserted = 0 + completed = 0 + submitted = 0 + tiles = iter(_iter_tiles(bbox, min_zoom, max_zoom)) + pending: set[Future[tuple[Tile, bytes | None]]] = set() + queue_size = max_workers * 4 + throttle = _DownloadThrottle(min_request_interval=min_request_interval) + + def submit_next(executor: ThreadPoolExecutor) -> bool: + nonlocal submitted + try: + tile = next(tiles) + except StopIteration: + return False + pending.add( + executor.submit( + _fetch_tile, + tile, + source_url, + timeout, + retries, + throttle, + retry_cooldown, + ) + ) + submitted += 1 + return True + + try: + with ThreadPoolExecutor(max_workers=max_workers) as executor: + for _ in range(queue_size): + if not submit_next(executor): + break + + while pending: + done, pending = wait(pending, return_when=FIRST_COMPLETED) + for future in done: + tile, tile_data = future.result() + completed += 1 + if tile_data is not None: + tms_y = (1 << tile.zoom) - 1 - tile.y + conn.execute( + "INSERT OR REPLACE INTO tiles VALUES (?, ?, ?, ?)", + (tile.zoom, tile.x, tms_y, tile_data), + ) + inserted += 1 + + submit_next(executor) + + if completed % 1000 == 0 or completed == total: + conn.commit() + print( + f"Downloaded {completed:,}/{total:,} satellite tiles " + f"({inserted:,} stored)", + flush=True, + ) + finally: + conn.commit() + conn.close() + + return inserted + + +def build_satellite_tiles( + output_path: Path, + pmtiles_bin: Path, + pmtiles_version: str, + bbox: tuple[float, float, float, float], + min_zoom: int, + max_zoom: int, + source_url: str, + max_workers: int, + timeout: float, + retries: int, + retry_cooldown: float, + min_request_interval: float, +) -> None: + if min_zoom > max_zoom: + raise ValueError("--min-zoom must be <= --max-zoom") + if len(bbox) != 4 or bbox[0] >= bbox[2] or bbox[1] >= bbox[3]: + raise ValueError("--bbox must be west,south,east,north") + + output_path.parent.mkdir(parents=True, exist_ok=True) + ensure_pmtiles_cli(pmtiles_bin, 
pmtiles_version) + + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp: + mbtiles_path = Path(tmp) / "satellite.mbtiles" + tile_count = _create_mbtiles( + mbtiles_path=mbtiles_path, + bbox=bbox, + min_zoom=min_zoom, + max_zoom=max_zoom, + source_url=source_url, + max_workers=max_workers, + timeout=timeout, + retries=retries, + retry_cooldown=retry_cooldown, + min_request_interval=min_request_interval, + ) + if tile_count == 0: + raise RuntimeError("Satellite tile download produced no tiles") + + subprocess.run( + [ + str(pmtiles_bin), + "convert", + str(mbtiles_path), + str(output_path), + "--force", + ], + check=True, + ) + + size_mb = output_path.stat().st_size / (1024 * 1024) + print(f"Wrote {output_path} ({size_mb:.1f} MB)", flush=True) + + +def _parse_bbox(raw: str) -> tuple[float, float, float, float]: + parts = [float(part.strip()) for part in raw.split(",")] + if len(parts) != 4: + raise argparse.ArgumentTypeError("bbox must contain four comma-separated numbers") + return parts[0], parts[1], parts[2], parts[3] + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--output", type=Path, required=True) + parser.add_argument( + "--pmtiles-bin", type=Path, default=Path("property-data/pmtiles") + ) + parser.add_argument("--pmtiles-version", default="1.22.3") + parser.add_argument("--bbox", type=_parse_bbox, default=DEFAULT_BBOX) + parser.add_argument("--min-zoom", type=int, default=DEFAULT_MIN_ZOOM) + parser.add_argument("--max-zoom", type=int, default=DEFAULT_MAX_ZOOM) + parser.add_argument("--source-url", default=DEFAULT_TILE_URL) + parser.add_argument("--max-workers", type=int, default=8) + parser.add_argument("--timeout", type=float, default=20.0) + parser.add_argument("--retries", type=int, default=3) + parser.add_argument( + "--retry-cooldown", + type=float, + default=DEFAULT_RETRY_COOLDOWN, + help="Seconds to pause all workers after an EOX rate-limit response", + ) + parser.add_argument( + 
"--min-request-interval", + type=float, + default=0.0, + help="Minimum seconds between tile requests across all workers", + ) + args = parser.parse_args() + + build_satellite_tiles( + output_path=args.output, + pmtiles_bin=args.pmtiles_bin, + pmtiles_version=args.pmtiles_version, + bbox=args.bbox, + min_zoom=args.min_zoom, + max_zoom=args.max_zoom, + source_url=args.source_url, + max_workers=max(1, args.max_workers), + timeout=args.timeout, + retries=max(0, args.retries), + retry_cooldown=max(0.0, args.retry_cooldown), + min_request_interval=max(0.0, args.min_request_interval), + ) + + +if __name__ == "__main__": + main() diff --git a/pipeline/download/test_inspire.py b/pipeline/download/test_inspire.py new file mode 100644 index 0000000..97db357 --- /dev/null +++ b/pipeline/download/test_inspire.py @@ -0,0 +1,61 @@ +from zipfile import ZipFile + +from pipeline.download import inspire + + +def _write_zip(path): + with ZipFile(path, "w") as archive: + archive.writestr("example.gml", "") + + +def test_parse_zip_urls_finds_relative_and_absolute_links(): + html = """ + Download + Duplicate + Query suffix + Wrong dataset + Wrong host + """ + + urls = inspire.parse_zip_urls(html) + + assert urls == [ + "https://use-land-property-data.service.gov.uk/datasets/inspire/download/Adur_District_Council.zip", + "https://use-land-property-data.service.gov.uk/datasets/inspire/download/Barnsley_Metropolitan_Borough_Council.zip", + ] + + +def test_download_one_skips_existing_valid_zip(monkeypatch, tmp_path): + dest = tmp_path / "Adur_District_Council.zip" + _write_zip(dest) + + def fail_download(*args, **kwargs): + raise AssertionError("download should not run") + + monkeypatch.setattr(inspire, "_stream_download", fail_download) + + result = inspire.download_one( + "https://use-land-property-data.service.gov.uk/datasets/inspire/download/Adur_District_Council.zip", + tmp_path, + ) + + assert result == "Adur_District_Council.zip (skipped, valid ZIP exists)" + + +def 
test_download_one_replaces_invalid_existing_file(monkeypatch, tmp_path): + dest = tmp_path / "Adur_District_Council.zip" + dest.write_text("not a zip") + + def fake_download(url, output_path, *, timeout): + _write_zip(output_path) + + monkeypatch.setattr(inspire, "_stream_download", fake_download) + + result = inspire.download_one( + "https://use-land-property-data.service.gov.uk/datasets/inspire/download/Adur_District_Council.zip", + tmp_path, + ) + + assert result == "Adur_District_Council.zip" + assert inspire._is_valid_zip(dest) + assert not (tmp_path / "Adur_District_Council.zip.tmp").exists() diff --git a/pipeline/download/test_satellite_tiles.py b/pipeline/download/test_satellite_tiles.py new file mode 100644 index 0000000..f64f89e --- /dev/null +++ b/pipeline/download/test_satellite_tiles.py @@ -0,0 +1,97 @@ +import urllib.error + +import pytest + +from pipeline.download import satellite_tiles + + +class _Response: + headers = {"content-type": "image/jpeg"} + + def __init__(self, data: bytes = b"jpeg") -> None: + self._data = data + + def read(self) -> bytes: + return self._data + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, traceback): + return False + + +class _Throttle: + def __init__(self) -> None: + self.deferred: list[float] = [] + self.waits = 0 + + def wait(self) -> None: + self.waits += 1 + + def defer(self, delay: float) -> bool: + self.deferred.append(delay) + return False + + +def _http_error(url: str, code: int) -> urllib.error.HTTPError: + return urllib.error.HTTPError(url, code, "Forbidden", {}, None) + + +def test_fetch_tile_retries_eox_403_with_shared_cooldown(monkeypatch): + tile = satellite_tiles.Tile(zoom=9, x=248, y=172) + calls: list[str] = [] + + def fake_urlopen(req, timeout): + calls.append(req.full_url) + if len(calls) == 1: + raise _http_error(req.full_url, 403) + return _Response() + + monkeypatch.setattr(satellite_tiles.urllib.request, "urlopen", fake_urlopen) + + throttle = _Throttle() + 
fetched_tile, data = satellite_tiles._fetch_tile( + tile, + satellite_tiles.DEFAULT_TILE_URL, + timeout=1.0, + retries=1, + throttle=throttle, + retry_cooldown=15.0, + ) + + assert fetched_tile == tile + assert data == b"jpeg" + assert calls == [ + "https://tiles.maps.eox.at/wmts/1.0.0/s2cloudless_3857/default/" + "GoogleMapsCompatible/9/172/248.jpg", + "https://tiles.maps.eox.at/wmts/1.0.0/s2cloudless_3857/default/" + "GoogleMapsCompatible/9/172/248.jpg", + ] + assert throttle.deferred == [15.0] + assert throttle.waits == 2 + + +def test_fetch_tile_does_not_retry_non_eox_403(monkeypatch): + tile = satellite_tiles.Tile(zoom=9, x=248, y=172) + calls: list[str] = [] + + def fake_urlopen(req, timeout): + calls.append(req.full_url) + raise _http_error(req.full_url, 403) + + monkeypatch.setattr(satellite_tiles.urllib.request, "urlopen", fake_urlopen) + + throttle = _Throttle() + with pytest.raises(RuntimeError, match="HTTP Error 403"): + satellite_tiles._fetch_tile( + tile, + "https://example.com/{z}/{x}/{y}.jpg", + timeout=1.0, + retries=1, + throttle=throttle, + retry_cooldown=15.0, + ) + + assert calls == ["https://example.com/9/248/172.jpg"] + assert throttle.deferred == [] diff --git a/pipeline/transform/enrich_actual_listings.py b/pipeline/transform/enrich_actual_listings.py deleted file mode 100644 index cb6bdae..0000000 --- a/pipeline/transform/enrich_actual_listings.py +++ /dev/null @@ -1,960 +0,0 @@ -import argparse -import re -import tempfile -from pathlib import Path - -import polars as pl -from thefuzz import fuzz -from tqdm import tqdm - -from pipeline.local_temp import local_tmp_dir -from pipeline.transform.join_epc_pp import _scan_epc_certificates -from pipeline.utils.fuzzy_join import normalize_address_key, normalize_postcode_key -from pipeline.utils.postcode_mapping import build_postcode_mapping - -MIN_FLOOR_AREA_M2 = 10.0 -PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0 -PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0 -PROPERTY_MATCH_MIN_MARGIN = 4.0 
-EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0 -EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0 -EPC_MATCH_MIN_MARGIN = 4.0 -ENRICHMENT_VERSION = 1 - -_NUMBER_RE = re.compile(r"\d+") - -LISTING_REQUIRED_COLUMNS = [ - "Bedrooms", - "Bathrooms", - "Number of bedrooms & living rooms", - "lon", - "lat", - "Postcode", - "Address per Property Register", - "Leasehold/Freehold", - "Property type", - "Property sub-type", - "Price qualifier", - "Total floor area (sqm)", - "Listing URL", - "Listing features", - "Listing date", - "Listing status", - "Asking price", - "Asking price per sqm", -] - -PROPERTY_CANDIDATE_COLUMNS = [ - "Address per Property Register", - "Postcode", - "Leasehold/Freehold", - "Last known price", - "Date of last transaction", - "Address per EPC", - "Current energy rating", - "Potential energy rating", - "Total floor area (sqm)", - "Number of bedrooms & living rooms", - "Interior height (m)", - "Construction year", - "Former council house", - "Is construction date approximate", - "Listed building", - "Estimated monthly rent", - "Street tree density percentile", - "Property type", - "Price per sqm", - "Estimated current price", - "Est. 
price per sqm", -] - -PROPERTY_ENRICHMENT_COLUMNS = [ - "Address per EPC", - "Current energy rating", - "Potential energy rating", - "Interior height (m)", - "Construction year", - "Former council house", - "Is construction date approximate", - "Listed building", - "Estimated monthly rent", - "Street tree density percentile", - "Date of last transaction", -] - -EPC_ENRICHMENT_COLUMNS = [ - "Address per EPC", - "Current energy rating", - "Potential energy rating", - "Total floor area (sqm)", - "Number of bedrooms & living rooms", - "Interior height (m)", - "Construction year", - "Former council house", -] - -EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"] -TENURE_VALUES = ["Freehold", "Leasehold"] -PROPERTY_TYPE_VALUES = [ - "Detached", - "Semi-Detached", - "Terraced", - "Flats/Maisonettes", - "Other", -] - -COLUMN_DTYPES = { - "Address per EPC": pl.Utf8, - "Current energy rating": pl.Utf8, - "Potential energy rating": pl.Utf8, - "Total floor area (sqm)": pl.Float64, - "Number of bedrooms & living rooms": pl.Int32, - "Interior height (m)": pl.Float64, - "Construction year": pl.UInt16, - "Former council house": pl.Utf8, - "Is construction date approximate": pl.UInt8, - "Listed building": pl.Utf8, - "Estimated monthly rent": pl.Float32, - "Street tree density percentile": pl.Float32, - "Date of last transaction": pl.Datetime("us"), - "Property type": pl.Utf8, - "Leasehold/Freehold": pl.Utf8, -} - - -def _canonical_postcode_expr(column: str) -> pl.Expr: - compact = ( - pl.col(column) - .cast(pl.Utf8) - .str.to_uppercase() - .str.replace_all(r"[^A-Z0-9]+", "") - .str.strip_chars() - ) - return ( - pl.when(compact.str.contains(r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$")) - .then(compact.str.replace(r"^(.+)([0-9][A-Z]{2})$", "${1} ${2}")) - .otherwise(None) - ) - - -def _clean_string_expr(column: str) -> pl.Expr: - stripped = pl.col(column).cast(pl.Utf8).str.strip_chars() - return pl.when(stripped == "").then(None).otherwise(stripped) - - -def 
_coalesce_non_empty(*columns: str) -> pl.Expr: - return pl.coalesce( - [ - pl.when(pl.col(column).cast(pl.Utf8).str.strip_chars() == "") - .then(None) - .otherwise(pl.col(column).cast(pl.Utf8)) - for column in columns - ] - ) - - -def _valid_number_expr(column: str) -> pl.Expr: - return pl.when(pl.col(column).is_finite()).then(pl.col(column)).otherwise(None) - - -def _read_listings(listings_path: Path, arcgis_path: Path) -> pl.DataFrame: - schema = pl.scan_parquet(listings_path).collect_schema() - missing = sorted(set(LISTING_REQUIRED_COLUMNS) - set(schema.names())) - if missing: - raise ValueError(f"{listings_path} is missing listing columns: {missing}") - - listings = ( - pl.scan_parquet(listings_path) - .with_row_index("_listing_idx") - .with_columns( - _canonical_postcode_expr("Postcode").alias("_original_postcode"), - normalize_address_key(pl.col("Address per Property Register")).alias( - "_listing_match_address" - ), - normalize_postcode_key(pl.col("Postcode")).alias("_listing_match_postcode"), - ) - .collect(engine="streaming") - ) - - postcode_mapping = build_postcode_mapping(arcgis_path) - listings = ( - listings.join( - postcode_mapping, - left_on="_original_postcode", - right_on="old_postcode", - how="left", - ) - .with_columns( - pl.coalesce("new_postcode", "_original_postcode", "Postcode").alias( - "Postcode" - ), - ) - .drop("new_postcode", strict=False) - .with_columns( - normalize_postcode_key(pl.col("Postcode")).alias("_listing_match_postcode"), - ) - ) - return listings - - -def _load_property_candidates( - properties_path: Path, listing_postcodes: list[str] -) -> pl.DataFrame: - schema = pl.scan_parquet(properties_path).collect_schema() - columns = [ - column for column in PROPERTY_CANDIDATE_COLUMNS if column in schema.names() - ] - missing = sorted( - set( - [ - "Address per Property Register", - "Postcode", - "Property type", - "Total floor area (sqm)", - ] - ) - - set(columns) - ) - if missing: - raise ValueError(f"{properties_path} is missing 
property columns: {missing}") - - return ( - pl.scan_parquet(properties_path) - .select(columns) - .with_columns( - normalize_postcode_key(pl.col("Postcode")).alias("_match_postcode") - ) - .filter(pl.col("_match_postcode").is_in(listing_postcodes)) - .with_columns( - normalize_address_key(pl.col("Address per Property Register")).alias( - "_match_register_address" - ), - normalize_address_key(pl.col("Address per EPC")).alias("_match_epc_address") - if "Address per EPC" in columns - else pl.lit(None, dtype=pl.Utf8).alias("_match_epc_address"), - ) - .filter( - pl.col("_match_register_address").is_not_null() - | pl.col("_match_epc_address").is_not_null() - ) - .with_row_index("_property_row") - .collect(engine="streaming") - ) - - -def _property_candidates_by_postcode( - candidates: pl.DataFrame, -) -> dict[str, list[dict]]: - buckets: dict[str, list[dict]] = {} - for row in candidates.iter_rows(named=True): - postcode = row.get("_match_postcode") - if postcode: - buckets.setdefault(postcode, []).append(row) - return buckets - - -def _numbers_compatible(left: str | None, right: str | None) -> bool: - if not left or not right: - return False - left_nums = set(_NUMBER_RE.findall(left)) - right_nums = set(_NUMBER_RE.findall(right)) - smaller, larger = ( - (left_nums, right_nums) - if len(left_nums) <= len(right_nums) - else (right_nums, left_nums) - ) - if not smaller and larger: - return False - return smaller.issubset(larger) - - -def _has_number(address: str | None) -> bool: - return bool(address and _NUMBER_RE.search(address)) - - -def _ratio_bonus( - left: float | int | None, right: float | int | None, pct: float, cap: float -) -> float: - if left is None or right is None: - return 0.0 - try: - left_f = float(left) - right_f = float(right) - except (TypeError, ValueError): - return 0.0 - if left_f <= 0 or right_f <= 0: - return 0.0 - rel = abs(left_f - right_f) / max(left_f, right_f) - if rel > pct: - return 0.0 - return cap * (1.0 - rel / pct) - - -def 
_rooms_bonus(left: int | None, right: int | None) -> float: - if left is None or right is None: - return 0.0 - try: - diff = abs(int(left) - int(right)) - except (TypeError, ValueError): - return 0.0 - if diff == 0: - return 4.0 - if diff == 1: - return 2.0 - return 0.0 - - -def _enum_bonus( - left: str | None, right: str | None, *, exact: float, mismatch: float -) -> float: - if not left or not right: - return 0.0 - return exact if left == right else mismatch - - -def _address_score(query: str, candidate: str | None) -> int: - if not candidate: - return 0 - return max( - fuzz.token_set_ratio(query, candidate), - fuzz.token_sort_ratio(query, candidate), - ) - - -def _best_property_candidate(listing: dict, candidates: list[dict]) -> dict | None: - query = listing.get("_listing_match_address") - if not query: - return None - - listing_has_numbers = _has_number(query) - scored: list[tuple[float, int, dict, str]] = [] - for candidate in candidates: - register_address = candidate.get("_match_register_address") - epc_address = candidate.get("_match_epc_address") - if listing_has_numbers and not ( - _numbers_compatible(query, register_address) - or _numbers_compatible(query, epc_address) - ): - continue - - register_score = _address_score(query, register_address) - epc_score = _address_score(query, epc_address) - base_score = max(register_score, epc_score) - if base_score == 0: - continue - - score = float(base_score) - score += _enum_bonus( - listing.get("Property type"), - candidate.get("Property type"), - exact=7.0, - mismatch=-8.0, - ) - score += _enum_bonus( - listing.get("Leasehold/Freehold"), - candidate.get("Leasehold/Freehold"), - exact=3.0, - mismatch=-3.0, - ) - score += _ratio_bonus( - listing.get("Total floor area (sqm)"), - candidate.get("Total floor area (sqm)"), - pct=0.15, - cap=8.0, - ) - score += _rooms_bonus( - listing.get("Number of bedrooms & living rooms"), - candidate.get("Number of bedrooms & living rooms"), - ) - score += _ratio_bonus( - 
listing.get("Asking price"), - candidate.get("Estimated current price") - or candidate.get("Last known price"), - pct=0.25, - cap=3.0, - ) - matched_address = ( - "Address per Property Register" - if register_score >= epc_score - else "Address per EPC" - ) - scored.append((score, base_score, candidate, matched_address)) - - if not scored: - return None - scored.sort(key=lambda item: item[0], reverse=True) - top = scored[0] - runner_up = scored[1][0] if len(scored) > 1 else None - margin = top[0] - runner_up if runner_up is not None else top[0] - threshold = ( - PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS - if listing_has_numbers - else PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS - ) - if top[0] < threshold or margin < PROPERTY_MATCH_MIN_MARGIN: - return None - - return { - "_listing_idx": listing["_listing_idx"], - "_property_row": top[2]["_property_row"], - "Historical property match score": round(top[0], 1), - "Historical property address score": top[1], - "Historical property match margin": round(margin, 1), - "Historical property match field": top[3], - "Historical property match status": "matched", - } - - -def _match_properties(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame: - schema = { - "_listing_idx": pl.UInt32, - "_property_row": pl.UInt32, - "Historical property match score": pl.Float32, - "Historical property address score": pl.Int32, - "Historical property match margin": pl.Float32, - "Historical property match field": pl.Utf8, - "Historical property match status": pl.Utf8, - } - if candidates.is_empty(): - return pl.DataFrame(schema=schema) - - buckets = _property_candidates_by_postcode(candidates) - matches = [] - for listing in tqdm( - listings.iter_rows(named=True), - total=listings.height, - desc="Matching historical properties", - ): - postcode = listing.get("_listing_match_postcode") - if not postcode: - continue - match = _best_property_candidate(listing, buckets.get(postcode, [])) - if match is not None: - matches.append(match) - - 
if not matches: - return pl.DataFrame(schema=schema) - return pl.DataFrame(matches, schema=schema) - - -def _prefix_columns(df: pl.DataFrame, columns: list[str], prefix: str) -> pl.DataFrame: - rename = {column: f"{prefix}{column}" for column in columns if column in df.columns} - return df.rename(rename) - - -def _ensure_prefixed_columns( - df: pl.DataFrame, columns: list[str], prefix: str -) -> pl.DataFrame: - missing_exprs = [ - pl.lit(None, dtype=COLUMN_DTYPES.get(column, pl.Utf8)).alias( - f"{prefix}{column}" - ) - for column in columns - if f"{prefix}{column}" not in df.columns - ] - if not missing_exprs: - return df - return df.with_columns(missing_exprs) - - -def _property_match_frame( - matches: pl.DataFrame, candidates: pl.DataFrame -) -> pl.DataFrame: - if matches.is_empty(): - return matches - selected_columns = [ - "_property_row", - *[ - column - for column in PROPERTY_CANDIDATE_COLUMNS - if column in candidates.columns - ], - ] - matched = matches.join( - candidates.select(selected_columns), on="_property_row", how="left" - ) - return _prefix_columns( - matched, - [column for column in PROPERTY_CANDIDATE_COLUMNS if column in matched.columns], - "_property_", - ) - - -def _canonical_epc_property_type_expr() -> pl.Expr: - bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in( - ["NO DATA!", "Not Recorded"] - ) - has_epc = pl.col("epc_property_type").is_not_null() - is_house = pl.col("epc_property_type") == "House" - return ( - pl.when(has_epc & is_house & ~bad_built_form) - .then(pl.col("built_form")) - .when(has_epc) - .then(pl.col("epc_property_type")) - .otherwise(None) - .replace( - { - "Flat": "Flats/Maisonettes", - "Maisonette": "Flats/Maisonettes", - "End-Terrace": "Terraced", - "Mid-Terrace": "Terraced", - "Enclosed End-Terrace": "Terraced", - "Enclosed Mid-Terrace": "Terraced", - "Bungalow": "Other", - "Park home": "Other", - "House": "Other", - } - ) - ) - - -def _construction_year_expr(column: str = 
"construction_age_band") -> pl.Expr: - return ( - pl.col(column) - .cast(pl.Utf8) - .str.replace("England and Wales: ", "") - .str.replace(" onwards", "") - .str.extract(r"(\d{4})", 1) - .cast(pl.UInt16, strict=False) - ) - - -def _fractional_year_expr(column: str) -> pl.Expr: - return ( - pl.col(column).dt.year().cast(pl.Float32) - + (pl.col(column).dt.month().cast(pl.Float32) - 1.0) / 12.0 - ) - - -def _load_epc_candidates( - epc_path: Path, listing_postcodes: list[str], temp_dir: Path -) -> pl.DataFrame: - epc_base = _scan_epc_certificates(epc_path, temp_dir).with_columns( - normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"), - normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"), - ) - - epc = ( - epc_base.filter(pl.col("_epc_match_postcode").is_in(listing_postcodes)) - .sort("inspection_date", descending=True) - .group_by("_epc_match_address", "_epc_match_postcode") - .first() - .with_columns( - _canonical_epc_property_type_expr().alias("_epc_canonical_property_type"), - _construction_year_expr().alias("Construction year"), - pl.when(pl.col("current_energy_rating").is_in(EPC_RATING_VALUES)) - .then(pl.col("current_energy_rating")) - .otherwise(None) - .alias("Current energy rating"), - pl.when(pl.col("potential_energy_rating").is_in(EPC_RATING_VALUES)) - .then(pl.col("potential_energy_rating")) - .otherwise(None) - .alias("Potential energy rating"), - pl.col("total_floor_area").alias("Total floor area (sqm)"), - pl.col("number_habitable_rooms").alias("Number of bedrooms & living rooms"), - pl.col("floor_height").alias("Interior height (m)"), - pl.col("epc_address").alias("Address per EPC"), - ) - .drop("tenure", strict=False) - ) - - social_tenure = ( - epc_base.filter(pl.col("_epc_match_postcode").is_in(listing_postcodes)) - .filter(pl.col("tenure").str.to_lowercase().str.contains("social")) - .select("_epc_match_address", "_epc_match_postcode") - .unique() - .with_columns(pl.lit("Yes").alias("Former council 
house")) - ) - - return ( - epc.join( - social_tenure, - on=["_epc_match_address", "_epc_match_postcode"], - how="left", - ) - .with_columns(pl.col("Former council house").fill_null("No")) - .filter(pl.col("_epc_match_address").is_not_null()) - .with_row_index("_epc_row") - .select( - "_epc_row", - "_epc_match_address", - "_epc_match_postcode", - "_epc_canonical_property_type", - *EPC_ENRICHMENT_COLUMNS, - ) - .collect(engine="streaming") - ) - - -def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]: - buckets: dict[str, list[dict]] = {} - for row in candidates.iter_rows(named=True): - postcode = row.get("_epc_match_postcode") - if postcode: - buckets.setdefault(postcode, []).append(row) - return buckets - - -def _best_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None: - query = listing.get("_listing_match_address") - if not query: - return None - - listing_has_numbers = _has_number(query) - scored: list[tuple[float, int, dict]] = [] - for candidate in candidates: - address = candidate.get("_epc_match_address") - if listing_has_numbers and not _numbers_compatible(query, address): - continue - base_score = _address_score(query, address) - if base_score == 0: - continue - score = float(base_score) - score += _enum_bonus( - listing.get("Property type"), - candidate.get("_epc_canonical_property_type"), - exact=6.0, - mismatch=-6.0, - ) - score += _ratio_bonus( - listing.get("Total floor area (sqm)"), - candidate.get("Total floor area (sqm)"), - pct=0.12, - cap=8.0, - ) - score += _rooms_bonus( - listing.get("Number of bedrooms & living rooms"), - candidate.get("Number of bedrooms & living rooms"), - ) - scored.append((score, base_score, candidate)) - - if not scored: - return None - scored.sort(key=lambda item: item[0], reverse=True) - top = scored[0] - runner_up = scored[1][0] if len(scored) > 1 else None - margin = top[0] - runner_up if runner_up is not None else top[0] - threshold = ( - EPC_MATCH_MIN_SCORE_WITH_NUMBERS 
- if listing_has_numbers - else EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS - ) - if top[0] < threshold or margin < EPC_MATCH_MIN_MARGIN: - return None - return { - "_listing_idx": listing["_listing_idx"], - "_epc_row": top[2]["_epc_row"], - "EPC match score": round(top[0], 1), - "EPC address score": top[1], - "EPC match margin": round(margin, 1), - "EPC match status": "matched", - } - - -def _match_epc(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame: - schema = { - "_listing_idx": pl.UInt32, - "_epc_row": pl.UInt32, - "EPC match score": pl.Float32, - "EPC address score": pl.Int32, - "EPC match margin": pl.Float32, - "EPC match status": pl.Utf8, - } - if candidates.is_empty(): - return pl.DataFrame(schema=schema) - - buckets = _epc_candidates_by_postcode(candidates) - matches = [] - for listing in tqdm( - listings.iter_rows(named=True), - total=listings.height, - desc="Matching EPC certificates", - ): - postcode = listing.get("_listing_match_postcode") - if not postcode: - continue - match = _best_epc_candidate(listing, buckets.get(postcode, [])) - if match is not None: - matches.append(match) - - if not matches: - return pl.DataFrame(schema=schema) - return pl.DataFrame(matches, schema=schema) - - -def _epc_match_frame(matches: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame: - if matches.is_empty(): - return matches - matched = matches.join( - candidates.select("_epc_row", *EPC_ENRICHMENT_COLUMNS), - on="_epc_row", - how="left", - ) - return _prefix_columns( - matched, - [column for column in EPC_ENRICHMENT_COLUMNS if column in matched.columns], - "_epc_", - ) - - -def _join_postcode_features( - listings: pl.DataFrame, postcode_features_path: Path -) -> pl.DataFrame: - postcode_features = pl.scan_parquet(postcode_features_path).collect( - engine="streaming" - ) - return listings.join( - postcode_features, on="Postcode", how="left", suffix="_postcode" - ) - - -def _coalesce_feature_columns(df: pl.DataFrame) -> pl.DataFrame: - with_columns: 
list[pl.Expr] = [ - pl.lit(ENRICHMENT_VERSION, dtype=pl.UInt16).alias( - "Actual listing enrichment version" - ), - _coalesce_non_empty( - "_epc_Address per EPC", - "_property_Address per EPC", - ).alias("Address per EPC"), - pl.when(pl.col("Property type").is_in(PROPERTY_TYPE_VALUES)) - .then(pl.col("Property type")) - .otherwise(pl.col("_property_Property type")) - .alias("Property type"), - pl.when(pl.col("Leasehold/Freehold").is_in(TENURE_VALUES)) - .then(pl.col("Leasehold/Freehold")) - .otherwise(pl.col("_property_Leasehold/Freehold")) - .alias("Leasehold/Freehold"), - pl.coalesce( - _valid_number_expr("Total floor area (sqm)"), - _valid_number_expr("_epc_Total floor area (sqm)"), - _valid_number_expr("_property_Total floor area (sqm)"), - ).alias("Total floor area (sqm)"), - pl.when(pl.col("Number of bedrooms & living rooms") > 0) - .then(pl.col("Number of bedrooms & living rooms")) - .otherwise( - pl.coalesce( - pl.col("_epc_Number of bedrooms & living rooms"), - pl.col("_property_Number of bedrooms & living rooms"), - ) - ) - .cast(pl.Int32, strict=False) - .alias("Number of bedrooms & living rooms"), - pl.col("Asking price").alias("Estimated current price"), - pl.col("Asking price").alias("Last known price"), - _coalesce_non_empty( - "_epc_Current energy rating", - "_property_Current energy rating", - ).alias("Current energy rating"), - _coalesce_non_empty( - "_epc_Potential energy rating", - "_property_Potential energy rating", - ).alias("Potential energy rating"), - pl.coalesce( - _valid_number_expr("_epc_Interior height (m)"), - _valid_number_expr("_property_Interior height (m)"), - ).alias("Interior height (m)"), - pl.coalesce( - pl.col("_epc_Construction year"), - pl.col("_property_Construction year"), - ) - .cast(pl.UInt16, strict=False) - .alias("Construction year"), - _coalesce_non_empty( - "_epc_Former council house", - "_property_Former council house", - ) - .fill_null("No") - .alias("Former council house"), - pl.col("_property_Is construction 
date approximate").alias( - "Is construction date approximate" - ), - pl.col("_property_Listed building").fill_null("No").alias("Listed building"), - pl.col("_property_Estimated monthly rent").alias("Estimated monthly rent"), - pl.col("_property_Street tree density percentile").alias( - "Street tree density percentile" - ), - _fractional_year_expr("_property_Date of last transaction").alias( - "Date of last transaction" - ), - ] - - df = df.with_columns(with_columns) - df = df.with_columns( - pl.when( - pl.col("Asking price").is_not_null() - & pl.col("Total floor area (sqm)").is_not_null() - & (pl.col("Total floor area (sqm)") > 0) - ) - .then((pl.col("Asking price") / pl.col("Total floor area (sqm)")).round(0)) - .otherwise(None) - .cast(pl.Int32, strict=False) - .alias("Asking price per sqm"), - ).with_columns( - pl.col("Asking price per sqm").alias("Est. price per sqm"), - pl.col("Asking price per sqm").alias("Price per sqm"), - ) - - return df - - -def _drop_internal_columns(df: pl.DataFrame) -> pl.DataFrame: - internal_prefixes = ("_property_", "_epc_") - internal_exact = { - "_listing_idx", - "_listing_match_address", - "_listing_match_postcode", - "_original_postcode", - "_property_row", - "_epc_row", - "lat_postcode", - "lon_postcode", - } - drop_cols = [ - column - for column in df.columns - if column in internal_exact or column.startswith(internal_prefixes) - ] - return df.drop(drop_cols, strict=False) - - -def build_enriched_actual_listings( - listings_path: Path, - properties_path: Path, - postcode_features_path: Path, - arcgis_path: Path, - output_path: Path, - *, - epc_path: Path | None = None, -) -> pl.DataFrame: - print(f"Loading listings from {listings_path}...") - listings = _read_listings(listings_path, arcgis_path) - listing_postcodes = ( - listings.select("_listing_match_postcode") - .drop_nulls() - .unique() - .to_series() - .to_list() - ) - print(f"Listings: {listings.height}; unique postcodes: {len(listing_postcodes)}") - - print(f"Loading 
property candidates from {properties_path}...") - property_candidates = _load_property_candidates(properties_path, listing_postcodes) - print(f"Property candidates: {property_candidates.height}") - property_matches = _match_properties(listings, property_candidates) - print(f"Historical property matches: {property_matches.height}") - property_match_frame = _property_match_frame(property_matches, property_candidates) - - enriched = _join_postcode_features(listings, postcode_features_path) - if not property_match_frame.is_empty(): - enriched = enriched.join(property_match_frame, on="_listing_idx", how="left") - else: - enriched = enriched.with_columns( - pl.lit(None, dtype=pl.Utf8).alias("Historical property match status") - ) - - if epc_path is not None: - with tempfile.TemporaryDirectory( - prefix="actual_listing_epc_", dir=local_tmp_dir() - ) as tmpdir: - print(f"Loading EPC candidates from {epc_path}...") - epc_candidates = _load_epc_candidates( - epc_path, listing_postcodes, Path(tmpdir) - ) - print(f"EPC candidates: {epc_candidates.height}") - epc_matches = _match_epc(listings, epc_candidates) - print(f"EPC matches: {epc_matches.height}") - epc_match_frame = _epc_match_frame(epc_matches, epc_candidates) - if not epc_match_frame.is_empty(): - enriched = enriched.join(epc_match_frame, on="_listing_idx", how="left") - else: - enriched = enriched.with_columns( - pl.lit(None, dtype=pl.Utf8).alias("EPC match status") - ) - else: - enriched = enriched.with_columns( - pl.lit(None, dtype=pl.Utf8).alias("EPC match status") - ) - - enriched = _ensure_prefixed_columns( - enriched, PROPERTY_CANDIDATE_COLUMNS, "_property_" - ) - enriched = _ensure_prefixed_columns(enriched, EPC_ENRICHMENT_COLUMNS, "_epc_") - enriched = _coalesce_feature_columns(enriched) - enriched = _drop_internal_columns(enriched) - - output_path.parent.mkdir(parents=True, exist_ok=True) - enriched.write_parquet(output_path) - size_mb = output_path.stat().st_size / (1024 * 1024) - print( - f"Wrote 
{enriched.height} enriched listings to {output_path} ({size_mb:.1f} MB)" - ) - return enriched - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Build a pre-enriched actual-listings parquet for the server" - ) - parser.add_argument( - "--listings", - type=Path, - default=Path("finder/data/online_listings_buy.parquet"), - help="Input scraped listings parquet", - ) - parser.add_argument( - "--properties", - type=Path, - default=Path("property-data/properties.parquet"), - help="Historical properties parquet", - ) - parser.add_argument( - "--postcode-features", - type=Path, - default=Path("property-data/postcode.parquet"), - help="Postcode feature parquet", - ) - parser.add_argument( - "--arcgis", - type=Path, - default=Path("property-data/arcgis_data.parquet"), - help="ArcGIS/NSPL postcode parquet used for terminated-postcode remapping", - ) - parser.add_argument( - "--epc", - type=Path, - default=Path("manual-data/domestic-csv.zip"), - help="Optional EPC certificates CSV/zip for direct listing-to-EPC fuzzy matching", - ) - parser.add_argument( - "--no-epc", - action="store_true", - help="Skip direct EPC matching even when --epc exists", - ) - parser.add_argument( - "--output", - type=Path, - default=Path("finder/data/online_listings_buy_enriched.parquet"), - help="Output enriched listings parquet", - ) - args = parser.parse_args() - - epc_path = None if args.no_epc else args.epc - if epc_path is not None and not epc_path.exists(): - print( - f"EPC source not found at {epc_path}; continuing without direct EPC matching" - ) - epc_path = None - - build_enriched_actual_listings( - listings_path=args.listings, - properties_path=args.properties, - postcode_features_path=args.postcode_features, - arcgis_path=args.arcgis, - epc_path=epc_path, - output_path=args.output, - ) - - -if __name__ == "__main__": - main() diff --git a/pipeline/transform/merge.py b/pipeline/transform/merge.py index b2baf85..5dc21b0 100644 --- a/pipeline/transform/merge.py +++ 
b/pipeline/transform/merge.py @@ -1,5 +1,8 @@ import argparse import re +import tempfile +from dataclasses import dataclass +from typing import Literal import numpy as np import polars as pl @@ -13,7 +16,12 @@ from shapely.geometry.base import BaseGeometry from shapely.strtree import STRtree from thefuzz import fuzz -from pipeline.utils.fuzzy_join import normalize_address_key +from pipeline.local_temp import local_tmp_dir +from pipeline.transform.join_epc_pp import _scan_epc_certificates +from pipeline.utils.fuzzy_join import ( + normalize_address_key, + normalize_postcode_key, +) from pipeline.utils.postcode_mapping import build_postcode_mapping MIN_FLOOR_AREA_M2 = 10 @@ -22,7 +30,7 @@ LISTED_BUILDING_FEATURE = "Listed building" LISTED_BUILDING_MATCH_RADIUS_M = 250.0 LISTED_BUILDING_NEAREST_POSTCODES = 3 LISTED_BUILDING_MIN_MATCH_SCORE = 95 -_UNPUBLISHED_CONSERVATION_AREA_PREFIX = "no data available for publication" +PLANNING_DATA_CONSERVATION_AREA_DATASET = "conservation-area" _IOD_PERCENTILE_COLUMNS = [ "Education, Skills and Training Score", @@ -430,37 +438,79 @@ def _normalise_crs(crs: object | None) -> str: return str(crs) if crs else "EPSG:4326" -def _is_unpublished_conservation_area_record(name: object) -> bool: +def _geometry_column(metadata: dict, column_names: list[str]) -> str: + geometry_name = metadata.get("geometry_name") + if geometry_name: + return str(geometry_name) + for name in ("wkb_geometry", "geometry", "geom"): + if name in column_names: + return name + return column_names[-1] + + +def _column_values(table, column: str, default: object = None) -> list[object]: + if column not in table.column_names: + return [default] * table.num_rows + return table[column].combine_chunks().to_pylist() + + +def _is_planning_conservation_area_record(dataset: object) -> bool: return ( - isinstance(name, str) - and name.strip().casefold().startswith(_UNPUBLISHED_CONSERVATION_AREA_PREFIX) + dataset is None + or str(dataset).strip().casefold() == 
PLANNING_DATA_CONSERVATION_AREA_DATASET ) +def _is_current_planning_record(end_date: object) -> bool: + if end_date is None: + return True + if isinstance(end_date, str): + return end_date.strip() == "" + return False + + def _load_conservation_area_geometries( conservation_areas_path: Path, ) -> tuple[list[BaseGeometry], str]: - metadata, table = pyogrio.read_arrow(conservation_areas_path, columns=["NAME"]) - geometry_name = metadata.get("geometry_name") or table.column_names[-1] - names = table["NAME"].combine_chunks().to_pylist() + metadata, table = pyogrio.read_arrow(conservation_areas_path) + geometry_name = _geometry_column(metadata, table.column_names) + datasets = _column_values(table, "dataset") + end_dates = _column_values(table, "end-date") geometries = [] - skipped_unpublished = 0 - for name, geom in zip( - names, from_wkb(table[geometry_name].combine_chunks().to_pylist()), strict=True + skipped_other_dataset = 0 + skipped_ended = 0 + skipped_non_polygon = 0 + skipped_empty = 0 + for dataset, end_date, geom in zip( + datasets, + end_dates, + from_wkb(table[geometry_name].combine_chunks().to_pylist()), + strict=True, ): - if _is_unpublished_conservation_area_record(name): - skipped_unpublished += 1 - elif geom is not None and not geom.is_empty: - geometries.append(geom) + if not _is_planning_conservation_area_record(dataset): + skipped_other_dataset += 1 + continue + if not _is_current_planning_record(end_date): + skipped_ended += 1 + continue + if geom is None or geom.is_empty: + skipped_empty += 1 + continue + if geom.geom_type not in {"Polygon", "MultiPolygon"}: + skipped_non_polygon += 1 + continue + geometries.append(geom) if not geometries: raise ValueError( f"{conservation_areas_path} does not contain any usable polygon geometries" ) - if skipped_unpublished: + if skipped_other_dataset or skipped_ended or skipped_empty or skipped_non_polygon: print( - "Skipped " - f"{skipped_unpublished} Historic England unpublished conservation-area " - 
"placeholder polygons" + "Skipped conservation-area records during load: " + f"other_dataset={skipped_other_dataset}, " + f"ended={skipped_ended}, " + f"empty_geometry={skipped_empty}, " + f"non_polygon={skipped_non_polygon}" ) return geometries, _normalise_crs(metadata.get("crs")) @@ -659,6 +709,1035 @@ def _validate_property_postcodes(df: pl.DataFrame) -> None: ) +# Map listings-parquet source columns to the `_actual_*` overlay columns +# carried alongside the wide frame through the postcode-keyed joins. After the +# rest of the pipeline finalises, listing rows pick their canonical dashboard +# values from these overlays in `_finalize_listings`. +_LISTING_OVERLAY_SOURCES: tuple[tuple[str, str, pl.DataType], ...] = ( + ("Listing URL", "_actual_listing_url", pl.Utf8), + ("Asking price", "_actual_asking_price", pl.Int64), + ("Asking price per sqm", "_actual_asking_price_per_sqm", pl.Int32), + ("Listing date", "_actual_listing_date", pl.Datetime("us")), + ("Listing status", "_actual_listing_status", pl.Utf8), + ("Listing features", "_actual_listing_features", pl.List(pl.Utf8)), + ("Bedrooms", "_actual_bedrooms", pl.Int32), + ("Bathrooms", "_actual_bathrooms", pl.Int32), + ("Price qualifier", "_actual_price_qualifier", pl.Utf8), + ("Property sub-type", "_actual_property_sub_type", pl.Utf8), + ("lat", "_actual_lat", pl.Float64), + ("lon", "_actual_lon", pl.Float64), + # Seeds for the wide row that an unmatched listing produces. 
+ ("Total floor area (sqm)", "_actual_total_floor_area", pl.Float64), + ("Number of bedrooms & living rooms", "_actual_number_habitable_rooms", pl.Int16), + ("Property type", "_actual_property_type", pl.Utf8), + ("Leasehold/Freehold", "_actual_leasehold_freehold", pl.Utf8), +) +_LISTING_FLAG_COLUMN = "_actual_listing_url" +_TENURE_VALUES = ["Freehold", "Leasehold"] +_PROPERTY_TYPE_VALUES = [ + "Detached", + "Semi-Detached", + "Terraced", + "Flats/Maisonettes", + "Other", +] +_EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"] +_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0 +_PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0 +_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS = 82 +_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS = 96 +_PROPERTY_MATCH_MIN_MARGIN = 4.0 +_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0 +_DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0 +_DIRECT_EPC_MATCH_MIN_MARGIN = 4.0 +_DIRECT_EPC_NEARBY_RADIUS_M = 500.0 +_DIRECT_EPC_NEAREST_POSTCODES = 40 +_DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] 
= ( + ("_direct_epc_address", pl.Utf8), + ("_direct_current_energy_rating", pl.Utf8), + ("_direct_potential_energy_rating", pl.Utf8), + ("_direct_total_floor_area", pl.Float64), + ("_direct_number_habitable_rooms", pl.Int16), + ("_direct_floor_height", pl.Float64), + ("_direct_construction_age_band", pl.UInt16), + ("_direct_is_construction_date_approximate", pl.UInt8), + ("_direct_was_council_house", pl.Utf8), + ("_direct_epc_match_status", pl.Utf8), + ("_direct_epc_match_score", pl.Float32), + ("_direct_epc_match_margin", pl.Float32), +) +_DIRECT_EPC_RAW_COLUMN_MAP = { + "epc_address": "_direct_epc_address", + "current_energy_rating": "_direct_current_energy_rating", + "potential_energy_rating": "_direct_potential_energy_rating", + "total_floor_area": "_direct_total_floor_area", + "number_habitable_rooms": "_direct_number_habitable_rooms", + "floor_height": "_direct_floor_height", + "construction_age_band": "_direct_construction_age_band", + "is_construction_date_approximate": "_direct_is_construction_date_approximate", + "was_council_house": "_direct_was_council_house", +} + + +def _canonical_postcode_expr(column: str) -> pl.Expr: + """Re-format a postcode into NSPL `pcds` style (e.g. 
`AB1 2CD`) or null.""" + compact = ( + pl.col(column) + .cast(pl.Utf8) + .str.to_uppercase() + .str.replace_all(r"[^A-Z0-9]+", "") + .str.strip_chars() + ) + return ( + pl.when(compact.str.contains(r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$")) + .then(compact.str.replace(r"^(.+)([0-9][A-Z]{2})$", "${1} ${2}")) + .otherwise(None) + ) + + +def _postcode_outcode_expr(column: str) -> pl.Expr: + return normalize_postcode_key(pl.col(column)).str.extract( + r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$", 1 + ) + + +def _canonical_epc_property_type_expr() -> pl.Expr: + bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in( + ["NO DATA!", "Not Recorded"] + ) + has_epc = pl.col("epc_property_type").is_not_null() + is_house = pl.col("epc_property_type") == "House" + return ( + pl.when(has_epc & is_house & ~bad_built_form) + .then(pl.col("built_form")) + .when(has_epc) + .then(pl.col("epc_property_type")) + .otherwise(None) + .replace( + { + "Flat": "Flats/Maisonettes", + "Maisonette": "Flats/Maisonettes", + "End-Terrace": "Terraced", + "Mid-Terrace": "Terraced", + "Enclosed End-Terrace": "Terraced", + "Enclosed Mid-Terrace": "Terraced", + "Bungalow": "Other", + "Park home": "Other", + "House": "Other", + } + ) + ) + + +def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr: + return ( + pl.col(column) + .cast(pl.Utf8) + .str.replace("England and Wales: ", "") + .str.replace(" onwards", "") + .str.extract(r"(\d{4})", 1) + .cast(pl.UInt16, strict=False) + ) + + +def _ratio_bonus( + left: float | int | None, right: float | int | None, pct: float, cap: float +) -> float: + if left is None or right is None: + return 0.0 + try: + left_f = float(left) + right_f = float(right) + except (TypeError, ValueError): + return 0.0 + if left_f <= 0 or right_f <= 0: + return 0.0 + rel = abs(left_f - right_f) / max(left_f, right_f) + if rel > pct: + return 0.0 + return cap * (1.0 - rel / pct) + + +def _rooms_bonus(left: int | None, right: int | None) -> float: + if 
left is None or right is None: + return 0.0 + try: + diff = abs(int(left) - int(right)) + except (TypeError, ValueError): + return 0.0 + if diff == 0: + return 4.0 + if diff == 1: + return 2.0 + return 0.0 + + +def _enum_bonus( + left: str | None, right: str | None, *, exact: float, mismatch: float +) -> float: + if not left or not right: + return 0.0 + return exact if left == right else mismatch + + +def _address_score(query: str, candidate: str | None) -> int: + if not candidate: + return 0 + return max( + fuzz.token_set_ratio(query, candidate), + fuzz.token_sort_ratio(query, candidate), + ) + + +def _has_number(address: str | None) -> bool: + return bool(address and _NUMBER_RE.search(address)) + + +def _load_listings_for_merge( + listings_path: Path, arcgis_path: Path +) -> pl.DataFrame: + """Read the listings parquet and prepare it for the wide-frame merge. + + Output is keyed by `_listing_idx` and carries: + * `postcode` — canonical (NSPL `pcds`) form, with terminated postcodes + remapped to their nearest active successor; + * `pp_address` — the listing's raw register address (used as the + address half of the fuzzy match); + * one `_actual_*` overlay column per `_LISTING_OVERLAY_SOURCES` entry. + """ + raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx") + postcode_mapping = build_postcode_mapping(arcgis_path).lazy() + + # Listings parquets occasionally carry Float NaNs (e.g. floor area). Polars + # treats NaN as distinct from null and the downstream `latest_price / + # total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats + # to null at load time. 
def _ensure_direct_epc_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Guarantee that every `_DIRECT_EPC_COLUMNS` column exists on ``df``.

    Each column listed in `_DIRECT_EPC_COLUMNS` that is absent from ``df`` is
    added as an all-null column of the declared dtype. When nothing is
    missing the input frame is returned as-is.
    """
    existing = df.columns
    fillers: list[pl.Expr] = []
    for name, dtype in _DIRECT_EPC_COLUMNS:
        if name in existing:
            continue
        fillers.append(pl.lit(None, dtype=dtype).alias(name))
    return df.with_columns(fillers) if fillers else df
pl.DataFrame(schema=schema) + + epc_base = ( + _scan_epc_certificates(epc_path, temp_dir) + .with_columns( + normalize_address_key(pl.col("epc_address")).alias( + "_direct_epc_match_address" + ), + normalize_postcode_key(pl.col("epc_postcode")).alias( + "_direct_epc_match_postcode" + ), + ) + .with_columns( + pl.col("_direct_epc_match_postcode") + .str.extract(r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$", 1) + .alias("_direct_epc_outcode") + ) + .filter(pl.col("_direct_epc_outcode").is_in(listing_outcodes)) + .filter(pl.col("_direct_epc_match_address").is_not_null()) + .filter(pl.col("_direct_epc_match_postcode").is_not_null()) + ) + + social_tenure = ( + epc_base.filter(pl.col("tenure").str.to_lowercase().str.contains("social")) + .select("_direct_epc_match_address", "_direct_epc_match_postcode") + .unique() + .with_columns(pl.lit("Yes").alias("_direct_was_council_house")) + ) + + arcgis = pl.scan_parquet(arcgis_path).select( + normalize_postcode_key(pl.col("pcds")).alias("_direct_epc_match_postcode"), + pl.col("east1m").alias("_direct_epc_east"), + pl.col("north1m").alias("_direct_epc_north"), + ) + + return ( + epc_base.sort("inspection_date", descending=True) + .group_by("_direct_epc_match_address", "_direct_epc_match_postcode") + .first() + .join( + social_tenure, + on=["_direct_epc_match_address", "_direct_epc_match_postcode"], + how="left", + ) + .join(arcgis, on="_direct_epc_match_postcode", how="left") + .with_columns( + _canonical_epc_property_type_expr().alias( + "_direct_epc_canonical_property_type" + ), + _construction_year_expr().alias("_direct_construction_age_band"), + pl.when(pl.col("current_energy_rating").is_in(_EPC_RATING_VALUES)) + .then(pl.col("current_energy_rating")) + .otherwise(None) + .alias("_direct_current_energy_rating"), + pl.when(pl.col("potential_energy_rating").is_in(_EPC_RATING_VALUES)) + .then(pl.col("potential_energy_rating")) + .otherwise(None) + .alias("_direct_potential_energy_rating"), + 
pl.col("epc_address").alias("_direct_epc_address"), + pl.col("total_floor_area").alias("_direct_total_floor_area"), + pl.col("number_habitable_rooms").alias( + "_direct_number_habitable_rooms" + ), + pl.col("floor_height").alias("_direct_floor_height"), + pl.col("_direct_was_council_house").fill_null("No"), + ) + .with_columns( + pl.when(pl.col("_direct_construction_age_band").is_not_null()) + .then(pl.lit(1, dtype=pl.UInt8)) + .otherwise(pl.lit(None, dtype=pl.UInt8)) + .alias("_direct_is_construction_date_approximate") + ) + .with_row_index("_direct_epc_row") + .select( + "_direct_epc_row", + "_direct_epc_match_address", + "_direct_epc_match_postcode", + "_direct_epc_outcode", + "_direct_epc_canonical_property_type", + "_direct_epc_east", + "_direct_epc_north", + "_direct_epc_address", + "_direct_current_energy_rating", + "_direct_potential_energy_rating", + "_direct_total_floor_area", + "_direct_number_habitable_rooms", + "_direct_floor_height", + "_direct_construction_age_band", + "_direct_is_construction_date_approximate", + "_direct_was_council_house", + ) + .collect(engine="streaming") + ) + + +def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame: + match = listings.with_columns( + normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"), + normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"), + ).with_columns( + pl.col("_listing_match_postcode") + .str.extract(r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$", 1) + .alias("_listing_outcode") + ) + + if match.is_empty(): + return match.with_columns( + pl.Series("_listing_east", [], dtype=pl.Float64), + pl.Series("_listing_north", [], dtype=pl.Float64), + ) + + transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True) + east, north = transformer.transform( + match["_actual_lon"].to_numpy(), match["_actual_lat"].to_numpy() + ) + return match.with_columns( + pl.Series("_listing_east", east, dtype=pl.Float64), + pl.Series("_listing_north", north, 
def _optional_lazy_col(
    schema: pl.Schema, column: str, dtype: pl.DataType
) -> pl.Expr:
    """Build an expression for ``column`` that tolerates its absence.

    If ``column`` is in ``schema`` the expression selects it and casts it to
    ``dtype`` non-strictly; otherwise it is a null literal of ``dtype``.
    Either way the result is aliased to ``column``.
    """
    if column not in schema:
        return pl.lit(None, dtype=dtype).alias(column)
    return pl.col(column).cast(dtype, strict=False).alias(column)
+ buckets: dict[str, list[dict]] = {} + for row in candidates.iter_rows(named=True): + postcode = row.get("_property_match_postcode") + if postcode: + buckets.setdefault(postcode, []).append(row) + return buckets + + +def _best_listing_property_candidate( + listing: dict, candidates: list[dict] +) -> dict | None: + query = listing.get("_listing_match_address") + if not query: + return None + + listing_has_numbers = _has_number(query) + scored: list[tuple[float, int, dict, str]] = [] + for candidate in candidates: + register_address = candidate.get("_property_match_address") + epc_address = candidate.get("_property_epc_match_address") + register_numbers_compatible = bool( + register_address and _numbers_compatible(query, register_address) + ) + epc_numbers_compatible = bool( + epc_address and _numbers_compatible(query, epc_address) + ) + if not (register_numbers_compatible or epc_numbers_compatible): + continue + + register_score = _address_score(query, register_address) + epc_score = _address_score(query, epc_address) + base_score = max(register_score, epc_score) + if base_score == 0: + continue + + score = float(base_score) + score += _enum_bonus( + listing.get("_actual_property_type"), + candidate.get("pp_property_type"), + exact=7.0, + mismatch=-8.0, + ) + score += _enum_bonus( + listing.get("_actual_leasehold_freehold"), + candidate.get("duration"), + exact=3.0, + mismatch=-3.0, + ) + score += _ratio_bonus( + listing.get("_actual_total_floor_area"), + candidate.get("total_floor_area"), + pct=0.15, + cap=8.0, + ) + score += _rooms_bonus( + listing.get("_actual_number_habitable_rooms"), + candidate.get("number_habitable_rooms"), + ) + score += _ratio_bonus( + listing.get("_actual_asking_price"), + candidate.get("latest_price"), + pct=0.25, + cap=3.0, + ) + matched_field = ( + "pp_address" if register_score >= epc_score else "epc_address" + ) + scored.append((score, base_score, candidate, matched_field)) + + if not scored: + return None + scored.sort(key=lambda 
item: item[0], reverse=True) + top = scored[0] + runner_up = scored[1][0] if len(scored) > 1 else None + margin = top[0] - runner_up if runner_up is not None else top[0] + score_threshold = ( + _PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS + if listing_has_numbers + else _PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS + ) + address_threshold = ( + _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS + if listing_has_numbers + else _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS + ) + if ( + top[0] < score_threshold + or top[1] < address_threshold + or margin < _PROPERTY_MATCH_MIN_MARGIN + ): + return None + + candidate = top[2] + return { + "_listing_idx": listing["_listing_idx"], + "_matched_postcode": candidate.get("postcode"), + "_matched_pp_address": candidate.get("pp_address"), + "_property_match_score": round(top[0], 1), + "_property_match_address_score": top[1], + "_property_match_margin": round(margin, 1), + "_property_match_field": top[3], + } + + +def _match_listing_properties( + listing_matches: pl.DataFrame, property_candidates: pl.DataFrame +) -> pl.DataFrame: + if listing_matches.is_empty() or property_candidates.is_empty(): + return _empty_listing_property_matches() + + buckets = _property_candidates_by_postcode(property_candidates) + best_matches = [] + for listing in listing_matches.iter_rows(named=True): + postcode = listing.get("_listing_match_postcode") + if not postcode: + continue + match = _best_listing_property_candidate(listing, buckets.get(postcode, [])) + if match is not None: + best_matches.append(match) + + if not best_matches: + return _empty_listing_property_matches() + + matches = pl.DataFrame(best_matches, schema=_listing_property_match_schema()) + return ( + matches.sort( + ["_property_match_score", "_listing_idx"], descending=[True, False] + ) + .unique( + ["_matched_postcode", "_matched_pp_address"], + keep="first", + maintain_order=True, + ) + .sort("_listing_idx") + ) + + +def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, 
list[dict]]: + buckets: dict[str, list[dict]] = {} + for row in candidates.iter_rows(named=True): + postcode = row.get("_direct_epc_match_postcode") + if postcode: + buckets.setdefault(postcode, []).append(row) + return buckets + + +def _epc_postcode_tree( + candidates: pl.DataFrame, +) -> tuple[cKDTree | None, list[str]]: + postcode_points = ( + candidates.select( + "_direct_epc_match_postcode", + "_direct_epc_east", + "_direct_epc_north", + ) + .drop_nulls() + .filter( + pl.col("_direct_epc_east").is_finite() + & pl.col("_direct_epc_north").is_finite() + ) + .unique("_direct_epc_match_postcode") + ) + if postcode_points.is_empty(): + return None, [] + coords = np.column_stack( + [ + postcode_points["_direct_epc_east"].to_numpy(), + postcode_points["_direct_epc_north"].to_numpy(), + ] + ) + return cKDTree(coords), postcode_points["_direct_epc_match_postcode"].to_list() + + +def _candidate_postcodes_for_listing( + listing: dict, + postcode_tree: cKDTree | None, + postcode_values: list[str], +) -> list[str]: + postcodes: list[str] = [] + exact = listing.get("_listing_match_postcode") + if exact: + postcodes.append(exact) + + if postcode_tree is None: + return postcodes + + east = listing.get("_listing_east") + north = listing.get("_listing_north") + try: + east_f = float(east) + north_f = float(north) + except (TypeError, ValueError): + return postcodes + if not np.isfinite(east_f) or not np.isfinite(north_f): + return postcodes + + k = min(_DIRECT_EPC_NEAREST_POSTCODES, len(postcode_values)) + distances, indices = postcode_tree.query( + [east_f, north_f], + k=k, + distance_upper_bound=_DIRECT_EPC_NEARBY_RADIUS_M, + ) + distances = np.atleast_1d(distances) + indices = np.atleast_1d(indices) + seen = set(postcodes) + for distance, idx in zip(distances, indices, strict=False): + if not np.isfinite(distance) or idx >= len(postcode_values): + continue + postcode = postcode_values[int(idx)] + if postcode not in seen: + postcodes.append(postcode) + seen.add(postcode) + 
return postcodes + + +def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None: + query = listing.get("_listing_match_address") + if not query: + return None + + listing_has_numbers = _has_number(query) + scored: list[tuple[float, int, dict]] = [] + for candidate in candidates: + address = candidate.get("_direct_epc_match_address") + if listing_has_numbers and not _numbers_compatible(query, address or ""): + continue + base_score = _address_score(query, address) + if base_score == 0: + continue + + score = float(base_score) + score += _enum_bonus( + listing.get("_actual_property_type"), + candidate.get("_direct_epc_canonical_property_type"), + exact=6.0, + mismatch=-6.0, + ) + score += _ratio_bonus( + listing.get("_actual_total_floor_area"), + candidate.get("_direct_total_floor_area"), + pct=0.12, + cap=8.0, + ) + score += _rooms_bonus( + listing.get("_actual_number_habitable_rooms"), + candidate.get("_direct_number_habitable_rooms"), + ) + scored.append((score, base_score, candidate)) + + if not scored: + return None + scored.sort(key=lambda item: item[0], reverse=True) + top = scored[0] + runner_up = scored[1][0] if len(scored) > 1 else None + margin = top[0] - runner_up if runner_up is not None else top[0] + threshold = ( + _DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS + if listing_has_numbers + else _DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS + ) + if top[0] < threshold or margin < _DIRECT_EPC_MATCH_MIN_MARGIN: + return None + + candidate = top[2] + return { + "_listing_idx": listing["_listing_idx"], + "_direct_epc_address": candidate.get("_direct_epc_address"), + "_direct_current_energy_rating": candidate.get("_direct_current_energy_rating"), + "_direct_potential_energy_rating": candidate.get( + "_direct_potential_energy_rating" + ), + "_direct_total_floor_area": candidate.get("_direct_total_floor_area"), + "_direct_number_habitable_rooms": candidate.get( + "_direct_number_habitable_rooms" + ), + "_direct_floor_height": 
candidate.get("_direct_floor_height"), + "_direct_construction_age_band": candidate.get("_direct_construction_age_band"), + "_direct_is_construction_date_approximate": candidate.get( + "_direct_is_construction_date_approximate" + ), + "_direct_was_council_house": candidate.get("_direct_was_council_house"), + "_direct_epc_match_status": "matched", + "_direct_epc_match_score": round(top[0], 1), + "_direct_epc_match_margin": round(margin, 1), + } + + +def _match_direct_epc( + listing_matches: pl.DataFrame, epc_candidates: pl.DataFrame +) -> pl.DataFrame: + if listing_matches.is_empty() or epc_candidates.is_empty(): + return _empty_direct_epc_matches() + + buckets = _epc_candidates_by_postcode(epc_candidates) + postcode_tree, postcode_values = _epc_postcode_tree(epc_candidates) + + matches = [] + for listing in listing_matches.iter_rows(named=True): + candidate_postcodes = _candidate_postcodes_for_listing( + listing, postcode_tree, postcode_values + ) + candidate_rows: list[dict] = [] + seen_rows: set[int] = set() + for postcode in candidate_postcodes: + for candidate in buckets.get(postcode, []): + row = candidate.get("_direct_epc_row") + if row in seen_rows: + continue + candidate_rows.append(candidate) + if row is not None: + seen_rows.add(row) + match = _best_direct_epc_candidate(listing, candidate_rows) + if match is not None: + matches.append(match) + + if not matches: + return _empty_direct_epc_matches() + return pl.DataFrame(matches, schema=_direct_epc_match_schema()) + + +def _enrich_listings_with_direct_epc( + listings: pl.DataFrame, + epc_path: Path | None, + arcgis_path: Path, +) -> pl.DataFrame: + if epc_path is None: + return _ensure_direct_epc_columns(listings) + + listing_matches = _listing_match_frame(listings) + listing_outcodes = ( + listing_matches.select("_listing_outcode") + .drop_nulls() + .unique() + .to_series() + .to_list() + ) + if not listing_outcodes: + return _ensure_direct_epc_columns(listings) + + with tempfile.TemporaryDirectory( + 
prefix="direct_listing_epc_", dir=local_tmp_dir() + ) as tmpdir: + epc_candidates = _load_direct_epc_candidates( + epc_path, arcgis_path, listing_outcodes, Path(tmpdir) + ) + print(f"Direct listing EPC candidates: {epc_candidates.height}") + direct_matches = _match_direct_epc(listing_matches, epc_candidates) + + print(f"Direct listing EPC matches: {direct_matches.height}") + if direct_matches.is_empty(): + return _ensure_direct_epc_columns(listings) + + return _ensure_direct_epc_columns( + listings.join(direct_matches, on="_listing_idx", how="left") + ) + + +def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame: + return wide.with_columns( + [ + pl.coalesce(pl.col(raw_column), pl.col(direct_column)).alias(raw_column) + for raw_column, direct_column in _DIRECT_EPC_RAW_COLUMN_MAP.items() + ] + ) + + +def _build_unmatched_listing_seed_rows( + unmatched_listing_idxs: pl.DataFrame, + listings: pl.DataFrame, + template_schema: pl.Schema, +) -> pl.DataFrame: + """Materialise wide-shape rows for listings that didn't match any property. + + Each seed row carries enough columns (postcode, pp_address, property type, + tenure, floor area, room count, asking price → latest_price) for the + postcode-keyed joins later in `_build` to fill in the rest. All other + wide columns are null on the seed row. 
+ """ + if unmatched_listing_idxs.is_empty(): + return pl.DataFrame(schema=template_schema) + + listings = _ensure_direct_epc_columns(listings) + base = unmatched_listing_idxs.join(listings, on="_listing_idx", how="inner") + + populated: dict[str, pl.Expr] = { + "postcode": pl.col("postcode"), + "pp_address": pl.col("pp_address"), + "pp_property_type": pl.col("_actual_property_type"), + "duration": pl.col("_actual_leasehold_freehold"), + "total_floor_area": pl.coalesce( + pl.col("_actual_total_floor_area"), pl.col("_direct_total_floor_area") + ), + "number_habitable_rooms": pl.coalesce( + pl.col("_actual_number_habitable_rooms"), + pl.col("_direct_number_habitable_rooms"), + ), + "latest_price": pl.col("_actual_asking_price"), + } + for raw_column, direct_column in _DIRECT_EPC_RAW_COLUMN_MAP.items(): + if raw_column in populated: + continue + populated[raw_column] = pl.col(direct_column) + for _src, dst, _dt in _LISTING_OVERLAY_SOURCES: + populated[dst] = pl.col(dst) + + seed_exprs: list[pl.Expr] = [] + for col_name, dtype in template_schema.items(): + if col_name in populated: + seed_exprs.append( + populated[col_name].cast(dtype, strict=False).alias(col_name) + ) + else: + seed_exprs.append(pl.lit(None, dtype=dtype).alias(col_name)) + + return base.select(seed_exprs) + + +def _integrate_listings( + wide: pl.LazyFrame, + listings_path: Path, + arcgis_path: Path, + epc_path: Path | None = None, +) -> pl.LazyFrame: + """Splice actual listings into the wide property frame. + + Listings are fuzzy-matched to wide rows on (postcode, pp_address). Matched + listings stamp `_actual_*` overlay columns onto the existing wide row, so + historical context (EPC, last sale, etc.) is preserved. Unmatched listings + are appended as new wide rows with enough property-shape fields filled in + that the downstream postcode-keyed joins (deprivation, crime, tree + density, …) populate them just like any other row. 
+ """ + listings = _load_listings_for_merge(listings_path, arcgis_path) + print(f"Listings loaded: {listings.height}") + listings = _enrich_listings_with_direct_epc(listings, epc_path, arcgis_path) + + overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES] + listing_attachment_columns = [ + *overlay_columns, + *[column for column, _dtype in _DIRECT_EPC_COLUMNS], + ] + + property_candidates = _property_match_candidate_frame(wide) + joined = _match_listing_properties( + _listing_match_frame(listings), property_candidates + ) + + total = listings.height + matched_count = joined.height + if total > 0: + print( + "Listings matched to existing wide rows: " + f"{matched_count}/{total} " + f"({100 * matched_count / total:.1f}%)" + ) + + overlay_for_matched = ( + joined.join(listings, on="_listing_idx", how="inner") + .select( + pl.col("_matched_postcode").alias("postcode"), + pl.col("_matched_pp_address").alias("pp_address"), + *listing_attachment_columns, + ) + .unique(["postcode", "pp_address"], keep="first") + ) + + wide_attached = wide.join( + overlay_for_matched.lazy(), on=["postcode", "pp_address"], how="left" + ) + wide_attached = _coalesce_direct_epc_columns(wide_attached) + wide_output = wide_attached.drop( + [column for column, _dtype in _DIRECT_EPC_COLUMNS], strict=False + ) + + unmatched_listing_idxs = listings.select("_listing_idx").join( + joined.select("_listing_idx"), on="_listing_idx", how="anti" + ) + seed_rows = _build_unmatched_listing_seed_rows( + unmatched_listing_idxs, + listings, + template_schema=wide_output.collect_schema(), + ) + + return pl.concat([wide_output, seed_rows.lazy()], how="vertical_relaxed") + + +def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame: + """Project the post-rename wide frame down to enriched-listing rows.""" + df = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null()) + + df = df.with_columns( + pl.col("_actual_listing_url").alias("Listing URL"), + pl.col("_actual_listing_date").alias("Listing 
date"), + pl.col("_actual_listing_status").alias("Listing status"), + pl.col("_actual_listing_features").alias("Listing features"), + pl.col("_actual_asking_price").alias("Asking price"), + pl.col("_actual_asking_price_per_sqm").alias("Asking price per sqm"), + pl.col("_actual_bedrooms").alias("Bedrooms"), + pl.col("_actual_bathrooms").alias("Bathrooms"), + pl.col("_actual_price_qualifier").alias("Price qualifier"), + pl.col("_actual_property_sub_type").alias("Property sub-type"), + # Listing coordinates win over the postcode centroid. + pl.coalesce(pl.col("_actual_lat").cast(pl.Float64), pl.col("lat")).alias("lat"), + pl.coalesce(pl.col("_actual_lon").cast(pl.Float64), pl.col("lon")).alias("lon"), + # Listing's floor area / rooms override any EPC/PP value when present. + pl.coalesce( + pl.col("_actual_total_floor_area").cast(pl.Float64), + pl.col("Total floor area (sqm)"), + ).alias("Total floor area (sqm)"), + pl.coalesce( + pl.col("_actual_number_habitable_rooms").cast(pl.Int16), + pl.col("Number of bedrooms & living rooms"), + ).alias("Number of bedrooms & living rooms"), + pl.when(pl.col("_actual_property_type").is_in(_PROPERTY_TYPE_VALUES)) + .then(pl.col("_actual_property_type")) + .otherwise(pl.col("Property type")) + .alias("Property type"), + pl.when(pl.col("_actual_leasehold_freehold").is_in(_TENURE_VALUES)) + .then(pl.col("_actual_leasehold_freehold")) + .otherwise(pl.col("Leasehold/Freehold")) + .alias("Leasehold/Freehold"), + ) + + df = df.with_columns( + pl.coalesce( + pl.col("Asking price per sqm"), + pl.when( + pl.col("Asking price").is_not_null() + & pl.col("Total floor area (sqm)").is_not_null() + & (pl.col("Total floor area (sqm)") > MIN_FLOOR_AREA_M2) + ) + .then( + ( + pl.col("Asking price").cast(pl.Float64) + / pl.col("Total floor area (sqm)") + ) + .round(0) + .cast(pl.Int32, strict=False) + ) + .otherwise(None), + ).alias("Asking price per sqm") + ) + + df = df.with_columns( + pl.col("Asking price").alias("Estimated current price"), + 
pl.col("Asking price per sqm").alias("Est. price per sqm"), + pl.coalesce(pl.col("Last known price"), pl.col("Asking price")).alias( + "Last known price" + ), + pl.when(pl.col("Date of last transaction").is_not_null()) + .then(pl.lit("matched")) + .otherwise(pl.lit("unmatched")) + .alias("Historical property match status"), + ) + + drop_cols = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES] + return df.drop(drop_cols, strict=False) + + +@dataclass +class _BuildResult: + """Outputs of `_build` — exactly one of the two slot pairs is populated.""" + + postcode: pl.DataFrame | None = None + properties: pl.DataFrame | None = None + listings: pl.DataFrame | None = None + + def _build( epc_pp_path: Path, arcgis_path: Path, @@ -676,11 +1755,23 @@ def _build( election_results_path: Path, tree_density_postcodes_path: Path | None = None, listed_buildings_path: Path | None = None, -) -> tuple[pl.DataFrame, pl.DataFrame]: - """Build postcode and properties dataframes from epc_pp + auxiliary data. + actual_listings_path: Path | None = None, + actual_listings_epc_path: Path | None = None, + mode: Literal["normal", "listings"] = "normal", +) -> _BuildResult: + """Build postcode/properties dataframes (or enriched listings) from epc_pp + auxiliary data. - Returns (postcode_df, properties_df). + Modes: + * `normal` — produces (postcode_df, properties_df) as before. Ignores + `actual_listings_path` if supplied. + * `listings` — requires `actual_listings_path`; produces a single + enriched-listings DataFrame and skips the postcode/properties outputs. + Listings flow through the same enrichment joins as historical rows, + so postcode-scoped features (tree density, crime, deprivation, …) end + up populated on every listing with a valid postcode. 
""" + if mode == "listings" and actual_listings_path is None: + raise ValueError("listings mode requires actual_listings_path") _validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path) wide = pl.scan_parquet(epc_pp_path).filter( @@ -731,6 +1822,15 @@ def _build( wide = wide.with_columns( pl.lit(None, dtype=pl.Utf8).alias(LISTED_BUILDING_FEATURE) ) + + if actual_listings_path is not None: + wide = _integrate_listings( + wide, + actual_listings_path, + arcgis_path, + epc_path=actual_listings_epc_path, + ) + wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No")) arcgis = ( @@ -995,6 +2095,13 @@ def _build( print("Collecting with streaming engine...") df = wide.collect(engine="streaming") + + if mode == "listings": + enriched_listings = _finalize_listings(df) + _validate_property_postcodes(enriched_listings) + print(f"Enriched listings rows: {enriched_listings.height}") + return _BuildResult(listings=enriched_listings) + _validate_property_postcodes(df) # Split into postcode-level and property-level dataframes @@ -1013,7 +2120,7 @@ def _build( properties_df = df.select(property_cols) print(f"Property rows: {properties_df.height}") - return postcode_df, properties_df + return _BuildResult(postcode=postcode_df, properties=properties_df) def main(): @@ -1068,7 +2175,7 @@ def main(): "--conservation-areas", type=Path, required=True, - help="Historic England conservation areas GeoPackage", + help="Planning Data conservation areas GeoJSON", ) parser.add_argument( "--listed-buildings", @@ -1109,18 +2216,58 @@ def main(): parser.add_argument( "--output-postcodes", type=Path, - required=True, - help="Output postcode parquet file path", + required=False, + help="Output postcode parquet (normal mode only)", ) parser.add_argument( "--output-properties", type=Path, - required=True, - help="Output properties parquet file path", + required=False, + help="Output properties parquet (normal mode only)", + ) + parser.add_argument( + 
"--actual-listings", + type=Path, + required=False, + help=( + "Optional scraped-listings parquet. When provided, listings flow " + "through the same merge pipeline as historical properties — set " + "--output-listings to write the enriched-listings file instead " + "of the postcode/properties files." + ), + ) + parser.add_argument( + "--epc", + type=Path, + required=False, + help=( + "Raw EPC certificates CSV or zip. Used only with --actual-listings " + "to match live listings directly to EPC records." + ), + ) + parser.add_argument( + "--output-listings", + type=Path, + required=False, + help=( + "Output enriched-listings parquet path. Required (and only valid) " + "when --actual-listings is set; --output-postcodes and " + "--output-properties are ignored in this mode." + ), ) args = parser.parse_args() - postcode_df, properties_df = _build( + listings_mode = args.actual_listings is not None + if listings_mode and args.output_listings is None: + parser.error("--output-listings is required when --actual-listings is set") + if not listings_mode and ( + args.output_postcodes is None or args.output_properties is None + ): + parser.error( + "--output-postcodes and --output-properties are required in normal mode" + ) + + result = _build( epc_pp_path=args.epc_pp, arcgis_path=args.arcgis, iod_path=args.iod, @@ -1137,8 +2284,28 @@ def main(): election_results_path=args.election_results, tree_density_postcodes_path=args.tree_density_postcodes, listed_buildings_path=args.listed_buildings, + actual_listings_path=args.actual_listings, + actual_listings_epc_path=args.epc if listings_mode else None, + mode="listings" if listings_mode else "normal", ) + if listings_mode: + listings_df = result.listings + assert listings_df is not None # guaranteed by mode contract + args.output_listings.parent.mkdir(parents=True, exist_ok=True) + listings_df.write_parquet(args.output_listings) + size_mb = args.output_listings.stat().st_size / (1024 * 1024) + print( + f"\nEnriched listings: 
{listings_df.height} rows, " + f"{len(listings_df.columns)} columns" + ) + print(f"Wrote {args.output_listings} ({size_mb:.1f} MB)") + return + + postcode_df = result.postcode + properties_df = result.properties + assert postcode_df is not None and properties_df is not None + print(f"\nPostcode columns: {postcode_df.columns}") print(f"Postcode rows: {postcode_df.height}") postcode_df.write_parquet(args.output_postcodes) diff --git a/pipeline/transform/postcode_boundaries/README.md b/pipeline/transform/postcode_boundaries/README.md index d10a9cd..15d96e3 100644 --- a/pipeline/transform/postcode_boundaries/README.md +++ b/pipeline/transform/postcode_boundaries/README.md @@ -53,7 +53,7 @@ Build an STRtree spatial index over the INSPIRE candidate polygons. Convert all For each INSPIRE parcel that contains at least one UPRN, run a majority vote: whichever postcode has the most UPRNs inside that parcel wins the parcel. Accumulate winning parcels per postcode, union them, and clip to the OA boundary. The result is `claimed[postcode] = polygon_within_oa`. -Then resolve overlaps: INSPIRE parcels can overlap geographically (digitization overlaps), so two postcodes might claim the same square meters. Walk through the claimed dict in insertion order (the postcode with the most parcel wins gets priority by virtue of appearing first), subtracting the running union from each subsequent postcode's geometry. +For INSPIRE parcels with no contained UPRN, assign the clipped parcel to the nearest UPRN's postcode using the parcel's representative point. These nearest-postcode claims run after contained-UPRN claims, so explicit address-in-parcel evidence keeps priority. Then resolve overlaps: INSPIRE parcels can overlap geographically (digitization overlaps), so two postcodes might claim the same square meters. Walk through claims in priority order, subtracting the running union from each subsequent postcode's geometry. 
#### Stage B: Voronoi distribution of remaining area @@ -67,7 +67,7 @@ The Voronoi computation (`voronoi.py`): 5. For each real point's Voronoi cell, constructs the polygon from the Voronoi vertices, clips to the boundary, groups by postcode 6. Unions per-postcode fragments -The effect: every unclaimed patch of OA gets assigned to the nearest postcode by straight-line distance (Voronoi tessellation is exactly the set of all points nearest to each generator). +The effect: every non-parcel patch of OA gets assigned to the nearest postcode by straight-line distance (Voronoi tessellation is exactly the set of all points nearest to each generator). #### Stage C: Combine @@ -77,7 +77,7 @@ The output of `process_oa` is `list[(postcode, polygon)]` — the per-OA fragmen ### Phase 4: Merging and writing -**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 1m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps only the largest polygon — postcodes are contiguous delivery routes, so detached fragments are artifacts. +**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 5m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps only the largest polygon — postcodes are contiguous delivery routes, so detached fragments are artifacts. **GeoJSON output** (`output.py:write_district_geojson`): Groups postcodes by district (the outward code, e.g. `SW1A` from `SW1A 1AA`). 
For each district, converts every postcode polygon from BNG to WGS84 using pyproj, simplifies with 1m tolerance (Douglas-Peucker), rounds coordinates to 6 decimal places (~0.1m precision), and writes a single `{district}.geojson` FeatureCollection. Each Feature has `postcodes` (formatted like `"SW1A 1AA"`) and `mapit_code` (no space: `"SW1A1AA"`) in its properties. diff --git a/pipeline/transform/postcode_boundaries/__init__.py b/pipeline/transform/postcode_boundaries/__init__.py index 439af3d..c4aaf5a 100644 --- a/pipeline/transform/postcode_boundaries/__init__.py +++ b/pipeline/transform/postcode_boundaries/__init__.py @@ -8,9 +8,10 @@ Algorithm per OA: 1. Single-postcode OA → entire OA polygon assigned to that postcode 2. Multi-postcode OA: a. Assign INSPIRE parcels to postcodes via UPRN point-in-polygon majority vote - b. Union INSPIRE parcels per postcode, clip to OA → "claimed" area - c. Distribute remaining (unclaimed) OA area via Voronoi of UPRN points - d. Final polygon = claimed + Voronoi share + b. Assign INSPIRE parcels with no contained UPRN to the nearest UPRN postcode + c. Union parcel claims per postcode, clip to OA → "claimed" area + d. Distribute remaining non-parcel OA area via Voronoi of UPRN points + e. 
Final polygon = parcel claims + Voronoi share Memory-efficient design (<12GB total): - INSPIRE polygons stored as raw coordinate bytes in parquet; Shapely objects built diff --git a/pipeline/transform/postcode_boundaries/process_oa.py b/pipeline/transform/postcode_boundaries/process_oa.py index 7efd445..14aae67 100644 --- a/pipeline/transform/postcode_boundaries/process_oa.py +++ b/pipeline/transform/postcode_boundaries/process_oa.py @@ -1,12 +1,15 @@ from collections import Counter, defaultdict import numpy as np +from scipy.spatial import cKDTree from shapely import STRtree, make_valid from shapely.geometry import MultiPolygon, Polygon from shapely.ops import unary_union from .voronoi import compute_voronoi_regions +MIN_GEOM_AREA = 0.01 + def process_oa( oa_geom: Polygon | MultiPolygon, @@ -19,76 +22,31 @@ def process_oa( if len(unique_pcs) == 1: return [(next(iter(unique_pcs)), oa_geom)] - # Try INSPIRE-based assignment - claimed: dict[str, Polygon | MultiPolygon] = {} + if len(points) == 0: + return [] + + valid_oa = _clean_polygonal(oa_geom) + if valid_oa is None: + return [] if inspire_candidates: - cand_tree = STRtree(inspire_candidates) - - from shapely import points as shp_points - - uprn_pts = shp_points(points) - pt_idx, cand_idx = cand_tree.query(uprn_pts, predicate="intersects") - - # Majority vote per candidate polygon - cand_postcodes: dict[int, list[str]] = defaultdict(list) - for pi, ci in zip(pt_idx, cand_idx): - cand_postcodes[ci].append(postcodes[pi]) - - pc_inspire_polys: dict[str, list[Polygon]] = defaultdict(list) - for ci, pc_list in cand_postcodes.items(): - winner = Counter(pc_list).most_common(1)[0][0] - pc_inspire_polys[winner].append(inspire_candidates[ci]) - - for pc, polys in pc_inspire_polys.items(): - merged = unary_union(polys) - if not merged.is_valid: - merged = make_valid(merged) - valid_oa = oa_geom if oa_geom.is_valid else make_valid(oa_geom) - clipped = merged.intersection(valid_oa) - if not clipped.is_empty: - if not 
clipped.is_valid: - clipped = make_valid(clipped) - clipped = _extract_polygonal(clipped) - if clipped is not None: - claimed[pc] = clipped - - # Resolve overlaps: INSPIRE parcels can overlap geographically, so two - # postcodes may claim the same area. Give contested area to whichever - # postcode claimed it first (most UPRNs → first in insertion order). - if len(claimed) > 1: - resolved: dict[str, Polygon | MultiPolygon] = {} - used = None - for pc, geom in claimed.items(): - if used is not None: - if not geom.is_valid: - geom = make_valid(geom) - if not used.is_valid: - used = make_valid(used) - geom = geom.difference(used) - if geom.is_empty: - continue - geom = _extract_polygonal(geom) - if geom is None: - continue - resolved[pc] = geom - used = geom if used is None else unary_union([used, geom]) - claimed = resolved + claimed = _claim_inspire_parcels(valid_oa, points, postcodes, inspire_candidates) + else: + claimed = {} # Compute remaining area if claimed: all_claimed = unary_union(list(claimed.values())) - if not all_claimed.is_valid: - all_claimed = make_valid(all_claimed) - valid_oa = oa_geom if oa_geom.is_valid else make_valid(oa_geom) - remaining = valid_oa.difference(all_claimed) - if not remaining.is_valid: - remaining = make_valid(remaining) + all_claimed = _clean_polygonal(all_claimed) + remaining = ( + valid_oa.difference(all_claimed) if all_claimed is not None else valid_oa + ) + remaining = _clean_polygonal(remaining) else: - remaining = oa_geom if oa_geom.is_valid else make_valid(oa_geom) + remaining = valid_oa - # Distribute remaining area via Voronoi - if not remaining.is_empty and remaining.area > 0.01: + # Distribute non-parcel land via Voronoi + if remaining is not None and not remaining.is_empty and remaining.area > MIN_GEOM_AREA: voronoi_result = compute_voronoi_regions(points, postcodes, remaining) else: voronoi_result = {} @@ -102,17 +60,167 @@ def process_oa( fragments = [] for pc, parts in result.items(): - merged = unary_union(parts) 
- if not merged.is_empty: - if not merged.is_valid: - merged = make_valid(merged) - merged = _extract_polygonal(merged) - if merged is not None: - fragments.append((pc, merged)) + merged = _clean_polygonal(unary_union(parts)) + if merged is not None: + fragments.append((pc, merged)) return fragments +def _claim_inspire_parcels( + valid_oa: Polygon | MultiPolygon, + points: np.ndarray, + postcodes: list[str], + inspire_candidates: list[Polygon], +) -> dict[str, Polygon | MultiPolygon]: + """Assign INSPIRE parcels to postcodes before Voronoi fills non-parcel land.""" + parcels = _prepare_inspire_parcels(valid_oa, inspire_candidates) + if not parcels: + return {} + + cand_tree = STRtree(parcels) + + from shapely import points as shp_points + + uprn_pts = shp_points(points) + pt_idx, cand_idx = cand_tree.query(uprn_pts, predicate="within") + + # First priority: parcels that physically contain UPRNs. Majority vote + # resolves blocks of flats or overlapping parcel data. + cand_postcodes: dict[int, list[str]] = defaultdict(list) + for pi, ci in zip(pt_idx, cand_idx): + cand_postcodes[ci].append(postcodes[pi]) + + contained_parts: dict[str, list] = defaultdict(list) + contained_scores: Counter[str] = Counter() + for ci, pc_list in cand_postcodes.items(): + pc_counts = Counter(pc_list) + winner, votes = pc_counts.most_common(1)[0] + contained_parts[winner].append(parcels[ci]) + contained_scores[winner] += votes + + contained_claimed = _merge_parts_by_postcode(contained_parts) + contained_claims = sorted( + contained_claimed.items(), + key=lambda item: (-contained_scores[item[0]], -item[1].area, item[0]), + ) + + # Second priority: remaining INSPIRE parcels with no contained UPRN. Assign + # each to the nearest UPRN/postcode so parcel boundaries carry more of the + # visible postcode shape; Voronoi is then limited to roads, parks, water, and + # any other non-parcel gaps. 
+ points_f64 = points.astype(np.float64, copy=False) + contained_union = _union_claims(contained_claims) + nearest_tree = cKDTree(points_f64) + nearest_parts: dict[str, list] = defaultdict(list) + for i, parcel in enumerate(parcels): + if i in cand_postcodes: + continue + + assignable = parcel + if contained_union is not None: + assignable = assignable.difference(contained_union) + for part in _polygon_parts(assignable): + part = _clean_polygonal(part) + if part is None: + continue + pc = _nearest_postcode(part, nearest_tree, postcodes) + nearest_parts[pc].append(part) + + nearest_claimed = _merge_parts_by_postcode(nearest_parts) + nearest_claims = sorted( + nearest_claimed.items(), + key=lambda item: (-item[1].area, item[0]), + ) + + return _resolve_ordered_claims(contained_claims + nearest_claims) + + +def _prepare_inspire_parcels( + valid_oa: Polygon | MultiPolygon, + inspire_candidates: list[Polygon], +) -> list[Polygon | MultiPolygon]: + parcels: list[Polygon | MultiPolygon] = [] + for candidate in inspire_candidates: + geom = _clean_polygonal(candidate) + if geom is None: + continue + if not geom.intersects(valid_oa): + continue + clipped = _clean_polygonal(geom.intersection(valid_oa)) + if clipped is not None: + parcels.append(clipped) + return parcels + + +def _nearest_postcode( + geom: Polygon | MultiPolygon, + tree: cKDTree, + postcodes: list[str], +) -> str: + point = geom.representative_point() + _, idx = tree.query([point.x, point.y]) + return postcodes[idx] + + +def _polygon_parts(geom) -> list[Polygon]: + geom = _clean_polygonal(geom) + if geom is None: + return [] + if geom.geom_type == "Polygon": + return [geom] + return list(geom.geoms) + + +def _merge_parts_by_postcode( + parts_by_postcode: dict[str, list], +) -> dict[str, Polygon | MultiPolygon]: + merged: dict[str, Polygon | MultiPolygon] = {} + for pc, parts in parts_by_postcode.items(): + geom = _clean_polygonal(unary_union(parts)) + if geom is not None: + merged[pc] = geom + return merged + 
+ +def _union_claims( + claims: list[tuple[str, Polygon | MultiPolygon]], +) -> Polygon | MultiPolygon | None: + if not claims: + return None + return _clean_polygonal(unary_union([geom for _, geom in claims])) + + +def _resolve_ordered_claims( + claims: list[tuple[str, Polygon | MultiPolygon]], +) -> dict[str, Polygon | MultiPolygon]: + """Resolve overlapping parcel claims in priority order.""" + resolved_parts: dict[str, list] = defaultdict(list) + used = None + for pc, geom in claims: + geom = _clean_polygonal(geom) + if geom is None: + continue + if used is not None: + geom = _clean_polygonal(geom.difference(used)) + if geom is None: + continue + resolved_parts[pc].append(geom) + used = _clean_polygonal(geom if used is None else unary_union([used, geom])) + return _merge_parts_by_postcode(resolved_parts) + + +def _clean_polygonal(geom) -> Polygon | MultiPolygon | None: + if geom is None or geom.is_empty: + return None + if not geom.is_valid: + geom = make_valid(geom) + geom = _extract_polygonal(geom) + if geom is None or geom.is_empty or geom.area <= MIN_GEOM_AREA: + return None + return geom + + def _extract_polygonal(geom) -> Polygon | MultiPolygon | None: """Extract only Polygon/MultiPolygon parts from a geometry. 
diff --git a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py index af5a9d9..99df1ce 100644 --- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py +++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py @@ -7,6 +7,7 @@ import numpy as np import polars as pl import pytest from shapely.geometry import MultiPolygon, Polygon, box +from shapely.ops import unary_union from .oa_boundaries import parse_gpkg_geometry from .greenspace import subtract_greenspace @@ -215,6 +216,20 @@ class TestVoronoiCollinear: assert ratio > 0.3, f"Area split too unfair: {area_a:.0f} vs {area_b:.0f}" +class TestVoronoiCoverage: + """Voronoi fallback should cover large OAs even when UPRNs are clustered.""" + + def test_clustered_points_cover_large_boundary(self): + boundary = box(0, 0, 5000, 100) + points = np.array([[10, 50], [20, 50]]) + result = compute_voronoi_regions(points, ["A", "B"], boundary) + + covered = unary_union(list(result.values())) + + assert covered.area == pytest.approx(boundary.area) + assert boundary.difference(covered).area < 0.01 + + class TestEqualSplitFallback: """_equal_split_fallback must give every postcode some area.""" @@ -306,6 +321,186 @@ class TestProcessOAGeometryTypes: ) +class TestProcessOAInspireParcelAssignment: + """INSPIRE parcels without UPRNs should still shape postcode boundaries.""" + + def test_unoccupied_inspire_parcel_goes_to_nearest_postcode(self): + """A parcel with no contained UPRN should not be split by Voronoi.""" + oa_geom = box(0, 0, 100, 100) + parcel = box(20, 40, 65, 60) # crosses the x=50 Voronoi split + points = np.array( + [ + [10, 50], # postcode A + [90, 50], # postcode B + ] + ) + postcodes = ["A", "B"] + + fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[parcel]) + frag_dict = dict(fragments) + + assert "A" in frag_dict and "B" in frag_dict + assert parcel.difference(frag_dict["A"]).area 
< 0.01 + assert frag_dict["B"].intersection(parcel).area < 0.01 + + def test_contained_uprn_claim_wins_over_overlapping_nearest_parcel(self): + """Contained-UPRN parcel claims should keep priority over nearest claims.""" + oa_geom = box(0, 0, 100, 100) + contained_a = box(0, 0, 60, 100) + unoccupied_nearer_b = box(50, 0, 80, 100) + points = np.array( + [ + [20, 50], # postcode A, inside contained_a + [90, 50], # postcode B, outside unoccupied_nearer_b + ] + ) + postcodes = ["A", "B"] + + fragments = process_oa( + oa_geom, + points, + postcodes, + inspire_candidates=[contained_a, unoccupied_nearer_b], + ) + frag_dict = dict(fragments) + + assert "A" in frag_dict and "B" in frag_dict + assert contained_a.difference(frag_dict["A"]).area < 0.01 + assert frag_dict["A"].intersection(frag_dict["B"]).area < 0.01 + assert frag_dict["B"].intersection(box(60, 0, 80, 100)).area > 0 + + def test_nearest_uses_assignable_fragment_after_contained_subtraction(self): + """Nearest assignment should use the part left after priority subtraction.""" + oa_geom = box(0, 0, 100, 100) + contained_a = box(0, 0, 60, 100) + unoccupied = box(25, 0, 80, 100) + points = np.array( + [ + [20, 50], # postcode A, inside contained_a + [90, 50], # postcode B, nearest to unoccupied remainder + ] + ) + postcodes = ["A", "B"] + + fragments = process_oa( + oa_geom, + points, + postcodes, + inspire_candidates=[contained_a, unoccupied], + ) + frag_dict = dict(fragments) + + assert contained_a.difference(frag_dict["A"]).area < 0.01 + assert box(60, 0, 80, 100).difference(frag_dict["B"]).area < 0.01 + + def test_boundary_uprn_does_not_claim_adjacent_parcel(self): + """A UPRN on a parcel edge should not count inside both parcels.""" + oa_geom = box(0, 0, 100, 100) + left = box(0, 0, 50, 100) + right = box(50, 0, 100, 100) + points = np.array( + [ + [50, 50], # postcode A, exactly on shared parcel boundary + [75, 50], # postcode B, strictly inside right parcel + ] + ) + postcodes = ["A", "B"] + + fragments = 
process_oa(oa_geom, points, postcodes, inspire_candidates=[left, right]) + frag_dict = dict(fragments) + + assert "A" in frag_dict and "B" in frag_dict + assert right.difference(frag_dict["B"]).area < 0.01 + + def test_disconnected_nearest_fragments_can_go_to_different_postcodes(self): + """A split unoccupied parcel should be assigned component by component.""" + oa_geom = box(0, 0, 100, 100) + contained_b = box(40, 0, 60, 100) + unoccupied = box(0, 40, 100, 60) + points = np.array( + [ + [10, 20], # postcode A, nearest to left split fragment + [50, 20], # postcode B, inside contained_b but outside unoccupied + [90, 20], # postcode C, nearest to right split fragment + ] + ) + postcodes = ["A", "B", "C"] + + fragments = process_oa( + oa_geom, + points, + postcodes, + inspire_candidates=[contained_b, unoccupied], + ) + frag_dict = dict(fragments) + + assert box(0, 40, 40, 60).difference(frag_dict["A"]).area < 0.01 + assert box(60, 40, 100, 60).difference(frag_dict["C"]).area < 0.01 + + def test_overlapping_nearest_parcels_do_not_overlap_in_output(self): + """Two unoccupied nearest-assigned parcels should be resolved cleanly.""" + oa_geom = box(0, 0, 100, 100) + left = box(0, 0, 70, 100) + right = box(30, 0, 100, 100) + points = np.array( + [ + [10, 50], # postcode A, nearest to left parcel + [90, 50], # postcode B, nearest to right parcel + ] + ) + postcodes = ["A", "B"] + + fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[left, right]) + frag_dict = dict(fragments) + + assert "A" in frag_dict and "B" in frag_dict + assert frag_dict["A"].intersection(frag_dict["B"]).area < 0.01 + + def test_mixed_inspire_and_voronoi_covers_oa_without_overlap(self): + """Parcel claims plus Voronoi fallback should cover the whole OA.""" + oa_geom = box(0, 0, 100, 100) + contained_a = box(0, 0, 30, 100) + unoccupied = box(70, 0, 90, 100) + points = np.array( + [ + [10, 50], + [90, 50], + ] + ) + postcodes = ["A", "B"] + + fragments = process_oa( + oa_geom, + 
points, + postcodes, + inspire_candidates=[contained_a, unoccupied], + ) + geoms = [geom for _, geom in fragments] + covered = unary_union(geoms) + overlap = sum(geom.area for geom in geoms) - covered.area + + assert covered.area == pytest.approx(oa_geom.area) + assert oa_geom.difference(covered).area < 0.01 + assert overlap < 0.01 + + def test_inspire_parcel_straddling_oa_is_clipped(self): + """INSPIRE parcels crossing the OA boundary should not leak outside it.""" + oa_geom = box(0, 0, 100, 100) + straddling = box(80, 0, 140, 100) + points = np.array( + [ + [10, 50], + [90, 50], + ] + ) + postcodes = ["A", "B"] + + fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[straddling]) + + for _, geom in fragments: + assert geom.difference(oa_geom).area < 0.01 + + # --------------------------------------------------------------------------- # _extract_polygonal helper # --------------------------------------------------------------------------- diff --git a/pipeline/transform/postcode_boundaries/voronoi.py b/pipeline/transform/postcode_boundaries/voronoi.py index cdff159..21ce654 100644 --- a/pipeline/transform/postcode_boundaries/voronoi.py +++ b/pipeline/transform/postcode_boundaries/voronoi.py @@ -52,9 +52,17 @@ def compute_voronoi_regions( if len(unique_pts) == 1: return {unique_pcs[0]: boundary} + if not boundary.is_valid: + boundary = make_valid(boundary) + pts = np.array(unique_pts) - min_e, min_n = pts.min(axis=0) - max_e, max_n = pts.max(axis=0) + pts_min_e, pts_min_n = pts.min(axis=0) + pts_max_e, pts_max_n = pts.max(axis=0) + boundary_min_e, boundary_min_n, boundary_max_e, boundary_max_n = boundary.bounds + min_e = min(pts_min_e, boundary_min_e) + min_n = min(pts_min_n, boundary_min_n) + max_e = max(pts_max_e, boundary_max_e) + max_n = max(pts_max_n, boundary_max_n) span = max(max_e - min_e, max_n - min_n, 100) dummy = np.array( @@ -79,9 +87,6 @@ def compute_voronoi_regions( n_real = len(pts) pc_polys: dict[str, list[Polygon]] = 
defaultdict(list) - if not boundary.is_valid: - boundary = make_valid(boundary) - for i in range(n_real): region_idx = vor.point_region[i] region = vor.regions[region_idx] diff --git a/pipeline/transform/test_enrich_actual_listings.py b/pipeline/transform/test_enrich_actual_listings.py deleted file mode 100644 index 86eeca4..0000000 --- a/pipeline/transform/test_enrich_actual_listings.py +++ /dev/null @@ -1,143 +0,0 @@ -from pathlib import Path - -import polars as pl - -from pipeline.transform.enrich_actual_listings import build_enriched_actual_listings - - -def test_build_enriched_actual_listings_joins_postcode_and_property_features( - tmp_path: Path, -) -> None: - listings_path = tmp_path / "listings.parquet" - properties_path = tmp_path / "properties.parquet" - postcode_path = tmp_path / "postcode.parquet" - arcgis_path = tmp_path / "arcgis.parquet" - output_path = tmp_path / "online_listings_buy_enriched.parquet" - - pl.DataFrame( - { - "Bedrooms": [2], - "Bathrooms": [1], - "Number of bedrooms & living rooms": [3], - "lon": [-0.1], - "lat": [51.5], - "Postcode": ["AA1 1AB"], - "Address per Property Register": ["1 High Street"], - "Leasehold/Freehold": [None], - "Property type": ["Terraced"], - "Property sub-type": ["Terraced"], - "Price qualifier": [""], - "Total floor area (sqm)": [None], - "Listing URL": ["https://example.test/listing"], - "Listing features": [["Garden"]], - "Listing date": [None], - "Listing status": ["For sale"], - "Asking price": [300_000], - "Asking price per sqm": [None], - }, - schema={ - "Bedrooms": pl.Int32, - "Bathrooms": pl.Int32, - "Number of bedrooms & living rooms": pl.Int32, - "lon": pl.Float64, - "lat": pl.Float64, - "Postcode": pl.Utf8, - "Address per Property Register": pl.Utf8, - "Leasehold/Freehold": pl.Utf8, - "Property type": pl.Utf8, - "Property sub-type": pl.Utf8, - "Price qualifier": pl.Utf8, - "Total floor area (sqm)": pl.Float64, - "Listing URL": pl.Utf8, - "Listing features": pl.List(pl.Utf8), - "Listing date": 
pl.Datetime("us"), - "Listing status": pl.Utf8, - "Asking price": pl.Int64, - "Asking price per sqm": pl.Int32, - }, - ).write_parquet(listings_path) - - pl.DataFrame( - { - "Address per Property Register": ["1 HIGH STREET"], - "Postcode": ["AA1 1AA"], - "Leasehold/Freehold": ["Freehold"], - "Address per EPC": ["1 High Street"], - "Current energy rating": ["C"], - "Potential energy rating": ["B"], - "Total floor area (sqm)": [80.0], - "Number of bedrooms & living rooms": [4], - "Interior height (m)": [2.4], - "Construction year": [1935], - "Former council house": ["No"], - "Listed building": ["No"], - "Estimated monthly rent": [1200.0], - "Street tree density percentile": [75.0], - "Property type": ["Terraced"], - "Estimated current price": [310_000.0], - }, - schema={ - "Address per Property Register": pl.Utf8, - "Postcode": pl.Utf8, - "Leasehold/Freehold": pl.Utf8, - "Address per EPC": pl.Utf8, - "Current energy rating": pl.Utf8, - "Potential energy rating": pl.Utf8, - "Total floor area (sqm)": pl.Float64, - "Number of bedrooms & living rooms": pl.Int32, - "Interior height (m)": pl.Float64, - "Construction year": pl.UInt16, - "Former council house": pl.Utf8, - "Listed building": pl.Utf8, - "Estimated monthly rent": pl.Float32, - "Street tree density percentile": pl.Float32, - "Property type": pl.Utf8, - "Estimated current price": pl.Float64, - }, - ).write_parquet(properties_path) - - pl.DataFrame( - { - "Postcode": ["AA1 1AA"], - "Income Score": [82.5], - "Within conservation area": ["Yes"], - } - ).write_parquet(postcode_path) - - pl.DataFrame( - { - "pcds": ["AA1 1AA", "AA1 1AB"], - "ctry25cd": ["E92000001", "E92000001"], - "doterm": [None, "202401"], - "east1m": [100.0, 105.0], - "north1m": [100.0, 105.0], - }, - schema={ - "pcds": pl.Utf8, - "ctry25cd": pl.Utf8, - "doterm": pl.Utf8, - "east1m": pl.Float64, - "north1m": pl.Float64, - }, - ).write_parquet(arcgis_path) - - result = build_enriched_actual_listings( - listings_path=listings_path, - 
properties_path=properties_path, - postcode_features_path=postcode_path, - arcgis_path=arcgis_path, - output_path=output_path, - epc_path=None, - ) - - row = result.row(0, named=True) - assert output_path.exists() - assert row["Postcode"] == "AA1 1AA" - assert row["Historical property match status"] == "matched" - assert row["Income Score"] == 82.5 - assert row["Within conservation area"] == "Yes" - assert row["Leasehold/Freehold"] == "Freehold" - assert row["Total floor area (sqm)"] == 80.0 - assert row["Asking price per sqm"] == 3750 - assert row["Estimated current price"] == 300_000 - assert row["Current energy rating"] == "C" diff --git a/pipeline/transform/test_merge.py b/pipeline/transform/test_merge.py index e2cfa89..2597730 100644 --- a/pipeline/transform/test_merge.py +++ b/pipeline/transform/test_merge.py @@ -2,16 +2,23 @@ import polars as pl import pyarrow as pa import pytest from shapely import box, to_wkb +from shapely.geometry import Point from pipeline.transform.merge import ( _AREA_COLUMNS, CONSERVATION_AREA_FEATURE, LISTED_BUILDING_FEATURE, TREE_DENSITY_FEATURE, - _is_unpublished_conservation_area_record, + _LISTING_OVERLAY_SOURCES, + _build_unmatched_listing_seed_rows, + _canonical_postcode_expr, + _finalize_listings, + _integrate_listings, + _match_direct_epc, _is_dynamic_poi_metric_column, _less_deprived_percentile_expr, _load_conservation_area_geometries, + _load_listings_for_merge, _matched_listed_building_flags, _postcode_conservation_area_flags, _postcode_listed_building_candidates, @@ -85,31 +92,28 @@ def test_postcode_conservation_area_flags_marks_point_membership() -> None: ] -def test_unpublished_conservation_area_records_are_identified() -> None: - assert _is_unpublished_conservation_area_record( - "No data available for publication by HE" - ) - assert not _is_unpublished_conservation_area_record("Bloomsbury") - assert not _is_unpublished_conservation_area_record(None) - - -def 
test_load_conservation_area_geometries_skips_unpublished_placeholders( +def test_load_conservation_area_geometries_uses_current_planning_data_records( monkeypatch: pytest.MonkeyPatch, tmp_path, ) -> None: real_area = box(0, 0, 1, 1) - placeholder_area = box(-100, -100, 100, 100) + ended_area = box(2, 2, 3, 3) + other_dataset_area = box(4, 4, 5, 5) + point = Point(0.5, 0.5) - def fake_read_arrow(path, columns): - assert path == tmp_path / "conservation_areas.gpkg" - assert columns == ["NAME"] + def fake_read_arrow(path): + assert path == tmp_path / "conservation_areas.geojson" table = pa.table( { - "NAME": [ - "Central Village", - "No data available for publication by HE", + "dataset": [ + "conservation-area", + "conservation-area", + "listed-building", + "conservation-area", ], - "SHAPE": to_wkb([real_area, placeholder_area]), + "end-date": ["", "2025-01-01", "", ""], + "name": ["Central Village", "Old Boundary", "Other", "Point Record"], + "SHAPE": to_wkb([real_area, ended_area, other_dataset_area, point]), } ) return {"geometry_name": "SHAPE", "crs": "EPSG:4326"}, table @@ -117,7 +121,7 @@ def test_load_conservation_area_geometries_skips_unpublished_placeholders( monkeypatch.setattr("pipeline.transform.merge.pyogrio.read_arrow", fake_read_arrow) geometries, crs = _load_conservation_area_geometries( - tmp_path / "conservation_areas.gpkg" + tmp_path / "conservation_areas.geojson" ) assert crs == "EPSG:4326" @@ -290,3 +294,440 @@ def test_tree_density_by_postcode_requires_postcode_and_density_columns( with pytest.raises(ValueError, match="missing required column: postcode"): _tree_density_by_postcode(missing_postcode_path) + + +def _sample_listings_frame() -> pl.DataFrame: + return pl.DataFrame( + { + "Bedrooms": [3], + "Bathrooms": [2], + "Number of bedrooms & living rooms": [4], + "lon": [-0.1], + "lat": [51.5], + "Postcode": ["sw1a1aa"], + "Address per Property Register": ["1 Example Road"], + "Leasehold/Freehold": ["Freehold"], + "Property type": ["Terraced"], + 
"Property sub-type": ["Mid-Terrace"], + "Price qualifier": [""], + "Total floor area (sqm)": [120.0], + "Listing URL": ["https://example.test/abc"], + "Listing features": [["Garden", "Off-street parking"]], + "Listing date": [None], + "Listing status": ["For sale"], + "Asking price": [750_000], + "Asking price per sqm": [6_250], + }, + schema={ + "Bedrooms": pl.Int32, + "Bathrooms": pl.Int32, + "Number of bedrooms & living rooms": pl.Int32, + "lon": pl.Float64, + "lat": pl.Float64, + "Postcode": pl.Utf8, + "Address per Property Register": pl.Utf8, + "Leasehold/Freehold": pl.Utf8, + "Property type": pl.Utf8, + "Property sub-type": pl.Utf8, + "Price qualifier": pl.Utf8, + "Total floor area (sqm)": pl.Float64, + "Listing URL": pl.Utf8, + "Listing features": pl.List(pl.Utf8), + "Listing date": pl.Datetime("us"), + "Listing status": pl.Utf8, + "Asking price": pl.Int64, + "Asking price per sqm": pl.Int32, + }, + ) + + +def _stub_arcgis(path) -> None: + pl.DataFrame( + { + "pcds": ["SW1A 1AA"], + "ctry25cd": ["E92000001"], + "doterm": [None], + "east1m": [530000.0], + "north1m": [180000.0], + }, + schema={ + "pcds": pl.Utf8, + "ctry25cd": pl.Utf8, + "doterm": pl.Utf8, + "east1m": pl.Float64, + "north1m": pl.Float64, + }, + ).write_parquet(path) + + +def test_canonical_postcode_expr_formats_compact_postcodes() -> None: + df = pl.DataFrame({"Postcode": ["sw1a1aa", "SW1A 1AA", "bad", None]}) + result = df.with_columns(_canonical_postcode_expr("Postcode").alias("canonical")) + assert result["canonical"].to_list() == ["SW1A 1AA", "SW1A 1AA", None, None] + + +def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns( + tmp_path, +) -> None: + listings_path = tmp_path / "listings.parquet" + arcgis_path = tmp_path / "arcgis.parquet" + _sample_listings_frame().write_parquet(listings_path) + _stub_arcgis(arcgis_path) + + loaded = _load_listings_for_merge(listings_path, arcgis_path) + + assert loaded["postcode"].to_list() == ["SW1A 1AA"] + assert 
loaded["pp_address"].to_list() == ["1 Example Road"] + assert loaded["_actual_listing_url"].to_list() == ["https://example.test/abc"] + assert loaded["_actual_asking_price"].to_list() == [750_000] + assert loaded["_actual_lat"].to_list() == [51.5] + + +def test_build_unmatched_listing_seed_rows_fills_property_shape_fields( + tmp_path, +) -> None: + listings_path = tmp_path / "listings.parquet" + arcgis_path = tmp_path / "arcgis.parquet" + _sample_listings_frame().write_parquet(listings_path) + _stub_arcgis(arcgis_path) + + listings = _load_listings_for_merge(listings_path, arcgis_path) + template_schema = pl.Schema( + { + "postcode": pl.Utf8, + "pp_address": pl.Utf8, + "pp_property_type": pl.Utf8, + "duration": pl.Utf8, + "total_floor_area": pl.Float64, + "number_habitable_rooms": pl.Int16, + "latest_price": pl.Int64, + "epc_address": pl.Utf8, + **{dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES}, + } + ) + unmatched_idxs = listings.select("_listing_idx") + + seed = _build_unmatched_listing_seed_rows( + unmatched_idxs, listings, template_schema + ) + + assert seed.height == 1 + assert seed["postcode"].to_list() == ["SW1A 1AA"] + assert seed["pp_address"].to_list() == ["1 Example Road"] + assert seed["pp_property_type"].to_list() == ["Terraced"] + assert seed["duration"].to_list() == ["Freehold"] + assert seed["total_floor_area"].to_list() == [120.0] + assert seed["number_habitable_rooms"].to_list() == [4] + assert seed["latest_price"].to_list() == [750_000] + # Columns not populated from the listing default to null. + assert seed["epc_address"].to_list() == [None] + # Overlay columns flow through 1:1. 
+ assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"] + + +def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks( + tmp_path, +) -> None: + listings_path = tmp_path / "listings.parquet" + arcgis_path = tmp_path / "arcgis.parquet" + _sample_listings_frame().with_columns( + pl.lit(None, dtype=pl.Float64).alias("Total floor area (sqm)"), + pl.lit(None, dtype=pl.Int32).alias("Number of bedrooms & living rooms"), + ).write_parquet(listings_path) + _stub_arcgis(arcgis_path) + + listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns( + pl.lit("1 Example Road").alias("_direct_epc_address"), + pl.lit("C").alias("_direct_current_energy_rating"), + pl.lit("B").alias("_direct_potential_energy_rating"), + pl.lit(98.0).alias("_direct_total_floor_area"), + pl.lit(4, dtype=pl.Int16).alias("_direct_number_habitable_rooms"), + pl.lit(2.4).alias("_direct_floor_height"), + pl.lit(1930, dtype=pl.UInt16).alias("_direct_construction_age_band"), + pl.lit(1, dtype=pl.UInt8).alias("_direct_is_construction_date_approximate"), + pl.lit("No").alias("_direct_was_council_house"), + ) + template_schema = pl.Schema( + { + "postcode": pl.Utf8, + "pp_address": pl.Utf8, + "total_floor_area": pl.Float64, + "number_habitable_rooms": pl.Int16, + "epc_address": pl.Utf8, + "current_energy_rating": pl.Utf8, + "was_council_house": pl.Utf8, + **{dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES}, + } + ) + + seed = _build_unmatched_listing_seed_rows( + listings.select("_listing_idx"), listings, template_schema + ) + + assert seed["total_floor_area"].to_list() == [98.0] + assert seed["number_habitable_rooms"].to_list() == [4] + assert seed["epc_address"].to_list() == ["1 Example Road"] + assert seed["current_energy_rating"].to_list() == ["C"] + assert seed["was_council_house"].to_list() == ["No"] + + +def test_match_direct_epc_considers_nearby_postcodes() -> None: + listing_matches = pl.DataFrame( + { + "_listing_idx": [0], + 
"_listing_match_address": ["1 EXAMPLE ROAD"], + "_listing_match_postcode": ["AA11AA"], + "_listing_east": [1000.0], + "_listing_north": [1000.0], + "_actual_property_type": ["Terraced"], + "_actual_total_floor_area": [100.0], + "_actual_number_habitable_rooms": [4], + }, + schema={ + "_listing_idx": pl.UInt32, + "_listing_match_address": pl.Utf8, + "_listing_match_postcode": pl.Utf8, + "_listing_east": pl.Float64, + "_listing_north": pl.Float64, + "_actual_property_type": pl.Utf8, + "_actual_total_floor_area": pl.Float64, + "_actual_number_habitable_rooms": pl.Int16, + }, + ) + epc_candidates = pl.DataFrame( + { + "_direct_epc_row": [0], + "_direct_epc_match_address": ["1 EXAMPLE ROAD"], + "_direct_epc_match_postcode": ["BB11BB"], + "_direct_epc_east": [1020.0], + "_direct_epc_north": [1010.0], + "_direct_epc_canonical_property_type": ["Terraced"], + "_direct_epc_address": ["1, Example Road"], + "_direct_current_energy_rating": ["C"], + "_direct_potential_energy_rating": ["B"], + "_direct_total_floor_area": [101.0], + "_direct_number_habitable_rooms": [4], + "_direct_floor_height": [2.5], + "_direct_construction_age_band": [1930], + "_direct_is_construction_date_approximate": [1], + "_direct_was_council_house": ["No"], + }, + schema={ + "_direct_epc_row": pl.UInt32, + "_direct_epc_match_address": pl.Utf8, + "_direct_epc_match_postcode": pl.Utf8, + "_direct_epc_east": pl.Float64, + "_direct_epc_north": pl.Float64, + "_direct_epc_canonical_property_type": pl.Utf8, + "_direct_epc_address": pl.Utf8, + "_direct_current_energy_rating": pl.Utf8, + "_direct_potential_energy_rating": pl.Utf8, + "_direct_total_floor_area": pl.Float64, + "_direct_number_habitable_rooms": pl.Int16, + "_direct_floor_height": pl.Float64, + "_direct_construction_age_band": pl.UInt16, + "_direct_is_construction_date_approximate": pl.UInt8, + "_direct_was_council_house": pl.Utf8, + }, + ) + + matches = _match_direct_epc(listing_matches, epc_candidates) + + assert matches.height == 1 + assert 
matches["_listing_idx"].to_list() == [0] + assert matches["_direct_epc_address"].to_list() == ["1, Example Road"] + + +def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None: + listings_path = tmp_path / "listings.parquet" + arcgis_path = tmp_path / "arcgis.parquet" + _sample_listings_frame().write_parquet(listings_path) + _stub_arcgis(arcgis_path) + wide = pl.DataFrame( + { + "postcode": ["SW1A 1AA", "SW1A 1AA"], + "pp_address": ["9 Other Road", "1 Example Road"], + "pp_property_type": ["Detached", "Terraced"], + "duration": ["Freehold", "Freehold"], + "total_floor_area": [80.0, 90.0], + "number_habitable_rooms": [3, 4], + "latest_price": [500_000, 600_000], + "epc_address": [None, "1 Example Road"], + "current_energy_rating": [None, "C"], + "potential_energy_rating": [None, "B"], + "floor_height": [None, 2.4], + "construction_age_band": [None, 1930], + "is_construction_date_approximate": [None, 1], + "was_council_house": [None, "No"], + }, + schema={ + "postcode": pl.Utf8, + "pp_address": pl.Utf8, + "pp_property_type": pl.Utf8, + "duration": pl.Utf8, + "total_floor_area": pl.Float64, + "number_habitable_rooms": pl.Int16, + "latest_price": pl.Int64, + "epc_address": pl.Utf8, + "current_energy_rating": pl.Utf8, + "potential_energy_rating": pl.Utf8, + "floor_height": pl.Float64, + "construction_age_band": pl.UInt16, + "is_construction_date_approximate": pl.UInt8, + "was_council_house": pl.Utf8, + }, + ) + + integrated = _integrate_listings( + wide.lazy(), listings_path, arcgis_path, epc_path=None + ).collect() + + matched = integrated.filter(pl.col("pp_address") == "1 Example Road") + other = integrated.filter(pl.col("pp_address") == "9 Other Road") + assert matched["_actual_listing_url"].to_list() == ["https://example.test/abc"] + assert other["_actual_listing_url"].to_list() == [None] + + +def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) -> None: + listings_path = tmp_path / "listings.parquet" + 
arcgis_path = tmp_path / "arcgis.parquet" + _sample_listings_frame().with_columns( + pl.lit("Rose Cottage High Street").alias("Address per Property Register"), + ).write_parquet(listings_path) + _stub_arcgis(arcgis_path) + wide = pl.DataFrame( + { + "postcode": ["SW1A 1AA"], + "pp_address": ["Old Cottage High Street"], + "pp_property_type": ["Terraced"], + "duration": ["Freehold"], + "total_floor_area": [120.0], + "number_habitable_rooms": [4], + "latest_price": [750_000], + "epc_address": ["Old Cottage High Street"], + "current_energy_rating": ["C"], + "potential_energy_rating": ["B"], + "floor_height": [2.4], + "construction_age_band": [1930], + "is_construction_date_approximate": [1], + "was_council_house": ["No"], + }, + schema={ + "postcode": pl.Utf8, + "pp_address": pl.Utf8, + "pp_property_type": pl.Utf8, + "duration": pl.Utf8, + "total_floor_area": pl.Float64, + "number_habitable_rooms": pl.Int16, + "latest_price": pl.Int64, + "epc_address": pl.Utf8, + "current_energy_rating": pl.Utf8, + "potential_energy_rating": pl.Utf8, + "floor_height": pl.Float64, + "construction_age_band": pl.UInt16, + "is_construction_date_approximate": pl.UInt8, + "was_council_house": pl.Utf8, + }, + ) + + integrated = _integrate_listings( + wide.lazy(), listings_path, arcgis_path, epc_path=None + ).collect() + + existing = integrated.filter(pl.col("pp_address") == "Old Cottage High Street") + seed = integrated.filter(pl.col("pp_address") == "Rose Cottage High Street") + assert existing["_actual_listing_url"].to_list() == [None] + assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"] + + +def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> ( + None +): + df = pl.DataFrame( + { + "Postcode": ["SW1A 1AA", "SW1A 1AA"], + "Address per Property Register": ["1 Example Road", "2 Example Road"], + "Address per EPC": ["1 Example Road", None], + "Date of last transaction": [1990.0, None], + "lat": [51.5, 51.5], + "lon": [-0.1, -0.1], + 
"Total floor area (sqm)": [100.0, 95.0], + "Number of bedrooms & living rooms": [3, None], + "Property type": ["Terraced", None], + "Leasehold/Freehold": ["Leasehold", None], + "Last known price": [500_000, None], + "Street tree density percentile": [42.0, 42.0], + # Overlay columns: row 0 is a matched listing, row 1 is unmatched, row none. + "_actual_listing_url": ["url0", "url1"], + "_actual_asking_price": [600_000, 700_000], + "_actual_asking_price_per_sqm": [5_000, None], + "_actual_listing_date": [None, None], + "_actual_listing_status": ["For sale", "For sale"], + "_actual_listing_features": [["Garden"], ["Parking"]], + "_actual_bedrooms": [3, 4], + "_actual_bathrooms": [1, 2], + "_actual_price_qualifier": ["", ""], + "_actual_property_sub_type": ["Mid-Terrace", "End-Terrace"], + "_actual_lat": [51.51, 51.52], + "_actual_lon": [-0.11, -0.12], + "_actual_total_floor_area": [110.0, None], + "_actual_number_habitable_rooms": [4, 3], + "_actual_property_type": ["Terraced", "Flats/Maisonettes"], + "_actual_leasehold_freehold": ["Freehold", "Leasehold"], + }, + schema={ + "Postcode": pl.Utf8, + "Address per Property Register": pl.Utf8, + "Address per EPC": pl.Utf8, + "Date of last transaction": pl.Float64, + "lat": pl.Float64, + "lon": pl.Float64, + "Total floor area (sqm)": pl.Float64, + "Number of bedrooms & living rooms": pl.Int16, + "Property type": pl.Utf8, + "Leasehold/Freehold": pl.Utf8, + "Last known price": pl.Int64, + "Street tree density percentile": pl.Float32, + "_actual_listing_url": pl.Utf8, + "_actual_asking_price": pl.Int64, + "_actual_asking_price_per_sqm": pl.Int32, + "_actual_listing_date": pl.Datetime("us"), + "_actual_listing_status": pl.Utf8, + "_actual_listing_features": pl.List(pl.Utf8), + "_actual_bedrooms": pl.Int32, + "_actual_bathrooms": pl.Int32, + "_actual_price_qualifier": pl.Utf8, + "_actual_property_sub_type": pl.Utf8, + "_actual_lat": pl.Float64, + "_actual_lon": pl.Float64, + "_actual_total_floor_area": pl.Float64, + 
"_actual_number_habitable_rooms": pl.Int16, + "_actual_property_type": pl.Utf8, + "_actual_leasehold_freehold": pl.Utf8, + }, + ) + + finalized = _finalize_listings(df).sort("Address per Property Register") + + assert finalized.height == 2 + assert finalized["Listing URL"].to_list() == ["url0", "url1"] + assert finalized["Asking price"].to_list() == [600_000, 700_000] + assert finalized["Asking price per sqm"].to_list() == [5_000, 7_368] + assert finalized["Est. price per sqm"].to_list() == [5_000, 7_368] + assert finalized["Estimated current price"].to_list() == [600_000, 700_000] + assert finalized["Last known price"].to_list() == [500_000, 700_000] + # Listing's preferred floor area / rooms / property type / tenure. + assert finalized["Total floor area (sqm)"].to_list() == [110.0, 95.0] + assert finalized["Number of bedrooms & living rooms"].to_list() == [4, 3] + assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"] + assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"] + # Postcode-level feature carried through to both matched and unmatched rows. + assert finalized["Street tree density percentile"].to_list() == [42.0, 42.0] + # Match status reflects historical context availability. + assert finalized["Historical property match status"].to_list() == [ + "matched", + "unmatched", + ] + # Overlay scaffolding is dropped. 
+ for src, dst, _dt in _LISTING_OVERLAY_SOURCES: + assert dst not in finalized.columns, src diff --git a/server-rs/src/data/poi.rs b/server-rs/src/data/poi.rs index beb08d0..0fb3b50 100644 --- a/server-rs/src/data/poi.rs +++ b/server-rs/src/data/poi.rs @@ -386,8 +386,7 @@ fn build_school_meta( let website = extract_optional_str_col(df, "school_website")?.unwrap_or_default(); let telephone = extract_optional_str_col(df, "school_telephone")?.unwrap_or_default(); let head_name = extract_optional_str_col(df, "school_head_name")?.unwrap_or_default(); - let ofsted_rating = - extract_optional_str_col(df, "school_ofsted_rating")?.unwrap_or_default(); + let ofsted_rating = extract_optional_str_col(df, "school_ofsted_rating")?.unwrap_or_default(); let fetch_str = |col: &Vec>, row: usize| -> Option { col.get(row).cloned().flatten() diff --git a/server-rs/src/features.rs b/server-rs/src/features.rs index 849a1a0..64bdf5e 100644 --- a/server-rs/src/features.rs +++ b/server-rs/src/features.rs @@ -184,7 +184,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ name: "Within conservation area", order: Some(&["Yes", "No"]), description: "Whether the postcode point falls inside a designated conservation area", - detail: "Historic England conservation area boundaries, matched to the postcode representative point. The national dataset is indicative rather than definitive, so boundary-sensitive decisions should be checked with the local planning authority.", + detail: "Planning Data conservation area boundaries, matched to the postcode representative point. 
The national dataset is a work in progress and may include duplicates or incomplete local coverage, so boundary-sensitive decisions should be checked with the local planning authority.", source: "conservation-areas", }), Feature::Enum(EnumFeatureConfig { diff --git a/server-rs/src/main.rs b/server-rs/src/main.rs index 0502185..80bf040 100644 --- a/server-rs/src/main.rs +++ b/server-rs/src/main.rs @@ -167,6 +167,10 @@ struct Cli { #[arg(long)] tiles: PathBuf, + /// Optional PMTiles raster basemap for satellite imagery. + #[arg(long, env = "SATELLITE_TILES")] + satellite_tiles: Option, + /// Optional PMTiles raster overlay for high-resolution strategic noise. #[arg(long, env = "NOISE_OVERLAY_TILES")] noise_overlay_tiles: Option, @@ -475,6 +479,8 @@ async fn main() -> anyhow::Result<()> { tiles_path, "noise_lden_10m.pmtiles", ); + let satellite_tiles = + configured_or_default_overlay_path(&cli.satellite_tiles, tiles_path, "satellite.pmtiles"); let crime_hotspot_tiles = configured_or_default_overlay_path( &cli.crime_hotspot_tiles, tiles_path, @@ -488,6 +494,7 @@ async fn main() -> anyhow::Result<()> { let noise_overlay_reader = init_optional_tile_reader("Noise", noise_overlay_tiles.as_ref()).await?; + let satellite_reader = init_optional_tile_reader("Satellite", satellite_tiles.as_ref()).await?; let crime_hotspot_reader = init_optional_tile_reader("Crime hotspots", crime_hotspot_tiles.as_ref()).await?; let tree_overlay_reader = @@ -692,6 +699,7 @@ async fn main() -> anyhow::Result<()> { let reader_tile = tile_reader.clone(); let reader_style = tile_reader.clone(); + let reader_satellite = satellite_reader.clone(); let reader_noise_overlay = noise_overlay_reader.clone(); let reader_crime_hotspot = crime_hotspot_reader.clone(); let reader_tree_overlay = tree_overlay_reader.clone(); @@ -858,6 +866,18 @@ async fn main() -> anyhow::Result<()> { }) .layer(ConcurrencyLimitLayer::new(20)), ) + .route( + "/api/tiles/satellite/{z}/{x}/{y}", + get(move |path| { + 
routes::get_overlay_tile( + reader_satellite.clone(), + routes::OverlayTileFormat::RasterJpeg, + "satellite", + path, + ) + }) + .layer(ConcurrencyLimitLayer::new(30)), + ) .route( "/api/overlays/noise/{z}/{x}/{y}", get(move |path| { diff --git a/server-rs/src/parsing.rs b/server-rs/src/parsing.rs index c5ee2c7..20f5639 100644 --- a/server-rs/src/parsing.rs +++ b/server-rs/src/parsing.rs @@ -6,6 +6,7 @@ mod h3; pub use bounds::{bounds_intersect, h3_cell_bounds, parse_bounds, require_bounds}; pub use fields::{ parse_enum_dist, parse_field_indices, parse_field_indices_with_poi, parse_field_set, + ParsedFieldIndices, }; pub use filters::{ count_filter_impacts, count_filter_rejections, parse_filters, parse_filters_with_poi, diff --git a/server-rs/src/routes/actual_listings.rs b/server-rs/src/routes/actual_listings.rs index e00818f..8a75e27 100644 --- a/server-rs/src/routes/actual_listings.rs +++ b/server-rs/src/routes/actual_listings.rs @@ -40,7 +40,13 @@ pub struct ActualListingsResponse { pub truncated: bool, } -const KEEP_UNKNOWN_LISTING_FILTER_FEATURES: &[&str] = &["Total floor area (sqm)"]; +const KEEP_UNKNOWN_LISTING_FILTER_FEATURES: &[&str] = &[ + "Total floor area (sqm)", + "Leasehold/Freehold", + "Number of bedrooms & living rooms", + "Property type", +]; +const LISTING_BOUNDS_EPSILON_DEGREES: f64 = 0.00001; pub async fn get_actual_listings( State(shared): State>, @@ -98,14 +104,29 @@ pub async fn get_actual_listings( }; let row_indices = actual_listings.grid.query(south, west, north, east); - let total_in_bounds = row_indices.len(); - - // Build (row, sort_key) pairs so we can sort by index without - // materializing the full ActualListing for every matching row. 
- let mut matching_rows: Vec = row_indices + let total_grid_candidates = row_indices.len(); + let candidate_rows: Vec = row_indices .iter() .filter_map(|&row_idx| { let row = row_idx as usize; + row_is_within_bounds( + actual_listings.lat[row], + actual_listings.lon[row], + south, + west, + north, + east, + ) + .then_some(row) + }) + .collect(); + let total_in_bounds = candidate_rows.len(); + + // Build (row, sort_key) pairs so we can sort by index without + // materializing the full ActualListing for every matching row. + let mut matching_rows: Vec = candidate_rows + .into_iter() + .filter(|&row| { if has_listing_filters && !row_passes_listing_filters( row, @@ -116,7 +137,7 @@ pub async fn get_actual_listings( &keep_unknown_listing_filter_idxs, ) { - return None; + return false; } if has_poi_filters && !row_passes_listing_poi_filters( @@ -126,7 +147,7 @@ pub async fn get_actual_listings( poi_num_features, ) { - return None; + return false; } if has_travel_filters && !row_passes_travel_filters( @@ -135,9 +156,9 @@ pub async fn get_actual_listings( &travel_data, ) { - return None; + return false; } - Some(row) + true }) .collect(); @@ -162,6 +183,7 @@ pub async fn get_actual_listings( results = listings.len(), total = total_matching, total_in_bounds, + total_grid_candidates, offset, listing_filtered = has_listing_filters, poi_filtered = has_poi_filters, @@ -214,10 +236,23 @@ fn row_passes_listing_filters( } }) && enum_filters.iter().all(|filter| { let raw = feature_data[base + filter.feat_idx]; - raw != NAN_U16 && filter.allowed.contains(&raw) + if raw == NAN_U16 { + keep_unknown_filter_idxs.contains(&filter.feat_idx) + } else { + filter.allowed.contains(&raw) + } }) } +fn row_is_within_bounds(lat: f32, lon: f32, south: f64, west: f64, north: f64, east: f64) -> bool { + let lat = lat as f64; + let lon = lon as f64; + lat >= south - LISTING_BOUNDS_EPSILON_DEGREES + && lat <= north + LISTING_BOUNDS_EPSILON_DEGREES + && lon >= west - LISTING_BOUNDS_EPSILON_DEGREES + && 
lon <= east + LISTING_BOUNDS_EPSILON_DEGREES +} + fn row_passes_listing_poi_filters( row: usize, filters: &[ParsedPoiFilter], @@ -245,6 +280,20 @@ fn row_passes_listing_poi_filters( mod tests { use super::*; + #[test] + fn listing_bounds_check_keeps_only_exact_viewport_rows() { + assert!(row_is_within_bounds(51.5, -0.1, 51.4, -0.2, 51.6, 0.0)); + + // Bounds are inclusive so edge points are retained. + assert!(row_is_within_bounds(51.4, -0.2, 51.4, -0.2, 51.6, 0.0)); + assert!(row_is_within_bounds(51.6, 0.0, 51.4, -0.2, 51.6, 0.0)); + + assert!(!row_is_within_bounds(51.399, -0.1, 51.4, -0.2, 51.6, 0.0)); + assert!(!row_is_within_bounds(51.601, -0.1, 51.4, -0.2, 51.6, 0.0)); + assert!(!row_is_within_bounds(51.5, -0.201, 51.4, -0.2, 51.6, 0.0)); + assert!(!row_is_within_bounds(51.5, 0.001, 51.4, -0.2, 51.6, 0.0)); + } + #[test] fn listing_floor_area_filter_keeps_unknown_values() { let floor_area_filter = ParsedFilter { @@ -290,6 +339,48 @@ mod tests { )); } + #[test] + fn listing_enum_filter_keeps_allowlisted_unknown_values() { + let enum_filter = ParsedEnumFilter { + feat_idx: 0, + allowed: [1u16].into_iter().collect(), + }; + let keep_unknown_filter_idxs: FxHashSet = [0usize].into_iter().collect(); + + assert!(row_passes_listing_filters( + 0, + &[], + &[enum_filter], + &[NAN_U16], + 1, + &keep_unknown_filter_idxs + )); + + assert!(!row_passes_listing_filters( + 0, + &[], + &[ParsedEnumFilter { + feat_idx: 0, + allowed: [1u16].into_iter().collect(), + }], + &[2], + 1, + &keep_unknown_filter_idxs + )); + + assert!(row_passes_listing_filters( + 0, + &[], + &[ParsedEnumFilter { + feat_idx: 0, + allowed: [1u16].into_iter().collect(), + }], + &[1], + 1, + &keep_unknown_filter_idxs + )); + } + #[test] fn listing_poi_filter_uses_listing_metric_matrix() { let filter = ParsedPoiFilter { diff --git a/server-rs/src/routes/export.rs b/server-rs/src/routes/export.rs index cad4eb0..505e939 100644 --- a/server-rs/src/routes/export.rs +++ b/server-rs/src/routes/export.rs @@ -354,7 
+354,11 @@ pub async fn get_export( .map_err(|err| (StatusCode::BAD_REQUEST, err).into_response())? }; let has_poi_filters = !parsed_poi_filters.is_empty(); - let filters_str = if is_postcode_mode { None } else { params.filters }; + let filters_str = if is_postcode_mode { + None + } else { + params.filters + }; let travel_entries = if is_postcode_mode { Vec::new() } else { @@ -472,9 +476,10 @@ pub async fn get_export( let mut out: Vec<(usize, PostcodeExportAgg)> = Vec::with_capacity(entries.len()); for (pc_idx, _normalized) in entries { let mut agg = PostcodeExportAgg::new(total_export_features); - for &row_idx in state.data.rows_for_postcode( - &postcode_data.postcodes[*pc_idx], - ) { + for &row_idx in state + .data + .rows_for_postcode(&postcode_data.postcodes[*pc_idx]) + { agg.add_row( feature_data, row_idx as usize, @@ -518,7 +523,8 @@ pub async fn get_export( return; } if let Some(&pc_idx) = postcode_data.postcode_to_idx.get(postcode) { - by_pc.entry(pc_idx) + by_pc + .entry(pc_idx) .or_insert_with(|| PostcodeExportAgg::new(total_export_features)) .add_row( feature_data, @@ -531,10 +537,8 @@ pub async fn get_export( } }); - let mut aggs: Vec<(usize, PostcodeExportAgg)> = by_pc - .into_iter() - .filter(|(_, agg)| agg.count > 0) - .collect(); + let mut aggs: Vec<(usize, PostcodeExportAgg)> = + by_pc.into_iter().filter(|(_, agg)| agg.count > 0).collect(); // Sort by property count descending aggs.sort_unstable_by_key(|agg| std::cmp::Reverse(agg.1.count)); diff --git a/server-rs/src/routes/overlays.rs b/server-rs/src/routes/overlays.rs index 4a220af..0e7a369 100644 --- a/server-rs/src/routes/overlays.rs +++ b/server-rs/src/routes/overlays.rs @@ -12,6 +12,7 @@ use super::TileReader; pub enum OverlayTileFormat { VectorMvtGzip, RasterPng, + RasterJpeg, } impl OverlayTileFormat { @@ -19,6 +20,7 @@ impl OverlayTileFormat { match self { Self::VectorMvtGzip => "application/x-protobuf", Self::RasterPng => "image/png", + Self::RasterJpeg => "image/jpeg", } } diff --git 
a/server-rs/src/routes/postcode_properties.rs b/server-rs/src/routes/postcode_properties.rs index 44e40c2..f1a4d41 100644 --- a/server-rs/src/routes/postcode_properties.rs +++ b/server-rs/src/routes/postcode_properties.rs @@ -10,7 +10,10 @@ use tracing::{info, warn}; use crate::auth::OptionalUser; use crate::consts::{POSTCODE_SEARCH_OFFSET, PROPERTIES_LIMIT}; use crate::licensing::{check_license_point, resolve_share_code}; -use crate::parsing::{parse_filters_with_poi, row_passes_filters, row_passes_poi_filters}; +use crate::parsing::{ + parse_field_indices_with_poi, parse_filters_with_poi, row_passes_filters, + row_passes_poi_filters, +}; use crate::state::SharedState; use crate::utils::normalize_postcode; @@ -25,6 +28,10 @@ pub struct PostcodePropertiesParams { /// Optional min:max applies as a filter (exclude properties outside range). pub travel: Option, pub offset: Option, + /// `;;`-separated numeric feature names to include in each property payload. + /// If absent, keeps the legacy behavior and returns all numeric features. + /// If empty, returns only the fixed property card fields. + pub fields: Option, /// Exact address to rank first when opening properties from address search. pub focus_address: Option, /// Share-link code; grants bbox-scoped access for unlicensed users. 
@@ -76,6 +83,17 @@ pub async fn get_postcode_properties( let has_poi_filters = !parsed_poi_filters.is_empty(); let travel_entries = parse_optional_travel(params.travel.as_deref()) .map_err(|err| (StatusCode::BAD_REQUEST, err).into_response())?; + let field_indices = parse_field_indices_with_poi( + params.fields.as_deref(), + &state.feature_name_to_index, + &state.data.poi_metrics.name_to_index, + ) + .map_err(|err| (err.0, err.1).into_response())?; + let fields_count = field_indices + .normal + .as_ref() + .map(|indices| (indices.len() + field_indices.poi.len()) as i32) + .unwrap_or(-1); let postcode_str = normalized; let focus_address = params @@ -165,6 +183,7 @@ pub async fn get_postcode_properties( feature_names, feature_name_to_index, enum_values, + &field_indices, ) }) .collect(); @@ -177,6 +196,7 @@ pub async fn get_postcode_properties( offset = page_offset, filters = num_filters, filters_raw = filters_str.as_deref().unwrap_or("-"), + fields = fields_count, travel_entries = travel_entries.len(), ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0), "GET /api/postcode-properties" diff --git a/server-rs/src/routes/properties.rs b/server-rs/src/routes/properties.rs index 1376096..dfbb7a7 100644 --- a/server-rs/src/routes/properties.rs +++ b/server-rs/src/routes/properties.rs @@ -14,8 +14,9 @@ use crate::consts::PROPERTIES_LIMIT; use crate::data::{HistoricalPrice, RenovationEvent}; use crate::licensing::{check_license_bounds, resolve_share_code}; use crate::parsing::{ - cell_for_row_cached, h3_cell_bounds, needs_parent, parse_filters_with_poi, row_passes_filters, - row_passes_poi_filters, validate_h3_resolution, + cell_for_row_cached, h3_cell_bounds, needs_parent, parse_field_indices_with_poi, + parse_filters_with_poi, row_passes_filters, row_passes_poi_filters, validate_h3_resolution, + ParsedFieldIndices, }; use crate::state::{AppState, SharedState}; @@ -30,6 +31,10 @@ pub struct HexagonPropertiesParams { /// Optional min:max applies as a filter (exclude 
properties outside range). pub travel: Option, pub offset: Option, + /// `;;`-separated numeric feature names to include in each property payload. + /// If absent, keeps the legacy behavior and returns all numeric features. + /// If empty, returns only the fixed property card fields. + pub fields: Option, /// Share-link code; grants bbox-scoped access for unlicensed users. pub share: Option, } @@ -106,27 +111,81 @@ fn lookup_enum_value( } } +fn insert_feature_value( + features: &mut FxHashMap, + row: usize, + state: &AppState, + feature_names: &[String], + enum_values: &FxHashMap>, + feat_idx: usize, +) { + if feat_idx >= feature_names.len() || enum_values.contains_key(&feat_idx) { + return; + } + + let value = state.data.get_feature(row, feat_idx); + if value.is_finite() { + features.insert(feature_names[feat_idx].clone(), value); + } +} + +fn insert_poi_metric_value( + features: &mut FxHashMap, + row: usize, + state: &AppState, + metric_idx: usize, +) { + let Some(metric_name) = state.data.poi_metrics.feature_names.get(metric_idx) else { + return; + }; + + let value = state.data.poi_metrics.get_for_property_row(row, metric_idx); + if value.is_finite() { + features.insert(metric_name.clone(), value); + } +} + pub fn build_property( row: usize, state: &AppState, feature_names: &[String], feature_name_to_index: &FxHashMap, enum_values: &FxHashMap>, + field_indices: &ParsedFieldIndices, ) -> Property { let mut features = FxHashMap::default(); - for (feat_idx, feat_name) in feature_names.iter().enumerate() { - if enum_values.contains_key(&feat_idx) { - continue; + + if let Some(indices) = field_indices.normal.as_deref() { + for &feat_idx in indices { + insert_feature_value( + &mut features, + row, + state, + feature_names, + enum_values, + feat_idx, + ); } - let value = state.data.get_feature(row, feat_idx); - if value.is_finite() { - features.insert(feat_name.clone(), value); + } else { + for feat_idx in 0..feature_names.len() { + insert_feature_value( + &mut 
features, + row, + state, + feature_names, + enum_values, + feat_idx, + ); } } - for (metric_idx, metric_name) in state.data.poi_metrics.feature_names.iter().enumerate() { - let value = state.data.poi_metrics.get_for_property_row(row, metric_idx); - if value.is_finite() { - features.insert(metric_name.clone(), value); + + if field_indices.normal.is_some() { + for &metric_idx in &field_indices.poi { + insert_poi_metric_value(&mut features, row, state, metric_idx); + } + } else { + for metric_idx in 0..state.data.poi_metrics.feature_names.len() { + insert_poi_metric_value(&mut features, row, state, metric_idx); } } @@ -241,6 +300,17 @@ pub async fn get_hexagon_properties( let has_poi_filters = !parsed_poi_filters.is_empty(); let travel_entries = parse_optional_travel(params.travel.as_deref()) .map_err(|err| (StatusCode::BAD_REQUEST, err).into_response())?; + let field_indices = parse_field_indices_with_poi( + params.fields.as_deref(), + &state.feature_name_to_index, + &state.data.poi_metrics.name_to_index, + ) + .map_err(|err| (err.0, err.1).into_response())?; + let fields_count = field_indices + .normal + .as_ref() + .map(|indices| (indices.len() + field_indices.poi.len()) as i32) + .unwrap_or(-1); let result = tokio::task::spawn_blocking(move || { let t0 = std::time::Instant::now(); @@ -309,6 +379,7 @@ pub async fn get_hexagon_properties( feature_names, feature_name_to_index, enum_values, + &field_indices, ) }) .collect(); @@ -322,6 +393,7 @@ pub async fn get_hexagon_properties( offset, filters = num_filters, filters_raw = filters_str.as_deref().unwrap_or("-"), + fields = fields_count, travel_entries = travel_entries.len(), ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0), "GET /api/hexagon-properties" diff --git a/server-rs/src/routes/shorten.rs b/server-rs/src/routes/shorten.rs index 18b12af..b4841a0 100644 --- a/server-rs/src/routes/shorten.rs +++ b/server-rs/src/routes/shorten.rs @@ -135,6 +135,7 @@ fn is_allowed_param_key(key: &str) -> bool { | 
"amenityCount5km" | "poi" | "overlay" + | "basemap" | "tab" | "pc" | "tt" @@ -585,6 +586,14 @@ mod tests { ); } + #[test] + fn preserves_basemap_for_share_links() { + let params = + sanitized_query_params("lat=51.5&lon=-0.1&zoom=12&basemap=satellite", false).unwrap(); + + assert_eq!(params, "lat=51.5&lon=-0.1&zoom=12&basemap=satellite"); + } + #[test] fn escapes_html_attributes() { assert_eq!(escape_attr(r#""'><&"#), ""'><&");