Compare commits
9 commits
c4423b6c9a
...
bbc2fcb86c
| Author | SHA1 | Date | |
|---|---|---|---|
| bbc2fcb86c | |||
| 9cd2b8849c | |||
| 3a3e249bdd | |||
| 30055ab870 | |||
| c997ea46a5 | |||
| 160283f1a1 | |||
| da1bf49524 | |||
| 582bc856d8 | |||
| 300209b192 |
49 changed files with 1826 additions and 599 deletions
|
|
@ -37,8 +37,6 @@ NAPTAN := $(DATA_DIR)/naptan.parquet
|
|||
BROADBAND := $(DATA_DIR)/broadband.parquet
|
||||
SCHOOL_PROX := $(DATA_DIR)/school_proximity.parquet
|
||||
RENTAL := $(DATA_DIR)/rental_prices.parquet
|
||||
GEOSURE_DIR := $(DATA_DIR)/geosure
|
||||
GEOSURE := $(DATA_DIR)/geosure.parquet
|
||||
INSPIRE_DIR := $(DATA_DIR)/inspire
|
||||
OA_BOUNDARIES := $(DATA_DIR)/oa_boundaries.gpkg
|
||||
UPRN_LOOKUP := $(DATA_DIR)/uprn_lookup.parquet
|
||||
|
|
@ -46,16 +44,17 @@ PC_BOUNDARIES := $(MANUAL_DATA)/postcode_boundaries
|
|||
TRANSIT_DIR := $(DATA_DIR)/transit
|
||||
TRANSIT_STAMP := $(TRANSIT_DIR)/.done
|
||||
GREENSPACE := $(DATA_DIR)/greenspace_water.parquet
|
||||
OS_GREENSPACE := $(DATA_DIR)/os_greenspace.parquet
|
||||
PBF := $(DATA_DIR)/england-latest.osm.pbf
|
||||
PLACES := $(DATA_DIR)/places.parquet
|
||||
LISTINGS_BUY := $(DATA_DIR)/online_listings_buy.parquet
|
||||
LISTINGS_RENT := $(DATA_DIR)/online_listings_rent.parquet
|
||||
LSOA_POP := $(DATA_DIR)/lsoa_population.parquet
|
||||
MEDIAN_AGE := $(DATA_DIR)/median_age.parquet
|
||||
ENGLAND_BOUNDARY := $(DATA_DIR)/england_boundary.geojson
|
||||
RM_OUTCODES := frontend/src/lib/rightmove-outcodes.json
|
||||
|
||||
# Sentinel files for directory targets (Make doesn't track directories well)
|
||||
GEOSURE_STAMP := $(GEOSURE_DIR)/.done
|
||||
INSPIRE_STAMP := $(INSPIRE_DIR)/.done
|
||||
|
||||
PMTILES_VERSION := 1.22.3
|
||||
|
|
@ -65,10 +64,10 @@ PMTILES_VERSION := 1.22.3
|
|||
.PHONY: prepare merge tiles \
|
||||
download-arcgis download-price-paid download-deprivation download-ethnicity \
|
||||
download-naptan download-pois download-ofsted download-broadband download-rental-prices \
|
||||
download-postcodes download-geosure download-noise download-inspire \
|
||||
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places download-lsoa-population download-england-boundary download-rightmove-outcodes \
|
||||
download-postcodes download-noise download-inspire \
|
||||
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-os-greenspace download-pbf download-places download-lsoa-population download-median-age download-england-boundary download-rightmove-outcodes \
|
||||
transform-pois transform-epc-pp transform-crime transform-poi-proximity \
|
||||
transform-school-proximity transform-geosure transform-postcode-boundaries \
|
||||
transform-school-proximity transform-postcode-boundaries \
|
||||
generate-postcode-boundaries
|
||||
|
||||
prepare: $(PRICES_STAMP)
|
||||
|
|
@ -83,7 +82,6 @@ download-pois: $(POIS_RAW)
|
|||
download-ofsted: $(OFSTED)
|
||||
download-broadband: $(BROADBAND)
|
||||
download-postcodes: $(POSTCODES)
|
||||
download-geosure: $(GEOSURE_STAMP)
|
||||
download-rental-prices: $(RENTAL)
|
||||
download-noise: $(NOISE)
|
||||
download-inspire: $(INSPIRE_STAMP)
|
||||
|
|
@ -91,9 +89,11 @@ download-oa-boundaries: $(OA_BOUNDARIES)
|
|||
download-uprn-lookup: $(UPRN_LOOKUP)
|
||||
download-transit-network: $(TRANSIT_STAMP)
|
||||
download-greenspace: $(GREENSPACE)
|
||||
download-os-greenspace: $(OS_GREENSPACE)
|
||||
download-pbf: $(PBF)
|
||||
download-places: $(PLACES)
|
||||
download-lsoa-population: $(LSOA_POP)
|
||||
download-median-age: $(MEDIAN_AGE)
|
||||
download-england-boundary: $(ENGLAND_BOUNDARY)
|
||||
download-rightmove-outcodes: $(RM_OUTCODES)
|
||||
transform-pois: $(POIS_FILTERED)
|
||||
|
|
@ -101,7 +101,6 @@ transform-epc-pp: $(EPC_PP)
|
|||
transform-crime: $(CRIME)
|
||||
transform-poi-proximity: $(POI_PROXIMITY)
|
||||
transform-school-proximity: $(SCHOOL_PROX)
|
||||
transform-geosure: $(GEOSURE)
|
||||
transform-postcode-boundaries: $(PC_BOUNDARIES)
|
||||
generate-postcode-boundaries: $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP)
|
||||
uv run python -m pipeline.transform.postcode_boundaries \
|
||||
|
|
@ -158,10 +157,6 @@ $(BROADBAND):
|
|||
$(POSTCODES):
|
||||
uv run python -m pipeline.download.postcodes --output $@
|
||||
|
||||
$(GEOSURE_STAMP):
|
||||
uv run python -m pipeline.download.geosure --output $(GEOSURE_DIR)
|
||||
@touch $@
|
||||
|
||||
$(NOISE): $(ARCGIS)
|
||||
uv run python -m pipeline.download.noise --arcgis $(ARCGIS) --output $@
|
||||
|
||||
|
|
@ -185,12 +180,19 @@ $(RENTAL):
|
|||
$(GREENSPACE): $(PBF)
|
||||
uv run python -m pipeline.download.greenspace_water --output $@ --pbf $(PBF)
|
||||
|
||||
$(OS_GREENSPACE):
|
||||
uv run python -m pipeline.download.os_greenspace --output $@
|
||||
|
||||
$(PLACES): $(PBF) $(ENGLAND_BOUNDARY)
|
||||
uv run python -m pipeline.download.places --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY)
|
||||
|
||||
$(LSOA_POP):
|
||||
uv run python -m pipeline.download.lsoa_population --output $@
|
||||
|
||||
|
||||
$(MEDIAN_AGE):
|
||||
uv run python -m pipeline.download.median_age --output $@
|
||||
|
||||
$(ENGLAND_BOUNDARY):
|
||||
uv run python -m pipeline.download.england_boundary --output $@
|
||||
|
||||
|
|
@ -216,15 +218,12 @@ $(CRIME):
|
|||
fi
|
||||
uv run python -m pipeline.transform.crime --input $(CRIME_DIR) --output $@
|
||||
|
||||
$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED)
|
||||
uv run python -m pipeline.transform.poi_proximity --arcgis $(ARCGIS) --pois $(POIS_FILTERED) --output $@
|
||||
$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) $(OS_GREENSPACE)
|
||||
uv run python -m pipeline.transform.poi_proximity --arcgis $(ARCGIS) --pois $(POIS_FILTERED) --greenspace $(OS_GREENSPACE) --output $@
|
||||
|
||||
$(SCHOOL_PROX): $(OFSTED) $(ARCGIS)
|
||||
uv run python -m pipeline.transform.school_proximity --ofsted $(OFSTED) --arcgis $(ARCGIS) --output $@
|
||||
|
||||
$(GEOSURE): $(GEOSURE_STAMP) $(ARCGIS)
|
||||
uv run python -m pipeline.transform.transform_geosure --geosure $(GEOSURE_DIR) --arcgis $(ARCGIS) --output $@
|
||||
|
||||
# Postcode boundaries require manual generation — fail with instructions
|
||||
$(PC_BOUNDARIES):
|
||||
@echo ""
|
||||
|
|
@ -243,7 +242,7 @@ $(PC_BOUNDARIES):
|
|||
# ── Final merge → postcode.parquet + properties.parquet ──────────────────────
|
||||
|
||||
$(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
|
||||
$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(GEOSURE) $(RENTAL) $(LSOA_POP)
|
||||
$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(RENTAL) $(LSOA_POP) $(MEDIAN_AGE)
|
||||
uv run python -m pipeline.transform.merge \
|
||||
--epc-pp $(EPC_PP) \
|
||||
--arcgis $(ARCGIS) \
|
||||
|
|
@ -254,9 +253,9 @@ $(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
|
|||
--noise $(NOISE) \
|
||||
--school-proximity $(SCHOOL_PROX) \
|
||||
--broadband $(BROADBAND) \
|
||||
--geosure $(GEOSURE) \
|
||||
--rental-prices $(RENTAL) \
|
||||
--lsoa-population $(LSOA_POP) \
|
||||
--median-age $(MEDIAN_AGE) \
|
||||
--output-postcodes $(POSTCODES_PQ) \
|
||||
--output-properties $(PROPERTIES_PQ)
|
||||
@touch $@
|
||||
|
|
|
|||
|
|
@ -9,6 +9,12 @@ DELAY_BETWEEN_OUTCODES = 0.5
|
|||
MAX_RETRIES = 3
|
||||
RETRY_BASE_DELAY = 2.0
|
||||
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
|
||||
MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsing errors
|
||||
# Rent sanity bounds (monthly). Rents outside this range are nulled out — they are
|
||||
# almost always total-stay pricing (e.g. "Golf Open 2026" short lets), annual rents
|
||||
# mislabelled as monthly, or data errors.
|
||||
MIN_RENT_MONTHLY = 50 # below £50/month is implausible for any UK property
|
||||
MAX_RENT_MONTHLY = 25_000 # above £25k/month covers ultra-prime London; higher is suspect
|
||||
SEED = 42
|
||||
CHECKPOINT_INTERVAL = int(os.environ.get("CHECKPOINT_INTERVAL", "900")) # seconds
|
||||
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ from constants import (
|
|||
HOMECOUK_API_BASE,
|
||||
HOMECOUK_BASE,
|
||||
HOMECOUK_PER_PAGE,
|
||||
MAX_BEDROOMS,
|
||||
PROPERTY_TYPE_MAP,
|
||||
RETRY_BASE_DELAY,
|
||||
)
|
||||
|
|
@ -25,6 +26,7 @@ from metrics import (
|
|||
homecouk_requests_total,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import validate_floor_area
|
||||
|
||||
log = logging.getLogger("homecouk")
|
||||
|
||||
|
|
@ -216,10 +218,57 @@ def parse_floor_area(description: str | None) -> float | None:
|
|||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return round(sqft * 0.092903, 1)
|
||||
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
|
||||
if m:
|
||||
return round(float(m.group(1).replace(",", "")), 1)
|
||||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||
return None
|
||||
|
||||
|
||||
def parse_tenure(prop: dict) -> str | None:
|
||||
"""Extract tenure from home.co.uk property data.
|
||||
|
||||
Checks multiple sources in priority order:
|
||||
1. Dedicated 'tenure' or 'tenure_type' field in the API response
|
||||
2. Free-text search in the description for 'freehold' / 'leasehold'
|
||||
3. Free-text search in features lists
|
||||
|
||||
home.co.uk aggregates listings from estate agents, so tenure is often
|
||||
embedded in the description text rather than a structured field.
|
||||
"""
|
||||
# 1. Check dedicated tenure fields (in case the API adds them)
|
||||
for key in ("tenure", "tenure_type", "tenureType"):
|
||||
val = prop.get(key)
|
||||
if val and isinstance(val, str):
|
||||
lower = val.lower().strip()
|
||||
if "leasehold" in lower:
|
||||
return "Leasehold"
|
||||
if "freehold" in lower:
|
||||
return "Freehold"
|
||||
|
||||
# 2. Check description text — estate agents often include tenure here
|
||||
description = prop.get("description") or ""
|
||||
if description:
|
||||
lower_desc = description.lower()
|
||||
if re.search(r"\bleasehold\b", lower_desc):
|
||||
return "Leasehold"
|
||||
if re.search(r"\bfreehold\b", lower_desc):
|
||||
# Matches "Freehold" and "Share of Freehold" (both = freehold ownership)
|
||||
return "Freehold"
|
||||
|
||||
# 3. Check features / key_features lists if present
|
||||
for key in ("features", "key_features", "keyFeatures"):
|
||||
features = prop.get(key)
|
||||
if features and isinstance(features, list):
|
||||
for feat in features:
|
||||
if not isinstance(feat, str):
|
||||
continue
|
||||
lower_feat = feat.lower()
|
||||
if "leasehold" in lower_feat:
|
||||
return "Leasehold"
|
||||
if "freehold" in lower_feat:
|
||||
return "Freehold"
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -267,7 +316,7 @@ def transform_property(
|
|||
return None
|
||||
|
||||
price = prop.get("price") or prop.get("latest_price")
|
||||
if not price:
|
||||
if not price or int(price) <= 0:
|
||||
return None
|
||||
|
||||
# Home.co.uk provides postcodes directly, but fall back to spatial index
|
||||
|
|
@ -278,8 +327,16 @@ def transform_property(
|
|||
log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
|
||||
return None
|
||||
|
||||
bedrooms = prop.get("bedrooms", 0) or 0
|
||||
bathrooms = prop.get("bathrooms", 0) or 0
|
||||
raw_beds = prop.get("bedrooms", 0) or 0
|
||||
raw_baths = prop.get("bathrooms", 0) or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"home.co.uk %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
prop.get("listing_id") or prop.get("property_id") or "?",
|
||||
raw_beds, raw_baths,
|
||||
)
|
||||
|
||||
listing_type = prop.get("listing_property_type") or prop.get("property_type") or ""
|
||||
address = prop.get("display_address") or prop.get("address") or ""
|
||||
|
|
@ -304,7 +361,7 @@ def transform_property(
|
|||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Address per Property Register": address,
|
||||
"Leasehold/Freehold": None, # not available from home.co.uk
|
||||
"Leasehold/Freehold": parse_tenure(prop),
|
||||
"Property type": map_property_type(listing_type),
|
||||
"Property sub-type": listing_type or "Unknown",
|
||||
"price": int(price),
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ from playwright.sync_api import sync_playwright
|
|||
|
||||
from constants import (
|
||||
DELAY_BETWEEN_PAGES,
|
||||
MAX_BEDROOMS,
|
||||
OPENRENT_BASE,
|
||||
PROPERTY_TYPE_MAP,
|
||||
RETRY_BASE_DELAY,
|
||||
|
|
@ -45,6 +46,7 @@ from metrics import (
|
|||
openrent_requests_total,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import validate_floor_area
|
||||
|
||||
log = logging.getLogger("openrent")
|
||||
|
||||
|
|
@ -607,10 +609,10 @@ def parse_floor_area(description: str | None) -> float | None:
|
|||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return round(sqft * 0.092903, 1)
|
||||
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
|
||||
if m:
|
||||
return round(float(m.group(1).replace(",", "")), 1)
|
||||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -651,7 +653,7 @@ def transform_property(
|
|||
lat = detail.get("lat") or search_data.get("lat")
|
||||
lng = detail.get("lng") or search_data.get("lng")
|
||||
price = detail.get("price") or search_data.get("price")
|
||||
if not price:
|
||||
if not price or int(price) <= 0:
|
||||
return None
|
||||
|
||||
frequency = search_data.get("frequency", "monthly")
|
||||
|
|
@ -701,8 +703,15 @@ def transform_property(
|
|||
log.debug("No postcode for property — skipping")
|
||||
return None
|
||||
|
||||
bedrooms = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
|
||||
bathrooms = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
|
||||
raw_beds = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
|
||||
raw_baths = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"OpenRent %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
search_data.get("id", "?"), raw_beds, raw_baths,
|
||||
)
|
||||
|
||||
# Title: prefer detail page (has h1 with full title)
|
||||
title = detail.get("title") or search_data.get("title", "")
|
||||
|
|
@ -746,6 +755,9 @@ def transform_property(
|
|||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Address per Property Register": address,
|
||||
# OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a
|
||||
# property ownership concept that doesn't apply to rental listings. The
|
||||
# landlord's tenure is not shown on OpenRent listing pages.
|
||||
"Leasehold/Freehold": None,
|
||||
"Property type": map_property_type(property_type),
|
||||
"Property sub-type": property_type or "Unknown",
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ from pathlib import Path
|
|||
|
||||
import polars as pl
|
||||
|
||||
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
|
||||
from transform import normalize_price
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
|
@ -18,6 +19,30 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
log.warning("No properties to write to %s", path)
|
||||
return
|
||||
|
||||
# Sanitize bedroom/bathroom counts — values above MAX_BEDROOMS are
|
||||
# almost certainly prices or other numeric fields mis-parsed as bedrooms.
|
||||
bad_count = 0
|
||||
for p in properties:
|
||||
for key in ("Bedrooms", "Bathrooms"):
|
||||
val = p.get(key, 0) or 0
|
||||
if val > MAX_BEDROOMS:
|
||||
bad_count += 1
|
||||
p[key] = None
|
||||
# Recompute derived field after sanitization
|
||||
beds = p.get("Bedrooms")
|
||||
baths = p.get("Bathrooms")
|
||||
if beds is None or baths is None:
|
||||
p["Number of bedrooms & living rooms"] = None
|
||||
else:
|
||||
p["Number of bedrooms & living rooms"] = beds + baths
|
||||
|
||||
if bad_count:
|
||||
log.warning(
|
||||
"Sanitized %d properties with bedroom/bathroom counts > %d (set to null)",
|
||||
bad_count,
|
||||
MAX_BEDROOMS,
|
||||
)
|
||||
|
||||
# Parse first_visible_date to datetime
|
||||
listing_dates = []
|
||||
for p in properties:
|
||||
|
|
@ -36,15 +61,33 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
listing_dates.append(None)
|
||||
|
||||
# Derive asking price / asking rent based on channel
|
||||
# Zero prices indicate parsing failures or POA/auction listings — treat as null
|
||||
if channel == "buy":
|
||||
asking_prices = [p["price"] for p in properties]
|
||||
asking_prices = [p["price"] if p["price"] > 0 else None for p in properties]
|
||||
asking_rents = [None] * len(properties)
|
||||
listing_statuses = ["For sale"] * len(properties)
|
||||
else:
|
||||
asking_prices = [None] * len(properties)
|
||||
asking_rents = [
|
||||
normalize_price(p["price"], p["price_frequency"]) for p in properties
|
||||
]
|
||||
# Normalize to monthly, then apply sanity bounds. Rents outside
|
||||
# [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always total-stay
|
||||
# pricing (short lets), annual rents mislabelled as monthly, or £0
|
||||
# placeholders — null them out rather than polluting aggregates.
|
||||
rent_outliers = 0
|
||||
asking_rents = []
|
||||
for p in properties:
|
||||
monthly = normalize_price(p["price"], p["price_frequency"])
|
||||
if monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY:
|
||||
rent_outliers += 1
|
||||
asking_rents.append(None)
|
||||
else:
|
||||
asking_rents.append(monthly)
|
||||
if rent_outliers:
|
||||
log.warning(
|
||||
"Nulled %d rent outliers outside [£%d, £%d]/month",
|
||||
rent_outliers,
|
||||
MIN_RENT_MONTHLY,
|
||||
MAX_RENT_MONTHLY,
|
||||
)
|
||||
listing_statuses = ["For rent"] * len(properties)
|
||||
|
||||
df = pl.DataFrame(
|
||||
|
|
|
|||
|
|
@ -1,12 +1,31 @@
|
|||
import logging
|
||||
import re
|
||||
|
||||
from constants import PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
|
||||
from constants import MAX_BEDROOMS, PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
|
||||
from spatial import PostcodeSpatialIndex
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
# Maximum plausible floor area for a residential property listing (sqm).
|
||||
# ~21,500 sq ft — covers even the largest UK mansions.
|
||||
MAX_FLOOR_AREA_SQM = 2000.0
|
||||
|
||||
|
||||
def validate_floor_area(sqm: float | None) -> float | None:
|
||||
"""Validate a floor area value. Returns None for nonsensical values.
|
||||
|
||||
Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM,
|
||||
which catches parsing errors where prices or other large numbers are
|
||||
mistakenly extracted as floor area from free-text descriptions or DOM text.
|
||||
"""
|
||||
if sqm is None:
|
||||
return None
|
||||
if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM:
|
||||
return None
|
||||
return sqm
|
||||
|
||||
|
||||
def parse_display_size(display_size: str | None) -> float | None:
|
||||
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
|
||||
if not display_size:
|
||||
|
|
@ -15,11 +34,11 @@ def parse_display_size(display_size: str | None) -> float | None:
|
|||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return round(sqft * 0.092903, 1)
|
||||
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||
# Try sq. m.
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
|
||||
if m:
|
||||
return round(float(m.group(1).replace(",", "")), 1)
|
||||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -92,19 +111,34 @@ def transform_property(
|
|||
|
||||
price_obj = prop.get("price", {})
|
||||
amount = price_obj.get("amount")
|
||||
if amount is None:
|
||||
if not amount:
|
||||
return None
|
||||
frequency = price_obj.get("frequency", "")
|
||||
price = normalize_price(int(amount), frequency)
|
||||
# Store raw price — normalization to monthly happens once in storage.py
|
||||
price = int(amount)
|
||||
if price <= 0:
|
||||
return None
|
||||
|
||||
display_prices = price_obj.get("displayPrices", [])
|
||||
price_qualifier = (
|
||||
display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
|
||||
)
|
||||
|
||||
# POA / Auction listings have unreliable prices — treat as no price
|
||||
pq_lower = price_qualifier.lower()
|
||||
if "poa" in pq_lower or "auction" in pq_lower:
|
||||
return None
|
||||
|
||||
sub_type = prop.get("propertySubType", "")
|
||||
bedrooms = prop.get("bedrooms", 0) or 0
|
||||
bathrooms = prop.get("bathrooms", 0) or 0
|
||||
raw_beds = prop.get("bedrooms", 0) or 0
|
||||
raw_baths = prop.get("bathrooms", 0) or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"Rightmove %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
prop.get("id", "?"), raw_beds, raw_baths,
|
||||
)
|
||||
|
||||
key_features = [
|
||||
kf.get("description", "")
|
||||
|
|
|
|||
|
|
@ -26,9 +26,10 @@ import logging
|
|||
import re
|
||||
import time
|
||||
|
||||
from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import validate_floor_area
|
||||
|
||||
log = logging.getLogger("zoopla")
|
||||
|
||||
|
|
@ -94,15 +95,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
|
||||
|
||||
let tenure = '';
|
||||
if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
||||
price_text: priceText.trim(),
|
||||
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure,
|
||||
});
|
||||
|
|
@ -137,7 +139,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
const text = card.innerText || '';
|
||||
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
||||
|
||||
const priceMatch = text.match(/\u00a3([\d,]+)/);
|
||||
const priceEl2 = card.querySelector('[data-testid="listing-price"]');
|
||||
const priceText2 = priceEl2 ? priceEl2.innerText : text;
|
||||
const priceMatch = priceText2.match(/\u00a3([\d,]+)/);
|
||||
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
||||
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
||||
const recMatch = text.match(/(\d+)\s*reception/i);
|
||||
|
|
@ -153,15 +157,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
}
|
||||
|
||||
let tenure = '';
|
||||
if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
||||
price_text: priceText2.trim(),
|
||||
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure,
|
||||
});
|
||||
|
|
@ -597,6 +602,21 @@ def _map_property_type(raw_type: str | None) -> str:
|
|||
return "Other"
|
||||
|
||||
|
||||
def _detect_rent_frequency(price_text: str) -> str:
|
||||
"""Detect rent frequency from Zoopla price text.
|
||||
|
||||
Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
|
||||
'£18,000 pa'. Defaults to 'monthly' if no frequency indicator found.
|
||||
"""
|
||||
lower = price_text.lower()
|
||||
if "pw" in lower or "per week" in lower or "/w" in lower:
|
||||
return "weekly"
|
||||
if "pa" in lower or "per annum" in lower or "/y" in lower or "per year" in lower:
|
||||
return "yearly"
|
||||
# pcm, per month, /m, or no indicator — default monthly
|
||||
return "monthly"
|
||||
|
||||
|
||||
def transform_property(
|
||||
raw: dict,
|
||||
channel: str,
|
||||
|
|
@ -608,7 +628,7 @@ def transform_property(
|
|||
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
||||
from postcodes extracted from the address text."""
|
||||
price = raw.get("price")
|
||||
if not price:
|
||||
if not price or int(price) <= 0:
|
||||
return None
|
||||
|
||||
address = raw.get("address", "")
|
||||
|
|
@ -647,21 +667,35 @@ def transform_property(
|
|||
if not (49 <= lat <= 56 and -7 <= lng <= 2):
|
||||
return None
|
||||
|
||||
bedrooms = raw.get("beds") or 0
|
||||
bathrooms = raw.get("baths") or 0
|
||||
raw_beds = raw.get("beds") or 0
|
||||
raw_baths = raw.get("baths") or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
raw.get("id", "?"), raw_beds, raw_baths,
|
||||
)
|
||||
receptions = raw.get("receptions") or 0
|
||||
|
||||
# Floor area: convert sq ft to sq m
|
||||
floor_area_sqm = None
|
||||
sqft = raw.get("floor_area_sqft")
|
||||
if sqft:
|
||||
floor_area_sqm = round(sqft * 0.092903, 1)
|
||||
floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1))
|
||||
|
||||
listing_id = raw.get("id", "")
|
||||
listing_url = raw.get("url", "")
|
||||
if listing_url and not listing_url.startswith("http"):
|
||||
listing_url = ZOOPLA_BASE + listing_url
|
||||
|
||||
# Detect rent frequency from price text (e.g. "£1,500 pcm" vs "£350 pw")
|
||||
if channel == "BUY":
|
||||
frequency = ""
|
||||
else:
|
||||
price_text = raw.get("price_text", "")
|
||||
frequency = _detect_rent_frequency(price_text)
|
||||
|
||||
return {
|
||||
"id": f"zp_{listing_id}",
|
||||
"Bedrooms": bedrooms,
|
||||
|
|
@ -675,7 +709,7 @@ def transform_property(
|
|||
"Property type": "Other", # Not reliably extractable from Zoopla search cards
|
||||
"Property sub-type": "",
|
||||
"price": int(price),
|
||||
"price_frequency": "" if channel == "BUY" else "monthly",
|
||||
"price_frequency": frequency,
|
||||
"Price qualifier": "",
|
||||
"Total floor area (sqm)": floor_area_sqm,
|
||||
"Listing URL": listing_url,
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ import { useSavedProperties } from './hooks/useSavedProperties';
|
|||
declare global {
|
||||
interface Window {
|
||||
__screenshot_ready?: boolean;
|
||||
__map_idle?: boolean;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -69,6 +69,14 @@ const DATA_SOURCES = [
|
|||
url: 'https://download.geofabrik.de/europe/great-britain-latest.osm.pbf',
|
||||
license: 'Open Data Commons Open Database License (ODbL)',
|
||||
},
|
||||
{
|
||||
id: 'os-open-greenspace',
|
||||
name: 'OS Open Greenspace',
|
||||
origin: 'Ordnance Survey',
|
||||
use: 'Authoritative green space boundaries for Great Britain, including public parks, gardens, playing fields, and play spaces. Polygon centroids are used for park proximity counts and distance-to-nearest-park calculations.',
|
||||
url: 'https://osdatahub.os.uk/downloads/open/OpenGreenspace',
|
||||
license: 'Open Government Licence v3.0',
|
||||
},
|
||||
{
|
||||
id: 'naptan',
|
||||
name: 'NaPTAN (Public Transport Stops)',
|
||||
|
|
@ -101,14 +109,6 @@ const DATA_SOURCES = [
|
|||
url: 'https://www.ofcom.org.uk/phones-and-broadband/coverage-and-speeds/connected-nations-20252/data-downloads-2025',
|
||||
license: 'Open Government Licence v3.0',
|
||||
},
|
||||
{
|
||||
id: 'geosure',
|
||||
name: 'GeoSure Ground Stability',
|
||||
origin: 'Ordnance Survey',
|
||||
use: 'Ground stability hazard ratings on a 5km hex grid covering Great Britain. Six risk categories (collapsible deposits, compressible ground, landslides, running sand, shrink-swell, and soluble rocks) rated Low, Moderate, or Significant. Spatial-joined to postcodes via centroid intersection.',
|
||||
url: 'https://osdatahub.os.uk/downloads/open/GeoSure',
|
||||
license: 'Open Government Licence v3.0',
|
||||
},
|
||||
{
|
||||
id: 'council-tax',
|
||||
name: 'Council Tax Levels 2025-26',
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import { memo, useState, useCallback, useEffect, useRef } from 'react';
|
||||
import { SpinnerIcon } from '../ui/icons/SpinnerIcon';
|
||||
import { SparklesIcon } from '../ui/icons/SparklesIcon';
|
||||
import { ChevronIcon } from '../ui/icons/ChevronIcon';
|
||||
import type { AiFilterErrorType } from '../../hooks/useAiFilters';
|
||||
|
||||
const EXAMPLE_QUERIES = [
|
||||
|
|
@ -13,6 +14,7 @@ const LOADING_MESSAGES = [
|
|||
'Analysing your query...',
|
||||
'Searching for destinations...',
|
||||
'Generating filters...',
|
||||
'Refining results...',
|
||||
];
|
||||
|
||||
/** Cycle through loading messages to show progress. */
|
||||
|
|
@ -28,9 +30,11 @@ function useLoadingMessage(loading: boolean): string {
|
|||
// Advance message every 1.5s
|
||||
timerRef.current = setTimeout(() => setIndex(1), 1500);
|
||||
const t2 = setTimeout(() => setIndex(2), 3500);
|
||||
const t3 = setTimeout(() => setIndex(3), 5500);
|
||||
return () => {
|
||||
clearTimeout(timerRef.current);
|
||||
clearTimeout(t2);
|
||||
clearTimeout(t3);
|
||||
};
|
||||
}, [loading]);
|
||||
|
||||
|
|
@ -62,18 +66,45 @@ export default memo(function AiFilterInput({
|
|||
const [expanded, setExpanded] = useState(false);
|
||||
const loadingMessage = useLoadingMessage(loading);
|
||||
const containerRef = useRef<HTMLDivElement>(null);
|
||||
const textareaRef = useRef<HTMLTextAreaElement>(null);
|
||||
|
||||
const queryRef = useRef(query);
|
||||
queryRef.current = query;
|
||||
|
||||
useEffect(() => {
|
||||
if (!expanded || loading) return;
|
||||
const handler = (e: MouseEvent) => {
|
||||
if (containerRef.current && !containerRef.current.contains(e.target as Node)) {
|
||||
setExpanded(false);
|
||||
if (!queryRef.current.trim()) setExpanded(false);
|
||||
}
|
||||
};
|
||||
document.addEventListener('mousedown', handler);
|
||||
return () => document.removeEventListener('mousedown', handler);
|
||||
}, [expanded, loading]);
|
||||
|
||||
const resizeTextarea = useCallback(() => {
|
||||
const ta = textareaRef.current;
|
||||
if (!ta) return;
|
||||
ta.style.height = 'auto';
|
||||
ta.style.height = `${ta.scrollHeight}px`;
|
||||
}, []);
|
||||
|
||||
const handleKeyDown = useCallback(
|
||||
(e: React.KeyboardEvent) => {
|
||||
if (e.key === 'Enter' && !e.shiftKey) {
|
||||
e.preventDefault();
|
||||
const trimmed = query.trim();
|
||||
if (!trimmed || loading) return;
|
||||
if (!isLoggedIn) {
|
||||
onLoginRequired();
|
||||
return;
|
||||
}
|
||||
onSubmit(trimmed);
|
||||
}
|
||||
},
|
||||
[query, loading, isLoggedIn, onLoginRequired, onSubmit]
|
||||
);
|
||||
|
||||
const handleSubmit = useCallback(
|
||||
(e: React.FormEvent) => {
|
||||
e.preventDefault();
|
||||
|
|
@ -129,14 +160,27 @@ export default memo(function AiFilterInput({
|
|||
<span className="text-xs text-warm-400 dark:text-warm-500">
|
||||
describe what you're looking for
|
||||
</span>
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => setExpanded(false)}
|
||||
className="ml-auto text-warm-400 dark:text-warm-500 hover:text-warm-600 dark:hover:text-warm-300 cursor-pointer"
|
||||
>
|
||||
<ChevronIcon direction="up" className="w-3.5 h-3.5" />
|
||||
</button>
|
||||
</div>
|
||||
<form onSubmit={handleSubmit} className="flex items-center gap-1.5">
|
||||
<input
|
||||
type="text"
|
||||
<form onSubmit={handleSubmit} className="flex items-end gap-1.5">
|
||||
<textarea
|
||||
ref={textareaRef}
|
||||
value={query}
|
||||
onChange={(e) => setQuery(e.target.value)}
|
||||
onChange={(e) => {
|
||||
setQuery(e.target.value);
|
||||
resizeTextarea();
|
||||
}}
|
||||
onKeyDown={handleKeyDown}
|
||||
placeholder="e.g. quiet area, under 400k, near good schools..."
|
||||
className="flex-1 px-2.5 py-1.5 text-sm rounded-lg border border-warm-200 dark:border-warm-700 bg-warm-50 dark:bg-warm-800 text-warm-700 dark:text-warm-200 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-2 focus:ring-teal-400 focus:bg-white dark:focus:bg-warm-800"
|
||||
className="flex-1 px-2.5 py-1.5 text-sm rounded-lg border border-warm-200 dark:border-warm-700 bg-warm-50 dark:bg-warm-800 text-warm-700 dark:text-warm-200 placeholder-warm-400 dark:placeholder-warm-500 focus:outline-none focus:ring-2 focus:ring-teal-400 focus:bg-white dark:focus:bg-warm-800 resize-none overflow-hidden"
|
||||
rows={1}
|
||||
style={{ maxHeight: '6rem' }}
|
||||
disabled={loading}
|
||||
autoFocus
|
||||
/>
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
import { useMemo, useState } from 'react';
|
||||
import { useCollapsibleGroups } from '../../hooks/useCollapsibleGroups';
|
||||
import type {
|
||||
FeatureFilters,
|
||||
FeatureMeta,
|
||||
|
|
@ -38,6 +37,8 @@ interface AreaPaneProps {
|
|||
filters: FeatureFilters;
|
||||
onNavigateToSource?: (slug: string, featureName: string) => void;
|
||||
travelTimeEntries?: TravelTimeEntry[];
|
||||
isGroupExpanded: (name: string) => boolean;
|
||||
onToggleGroup: (name: string) => void;
|
||||
}
|
||||
|
||||
export default function AreaPane({
|
||||
|
|
@ -52,11 +53,12 @@ export default function AreaPane({
|
|||
filters,
|
||||
onNavigateToSource,
|
||||
travelTimeEntries,
|
||||
isGroupExpanded,
|
||||
onToggleGroup,
|
||||
}: AreaPaneProps) {
|
||||
const propertyCount = isPostcode && postcodeData ? postcodeData.properties.count : stats?.count;
|
||||
const featureGroups = useMemo(() => groupFeaturesByCategory(globalFeatures), [globalFeatures]);
|
||||
const [infoFeature, setInfoFeature] = useState<FeatureMeta | null>(null);
|
||||
const [collapsedGroups, toggleGroup] = useCollapsibleGroups();
|
||||
|
||||
const numericByName = useMemo(() => {
|
||||
if (!stats) return new Map();
|
||||
|
|
@ -165,17 +167,17 @@ export default function AreaPane({
|
|||
) ?? []
|
||||
);
|
||||
|
||||
const isExpanded = !collapsedGroups.has(group.name);
|
||||
const expanded = isGroupExpanded(group.name);
|
||||
|
||||
return (
|
||||
<div key={group.name}>
|
||||
<CollapsibleGroupHeader
|
||||
name={group.name}
|
||||
expanded={isExpanded}
|
||||
onToggle={() => toggleGroup(group.name)}
|
||||
expanded={expanded}
|
||||
onToggle={() => onToggleGroup(group.name)}
|
||||
className="px-3 py-2.5 text-sm font-bold text-warm-500 bg-warm-50 dark:bg-warm-900 dark:text-warm-400 sticky top-0 z-10 hover:bg-warm-100 dark:hover:bg-warm-800"
|
||||
/>
|
||||
{isExpanded && (
|
||||
{expanded && (
|
||||
<div className="px-3 py-2 space-y-3">
|
||||
{stackedCharts
|
||||
? stackedCharts.map((chart) => {
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ export default function FeatureBrowser({
|
|||
const [search, setSearch] = useState('');
|
||||
const [infoFeature, setInfoFeature] = useState<FeatureMeta | null>(null);
|
||||
const [travelInfoMode, setTravelInfoMode] = useState<TransportMode | null>(null);
|
||||
const [expandedGroups, toggleGroup] = useCollapsibleGroups();
|
||||
const [isGroupExpanded, toggleGroup] = useCollapsibleGroups(true);
|
||||
const availableTravelModes = useTravelModes();
|
||||
|
||||
useEffect(() => {
|
||||
|
|
@ -106,7 +106,7 @@ export default function FeatureBrowser({
|
|||
</div>
|
||||
<div className="md:min-h-0 md:flex-1 md:overflow-y-auto flex flex-col">
|
||||
{mergedGrouped.map((group) => {
|
||||
const isExpanded = isSearching || expandedGroups.has(group.name);
|
||||
const isExpanded = isSearching || isGroupExpanded(group.name);
|
||||
return (
|
||||
<div key={group.name} className="shrink-0">
|
||||
<CollapsibleGroupHeader
|
||||
|
|
|
|||
|
|
@ -182,7 +182,7 @@ interface FiltersProps {
|
|||
travelTimeEntries: TravelTimeEntry[];
|
||||
onTravelTimeAddEntry: (mode: TransportMode) => void;
|
||||
onTravelTimeRemoveEntry: (index: number) => void;
|
||||
onTravelTimeSetDestination: (index: number, slug: string, label: string) => void;
|
||||
onTravelTimeSetDestination: (index: number, slug: string, label: string, lat: number, lon: number) => void;
|
||||
onTravelTimeRangeChange: (index: number, range: [number, number]) => void;
|
||||
onTravelTimeDragEnd: (index: number) => void;
|
||||
onTravelTimeToggleBest: (index: number) => void;
|
||||
|
|
@ -475,7 +475,7 @@ export default memo(function Filters({
|
|||
isActive={activeFeature === travelFieldKey(entry)}
|
||||
dragValue={activeFeature === travelFieldKey(entry) ? dragValue : null}
|
||||
onTogglePin={() => onTogglePin(travelFieldKey(entry))}
|
||||
onSetDestination={(slug, label) => onTravelTimeSetDestination(index, slug, label)}
|
||||
onSetDestination={(slug, label, lat, lon) => onTravelTimeSetDestination(index, slug, label, lat, lon)}
|
||||
onTimeRangeChange={(range) => onTravelTimeRangeChange(index, range)}
|
||||
onDragStart={() => onDragStart(travelFieldKey(entry))}
|
||||
onDragChange={onDragChange}
|
||||
|
|
|
|||
|
|
@ -108,7 +108,7 @@ export default function LocationSearch({
|
|||
<button
|
||||
type="button"
|
||||
onClick={() => setExpanded(true)}
|
||||
className="absolute top-3 left-3 z-10 p-2 bg-white dark:bg-warm-800 rounded shadow-lg"
|
||||
className="p-2 bg-white dark:bg-warm-800 rounded shadow-lg pointer-events-auto"
|
||||
aria-label="Search places or postcodes"
|
||||
>
|
||||
<SearchIcon className="w-5 h-5 text-warm-600 dark:text-warm-300" />
|
||||
|
|
@ -120,7 +120,7 @@ export default function LocationSearch({
|
|||
<div
|
||||
ref={containerRef}
|
||||
data-tutorial="search"
|
||||
className="absolute top-3 left-3 z-10 flex flex-col"
|
||||
className="flex flex-col pointer-events-auto"
|
||||
onMouseEnter={onMouseEnter}
|
||||
>
|
||||
<div className="flex items-center shadow-lg rounded bg-white dark:bg-warm-800">
|
||||
|
|
|
|||
|
|
@ -200,6 +200,7 @@ export default memo(function Map({
|
|||
{...viewState}
|
||||
onMove={handleMove}
|
||||
onLoad={undefined}
|
||||
onIdle={screenshotMode ? () => { window.__map_idle = true; } : undefined}
|
||||
mapStyle={mapStyle}
|
||||
style={{ width: '100%', height: '100%' }}
|
||||
attributionControl={false}
|
||||
|
|
@ -223,10 +224,7 @@ export default memo(function Map({
|
|||
<div className="flex-1 flex items-center justify-center">
|
||||
<div className="flex items-center gap-8 bg-navy-900/90 rounded-3xl px-14 py-10">
|
||||
<LogoIcon className="w-24 h-24 text-teal-400" />
|
||||
<span
|
||||
className="font-bold text-white"
|
||||
style={{ fontSize: '5.5rem', letterSpacing: '-0.03em' }}
|
||||
>
|
||||
<span className="font-bold text-white whitespace-nowrap" style={{ fontSize: '5rem' }}>
|
||||
Your perfect postcode
|
||||
</span>
|
||||
</div>
|
||||
|
|
@ -263,55 +261,57 @@ export default memo(function Map({
|
|||
) : null
|
||||
) : (
|
||||
<>
|
||||
<LocationSearch
|
||||
onFlyTo={handleFlyTo}
|
||||
onLocationSearched={onLocationSearched}
|
||||
onMouseEnter={handleMouseLeave}
|
||||
/>
|
||||
{!hideLegend &&
|
||||
(viewFeature && colorRange ? (
|
||||
viewFeature.startsWith('tt_') ? (
|
||||
<div className="absolute top-3 left-3 right-3 z-10 flex flex-wrap items-start justify-between gap-2 pointer-events-none">
|
||||
<LocationSearch
|
||||
onFlyTo={handleFlyTo}
|
||||
onLocationSearched={onLocationSearched}
|
||||
onMouseEnter={handleMouseLeave}
|
||||
/>
|
||||
{!hideLegend &&
|
||||
(viewFeature && colorRange ? (
|
||||
viewFeature.startsWith('tt_') ? (
|
||||
<MapLegend
|
||||
featureLabel={`Travel time (${MODE_LABELS[viewFeature.split('_')[1] as keyof typeof MODE_LABELS]})`}
|
||||
range={colorRange}
|
||||
showCancel={viewSource === 'eye'}
|
||||
onCancel={onCancelPin}
|
||||
mode="feature"
|
||||
theme={theme}
|
||||
suffix=" min"
|
||||
/>
|
||||
) : colorFeatureMeta ? (
|
||||
<MapLegend
|
||||
featureLabel={
|
||||
viewSource === 'eye'
|
||||
? `Previewing \u201c${colorFeatureMeta.name}\u201d`
|
||||
: colorFeatureMeta.name
|
||||
}
|
||||
range={colorRange}
|
||||
showCancel={viewSource === 'eye'}
|
||||
onCancel={onCancelPin}
|
||||
mode="feature"
|
||||
enumValues={
|
||||
colorFeatureMeta.type === 'enum' ? colorFeatureMeta.values : undefined
|
||||
}
|
||||
theme={theme}
|
||||
raw={colorFeatureMeta.raw}
|
||||
/>
|
||||
) : null
|
||||
) : (
|
||||
<MapLegend
|
||||
featureLabel={`Travel time (${MODE_LABELS[viewFeature.split('_')[1] as keyof typeof MODE_LABELS]})`}
|
||||
range={colorRange}
|
||||
showCancel={viewSource === 'eye'}
|
||||
onCancel={onCancelPin}
|
||||
mode="feature"
|
||||
theme={theme}
|
||||
suffix=" min"
|
||||
/>
|
||||
) : colorFeatureMeta ? (
|
||||
<MapLegend
|
||||
featureLabel={
|
||||
viewSource === 'eye'
|
||||
? `Previewing \u201c${colorFeatureMeta.name}\u201d`
|
||||
: colorFeatureMeta.name
|
||||
featureLabel="Number of properties"
|
||||
range={
|
||||
usePostcodeView
|
||||
? [postcodeCountRange.min, postcodeCountRange.max]
|
||||
: [countRange.min, countRange.max]
|
||||
}
|
||||
range={colorRange}
|
||||
showCancel={viewSource === 'eye'}
|
||||
showCancel={false}
|
||||
onCancel={onCancelPin}
|
||||
mode="feature"
|
||||
enumValues={
|
||||
colorFeatureMeta.type === 'enum' ? colorFeatureMeta.values : undefined
|
||||
}
|
||||
mode="density"
|
||||
theme={theme}
|
||||
raw={colorFeatureMeta.raw}
|
||||
/>
|
||||
) : null
|
||||
) : (
|
||||
<MapLegend
|
||||
featureLabel="Number of properties"
|
||||
range={
|
||||
usePostcodeView
|
||||
? [postcodeCountRange.min, postcodeCountRange.max]
|
||||
: [countRange.min, countRange.max]
|
||||
}
|
||||
showCancel={false}
|
||||
onCancel={onCancelPin}
|
||||
mode="density"
|
||||
theme={theme}
|
||||
/>
|
||||
))}
|
||||
))}
|
||||
</div>
|
||||
{popupInfo && (
|
||||
<div
|
||||
className="absolute bg-white dark:bg-warm-800 rounded-lg shadow-lg text-sm dark:text-white"
|
||||
|
|
|
|||
|
|
@ -125,7 +125,7 @@ export default function MapLegend({
|
|||
}
|
||||
|
||||
return (
|
||||
<div className="absolute top-3 right-3 z-10 bg-white dark:bg-navy-800 dark:text-white rounded shadow-lg p-3 text-xs min-w-[160px]">
|
||||
<div className="bg-white dark:bg-navy-800 dark:text-white rounded shadow-lg p-3 text-xs min-w-[300px] pointer-events-auto">
|
||||
<div className="flex items-center justify-between mb-2">
|
||||
<span className="font-semibold text-sm dark:text-white">{featureLabel}</span>
|
||||
{showCancel && (
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ import { usePOIData } from '../../hooks/usePOIData';
|
|||
import { useFilters } from '../../hooks/useFilters';
|
||||
import { useHexagonSelection } from '../../hooks/useHexagonSelection';
|
||||
import { usePaneResize } from '../../hooks/usePaneResize';
|
||||
import { useCollapsibleGroups } from '../../hooks/useCollapsibleGroups';
|
||||
import { useAiFilters } from '../../hooks/useAiFilters';
|
||||
import { useUrlSync } from '../../hooks/useUrlSync';
|
||||
import { useTutorial } from '../../hooks/useTutorial';
|
||||
|
|
@ -38,6 +39,7 @@ import { trackEvent } from '../../lib/analytics';
|
|||
import { INITIAL_VIEW_STATE } from '../../lib/consts';
|
||||
import { useLicense } from '../../hooks/useLicense';
|
||||
import UpgradeModal from '../ui/UpgradeModal';
|
||||
import { CloseIcon } from '../ui/icons/CloseIcon';
|
||||
import { SpinnerIcon } from '../ui/icons/SpinnerIcon';
|
||||
import { MapPinIcon } from '../ui/icons/MapPinIcon';
|
||||
import { BookmarkIcon } from '../ui/icons/BookmarkIcon';
|
||||
|
|
@ -202,13 +204,6 @@ export default function MapPage({
|
|||
]
|
||||
);
|
||||
|
||||
const handleTravelTimeSetDestination = useCallback(
|
||||
(index: number, slug: string, label: string) => {
|
||||
travelTime.handleSetDestination(index, slug, label);
|
||||
},
|
||||
[travelTime.handleSetDestination]
|
||||
);
|
||||
|
||||
const handleTravelTimeRemoveEntry = useCallback(
|
||||
(index: number) => {
|
||||
const entry = travelTime.entries[index];
|
||||
|
|
@ -240,6 +235,16 @@ export default function MapPage({
|
|||
travelTimeEntries: travelTime.entries,
|
||||
});
|
||||
|
||||
const handleTravelTimeSetDestination = useCallback(
|
||||
(index: number, slug: string, label: string, lat: number, lon: number) => {
|
||||
travelTime.handleSetDestination(index, slug, label);
|
||||
if (slug) {
|
||||
mapFlyToRef.current?.(lat, lon, mapData.currentView?.zoom ?? INITIAL_VIEW_STATE.zoom);
|
||||
}
|
||||
},
|
||||
[travelTime.handleSetDestination, mapData.currentView?.zoom]
|
||||
);
|
||||
|
||||
// First transit destination — used to pick the best central_postcode for journey display
|
||||
const journeyDest = useMemo(() => {
|
||||
const entry = travelTime.entries.find((e) => e.mode === 'transit' && e.slug);
|
||||
|
|
@ -274,6 +279,7 @@ export default function MapPage({
|
|||
}, []);
|
||||
|
||||
const pois = usePOIData(mapData.bounds, selectedPOICategories);
|
||||
const [isAreaGroupExpanded, toggleAreaGroup] = useCollapsibleGroups(true);
|
||||
|
||||
useUrlSync(
|
||||
mapData.currentView,
|
||||
|
|
@ -437,14 +443,22 @@ export default function MapPage({
|
|||
? mapData.postcodeData.length > 0
|
||||
: mapData.data.length > 0;
|
||||
if (hasData) {
|
||||
// Wait for deck.gl to actually paint: in interleaved MapboxOverlay mode,
|
||||
// hexagons render during MapLibre's rAF cycle. Double-rAF ensures at
|
||||
// least one full paint has completed before we signal readiness.
|
||||
requestAnimationFrame(() => {
|
||||
requestAnimationFrame(() => {
|
||||
window.__screenshot_ready = true;
|
||||
});
|
||||
});
|
||||
// Wait for both deck.gl data AND MapLibre base map tile rendering.
|
||||
// __map_idle is set by Map's onIdle callback, which fires after all
|
||||
// tiles are loaded and rendered — critical for SwiftShader where
|
||||
// edge tiles can lag behind the center.
|
||||
const waitAndSignal = () => {
|
||||
if (window.__map_idle) {
|
||||
requestAnimationFrame(() => {
|
||||
requestAnimationFrame(() => {
|
||||
window.__screenshot_ready = true;
|
||||
});
|
||||
});
|
||||
} else {
|
||||
requestAnimationFrame(waitAndSignal);
|
||||
}
|
||||
};
|
||||
waitAndSignal();
|
||||
}
|
||||
}
|
||||
}, [
|
||||
|
|
@ -528,6 +542,8 @@ export default function MapPage({
|
|||
hexagonLocation={hexagonLocation}
|
||||
filters={filters}
|
||||
travelTimeEntries={travelTime.activeEntries}
|
||||
isGroupExpanded={isAreaGroupExpanded}
|
||||
onToggleGroup={toggleAreaGroup}
|
||||
/>
|
||||
);
|
||||
|
||||
|
|
@ -850,6 +866,13 @@ export default function MapPage({
|
|||
isActive={selection.rightPaneTab === 'properties'}
|
||||
onClick={selection.handlePropertiesTabClick}
|
||||
/>
|
||||
<button
|
||||
onClick={selection.handleCloseSelection}
|
||||
className="px-2 flex items-center text-warm-400 hover:text-warm-700 dark:hover:text-warm-300"
|
||||
title="Close pane"
|
||||
>
|
||||
<CloseIcon className="w-4 h-4" />
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<div className="flex-1 overflow-hidden">
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ export default function POIPane({
|
|||
onNavigateToSource,
|
||||
}: POIPaneProps) {
|
||||
const [searchTerm, setSearchTerm] = useState('');
|
||||
const [collapsedGroups, toggleCollapse] = useCollapsibleGroups();
|
||||
const [isGroupExpanded, toggleCollapse] = useCollapsibleGroups();
|
||||
const [showInfo, setShowInfo] = useState(false);
|
||||
|
||||
const allCategories = groups.flatMap((g) => g.categories);
|
||||
|
|
@ -150,7 +150,7 @@ export default function POIPane({
|
|||
const groupSelected = group.categories.filter((c) => selectedCategories.has(c)).length;
|
||||
const allInGroupSelected = groupSelected === group.categories.length;
|
||||
const someInGroupSelected = groupSelected > 0 && !allInGroupSelected;
|
||||
const isCollapsed = collapsedGroups.has(group.name) && !searchTerm;
|
||||
const isCollapsed = !isGroupExpanded(group.name) && !searchTerm;
|
||||
|
||||
return (
|
||||
<div key={group.name}>
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ interface TravelTimeCardProps {
|
|||
isActive: boolean;
|
||||
dragValue: [number, number] | null;
|
||||
onTogglePin: () => void;
|
||||
onSetDestination: (slug: string, label: string) => void;
|
||||
onSetDestination: (slug: string, label: string, lat: number, lon: number) => void;
|
||||
onTimeRangeChange: (range: [number, number]) => void;
|
||||
onDragStart: () => void;
|
||||
onDragChange: (value: [number, number]) => void;
|
||||
|
|
@ -54,8 +54,8 @@ export function TravelTimeCard({
|
|||
const [showBestInfo, setShowBestInfo] = useState(false);
|
||||
|
||||
const handleDestinationSelect = useCallback(
|
||||
(selectedSlug: string, selectedLabel: string) => {
|
||||
onSetDestination(selectedSlug, selectedLabel);
|
||||
(selectedSlug: string, selectedLabel: string, lat: number, lon: number) => {
|
||||
onSetDestination(selectedSlug, selectedLabel, lat, lon);
|
||||
},
|
||||
[onSetDestination]
|
||||
);
|
||||
|
|
@ -103,7 +103,7 @@ export function TravelTimeCard({
|
|||
loading={destinationsLoading}
|
||||
onSelect={handleDestinationSelect}
|
||||
value={label || undefined}
|
||||
onClear={() => onSetDestination('', '')}
|
||||
onClear={() => onSetDestination('', '', 0, 0)}
|
||||
placeholder="Select destination..."
|
||||
/>
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ import { CloseIcon } from './icons/CloseIcon';
|
|||
interface DestinationDropdownProps {
|
||||
destinations: Destination[];
|
||||
loading: boolean;
|
||||
onSelect: (slug: string, label: string) => void;
|
||||
onSelect: (slug: string, label: string, lat: number, lon: number) => void;
|
||||
onClear?: () => void;
|
||||
value?: string;
|
||||
placeholder?: string;
|
||||
|
|
@ -66,7 +66,7 @@ export function DestinationDropdown({
|
|||
|
||||
const handleSelect = useCallback(
|
||||
(dest: Destination) => {
|
||||
onSelect(dest.slug, dest.name);
|
||||
onSelect(dest.slug, dest.name, dest.lat, dest.lon);
|
||||
setOpen(false);
|
||||
setFilter('');
|
||||
setActiveIndex(-1);
|
||||
|
|
|
|||
|
|
@ -169,15 +169,6 @@ export default function Header({
|
|||
Pricing
|
||||
</a>
|
||||
)}
|
||||
{user && (
|
||||
<a
|
||||
href={PAGE_PATHS.saved}
|
||||
className={tabClass('saved')}
|
||||
onClick={(e) => navLink('saved', e)}
|
||||
>
|
||||
Saved
|
||||
</a>
|
||||
)}
|
||||
</nav>
|
||||
)}
|
||||
</div>
|
||||
|
|
@ -187,20 +178,6 @@ export default function Header({
|
|||
{/* Desktop-only dashboard actions */}
|
||||
{!isMobile && activePage === 'dashboard' && (
|
||||
<>
|
||||
{onSaveSearch && (
|
||||
<button
|
||||
onClick={onSaveSearch}
|
||||
disabled={savingSearch}
|
||||
className="flex items-center gap-1.5 px-3 py-1.5 rounded bg-navy-800 hover:bg-navy-700 transition-colors text-sm disabled:opacity-50 disabled:cursor-wait"
|
||||
>
|
||||
{savingSearch ? (
|
||||
<SpinnerIcon className="w-4 h-4 animate-spin" />
|
||||
) : (
|
||||
<BookmarkIcon className="w-4 h-4" />
|
||||
)}
|
||||
Save
|
||||
</button>
|
||||
)}
|
||||
<button
|
||||
onClick={handleShare}
|
||||
disabled={sharing}
|
||||
|
|
@ -232,8 +209,31 @@ export default function Header({
|
|||
<DownloadIcon className="w-4 h-4" />
|
||||
{exporting ? 'Exporting...' : 'Export'}
|
||||
</button>
|
||||
{onSaveSearch && (
|
||||
<button
|
||||
onClick={onSaveSearch}
|
||||
disabled={savingSearch}
|
||||
className="flex items-center gap-1.5 px-3 py-1.5 rounded bg-navy-800 hover:bg-navy-700 transition-colors text-sm disabled:opacity-50 disabled:cursor-wait"
|
||||
>
|
||||
{savingSearch ? (
|
||||
<SpinnerIcon className="w-4 h-4 animate-spin" />
|
||||
) : (
|
||||
<BookmarkIcon className="w-4 h-4" />
|
||||
)}
|
||||
Save
|
||||
</button>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
{!isMobile && user && (
|
||||
<a
|
||||
href={PAGE_PATHS.saved}
|
||||
className={tabClass('saved')}
|
||||
onClick={(e) => navLink('saved', e)}
|
||||
>
|
||||
Saved
|
||||
</a>
|
||||
)}
|
||||
|
||||
{/* Desktop-only auth */}
|
||||
{!isMobile && (
|
||||
|
|
|
|||
|
|
@ -90,28 +90,10 @@ export default function MobileMenu({
|
|||
mobileNavItem('pricing', 'Pricing')}
|
||||
{user && mobileNavItem('invites', 'Invite Friends')}
|
||||
{user && mobileNavItem('account', 'Account')}
|
||||
{user && mobileNavItem('saved', 'Saved')}
|
||||
|
||||
{/* Dashboard actions */}
|
||||
{activePage === 'dashboard' && (
|
||||
<div className="mt-3 pt-3 border-t border-navy-700 flex flex-col gap-1">
|
||||
{onSaveSearch && (
|
||||
<button
|
||||
onClick={() => {
|
||||
onSaveSearch();
|
||||
onClose();
|
||||
}}
|
||||
disabled={savingSearch}
|
||||
className="w-full flex items-center gap-2 px-4 py-3 text-base text-warm-300 hover:bg-navy-800 hover:text-white rounded disabled:opacity-50"
|
||||
>
|
||||
{savingSearch ? (
|
||||
<SpinnerIcon className="w-5 h-5 animate-spin" />
|
||||
) : (
|
||||
<BookmarkIcon className="w-5 h-5" />
|
||||
)}
|
||||
Save
|
||||
</button>
|
||||
)}
|
||||
<button
|
||||
onClick={() => {
|
||||
onShare();
|
||||
|
|
@ -133,8 +115,27 @@ export default function MobileMenu({
|
|||
<DownloadIcon className="w-5 h-5" />
|
||||
{exporting ? 'Exporting...' : 'Export'}
|
||||
</button>
|
||||
{onSaveSearch && (
|
||||
<button
|
||||
onClick={() => {
|
||||
onSaveSearch();
|
||||
onClose();
|
||||
}}
|
||||
disabled={savingSearch}
|
||||
className="w-full flex items-center gap-2 px-4 py-3 text-base text-warm-300 hover:bg-navy-800 hover:text-white rounded disabled:opacity-50"
|
||||
>
|
||||
{savingSearch ? (
|
||||
<SpinnerIcon className="w-5 h-5 animate-spin" />
|
||||
) : (
|
||||
<BookmarkIcon className="w-5 h-5" />
|
||||
)}
|
||||
Save
|
||||
</button>
|
||||
)}
|
||||
{user && mobileNavItem('saved', 'Saved')}
|
||||
</div>
|
||||
)}
|
||||
{activePage !== 'dashboard' && user && mobileNavItem('saved', 'Saved')}
|
||||
</nav>
|
||||
|
||||
{/* Theme toggle + Auth section at bottom */}
|
||||
|
|
|
|||
|
|
@ -19,6 +19,8 @@ export interface AiFiltersResult {
|
|||
summary: string;
|
||||
/** The listing mode used (historical/buy/rent) */
|
||||
listingType: string;
|
||||
/** Number of properties matching the proposed filters (excludes travel time) */
|
||||
matchCount: number;
|
||||
}
|
||||
|
||||
export type AiFilterErrorType = 'auth' | 'limit' | 'error';
|
||||
|
|
@ -43,7 +45,11 @@ interface UseAiFiltersResult {
|
|||
}
|
||||
|
||||
/** Build a human-readable summary of the AI result. */
|
||||
function buildSummary(filters: FeatureFilters, travelTimeFilters: AiTravelTimeFilter[]): string {
|
||||
function buildSummary(
|
||||
filters: FeatureFilters,
|
||||
travelTimeFilters: AiTravelTimeFilter[],
|
||||
matchCount: number
|
||||
): string {
|
||||
const parts: string[] = [];
|
||||
|
||||
for (const [name, value] of Object.entries(filters)) {
|
||||
|
|
@ -63,7 +69,8 @@ function buildSummary(filters: FeatureFilters, travelTimeFilters: AiTravelTimeFi
|
|||
}
|
||||
|
||||
if (parts.length === 0) return 'No filters set';
|
||||
return `Set ${parts.length} filter${parts.length > 1 ? 's' : ''}: ${parts.join(', ')}`;
|
||||
const countStr = matchCount.toLocaleString();
|
||||
return `${countStr} properties match · Set ${parts.length} filter${parts.length > 1 ? 's' : ''}: ${parts.join(', ')}`;
|
||||
}
|
||||
|
||||
export function useAiFilters(): UseAiFiltersResult {
|
||||
|
|
@ -137,13 +144,15 @@ export function useAiFilters(): UseAiFiltersResult {
|
|||
})
|
||||
);
|
||||
const filters = json.filters as FeatureFilters;
|
||||
const summaryText = buildSummary(filters, travelTimeFilters);
|
||||
const matchCount: number = json.match_count ?? 0;
|
||||
const summaryText = buildSummary(filters, travelTimeFilters, matchCount);
|
||||
const result: AiFiltersResult = {
|
||||
filters,
|
||||
travelTimeFilters,
|
||||
notes: json.notes || '',
|
||||
summary: summaryText,
|
||||
listingType: json.listing_type || 'historical',
|
||||
matchCount,
|
||||
};
|
||||
setNotes(result.notes || null);
|
||||
setSummary(summaryText);
|
||||
|
|
|
|||
|
|
@ -1,14 +1,24 @@
|
|||
import { useState, useCallback } from 'react';
|
||||
|
||||
export function useCollapsibleGroups(): [
|
||||
Set<string>,
|
||||
/**
|
||||
* Manages collapsible group state.
|
||||
* @param defaultCollapsed When true, groups start collapsed (tracks expanded groups).
|
||||
* When false (default), groups start expanded (tracks collapsed groups).
|
||||
*/
|
||||
export function useCollapsibleGroups(defaultCollapsed = false): [
|
||||
(name: string) => boolean,
|
||||
(name: string) => void,
|
||||
(name: string) => void,
|
||||
] {
|
||||
const [collapsed, setCollapsed] = useState<Set<string>>(new Set());
|
||||
const [toggled, setToggled] = useState<Set<string>>(new Set());
|
||||
|
||||
const isExpanded = useCallback(
|
||||
(name: string) => (defaultCollapsed ? toggled.has(name) : !toggled.has(name)),
|
||||
[toggled, defaultCollapsed]
|
||||
);
|
||||
|
||||
const toggle = useCallback((name: string) => {
|
||||
setCollapsed((prev) => {
|
||||
setToggled((prev) => {
|
||||
const next = new Set(prev);
|
||||
if (next.has(name)) next.delete(name);
|
||||
else next.add(name);
|
||||
|
|
@ -16,14 +26,24 @@ export function useCollapsibleGroups(): [
|
|||
});
|
||||
}, []);
|
||||
|
||||
const expand = useCallback((name: string) => {
|
||||
setCollapsed((prev) => {
|
||||
if (!prev.has(name)) return prev;
|
||||
const next = new Set(prev);
|
||||
next.delete(name);
|
||||
return next;
|
||||
});
|
||||
}, []);
|
||||
const expand = useCallback(
|
||||
(name: string) => {
|
||||
setToggled((prev) => {
|
||||
if (defaultCollapsed) {
|
||||
if (prev.has(name)) return prev;
|
||||
const next = new Set(prev);
|
||||
next.add(name);
|
||||
return next;
|
||||
} else {
|
||||
if (!prev.has(name)) return prev;
|
||||
const next = new Set(prev);
|
||||
next.delete(name);
|
||||
return next;
|
||||
}
|
||||
});
|
||||
},
|
||||
[defaultCollapsed]
|
||||
);
|
||||
|
||||
return [collapsed, toggle, expand];
|
||||
return [isExpanded, toggle, expand];
|
||||
}
|
||||
|
|
|
|||
|
|
@ -321,7 +321,7 @@ export function useDeckLayers({
|
|||
ttVal as number,
|
||||
ttVal as number,
|
||||
clr,
|
||||
null,
|
||||
fr,
|
||||
0,
|
||||
densityGradientRef.current,
|
||||
dark,
|
||||
|
|
@ -422,7 +422,7 @@ export function useDeckLayers({
|
|||
ttVal as number,
|
||||
ttVal as number,
|
||||
clr,
|
||||
null,
|
||||
fr,
|
||||
0,
|
||||
densityGradientRef.current,
|
||||
dark,
|
||||
|
|
|
|||
|
|
@ -82,7 +82,9 @@ export function useMapData({
|
|||
|
||||
// Build the travel param string from entries with destinations.
|
||||
// Format: mode:slug[:best][:min:max] — server filters rows outside [min,max].
|
||||
// When excludeFieldKey is set, that entry's time range is omitted (for drag preview).
|
||||
// When excludeFieldKey is set, that entry uses a wide range (0:1440) instead of
|
||||
// the committed range. This still filters out rows with no travel data (the server
|
||||
// skips rows where minutes=None when any range is set) while including all actual values.
|
||||
const buildTravelParam = useCallback(
|
||||
(excludeFieldKey?: string): string => {
|
||||
const segments: string[] = [];
|
||||
|
|
@ -91,7 +93,11 @@ export function useMapData({
|
|||
let seg = `${entry.mode}:${entry.slug}`;
|
||||
if (entry.useBest) seg += ':best';
|
||||
const isExcluded = excludeFieldKey === `tt_${entry.mode}_${entry.slug}`;
|
||||
if (entry.timeRange && !isExcluded) seg += `:${entry.timeRange[0]}:${entry.timeRange[1]}`;
|
||||
if (isExcluded) {
|
||||
seg += ':0:1440';
|
||||
} else if (entry.timeRange) {
|
||||
seg += `:${entry.timeRange[0]}:${entry.timeRange[1]}`;
|
||||
}
|
||||
segments.push(seg);
|
||||
}
|
||||
return segments.join('|');
|
||||
|
|
@ -119,11 +125,14 @@ export function useMapData({
|
|||
const boundsStr = `${bounds.south},${bounds.west},${bounds.north},${bounds.east}`;
|
||||
const isTravelTimeDrag = activeFeature.startsWith('tt_');
|
||||
const dragTravelParam = isTravelTimeDrag ? buildTravelParam(activeFeature) : travelParam;
|
||||
// Travel time fields are computed from the travel param, not regular feature columns.
|
||||
// Sending a tt_* name as fields would cause a 400 (unknown field). Use empty string instead.
|
||||
const fieldsParam = isTravelTimeDrag ? '' : activeFeature;
|
||||
|
||||
if (usePostcodeView) {
|
||||
const params = new URLSearchParams({ bounds: boundsStr });
|
||||
if (filtersStr) params.set('filters', filtersStr);
|
||||
params.set('fields', activeFeature);
|
||||
params.set('fields', fieldsParam);
|
||||
if (dragTravelParam) params.set('travel', dragTravelParam);
|
||||
|
||||
fetch(apiUrl('postcodes', params), authHeaders({ signal: dragAbortRef.current.signal }))
|
||||
|
|
@ -140,7 +149,7 @@ export function useMapData({
|
|||
bounds: boundsStr,
|
||||
});
|
||||
if (filtersStr) params.set('filters', filtersStr);
|
||||
params.set('fields', activeFeature);
|
||||
params.set('fields', fieldsParam);
|
||||
if (dragTravelParam) params.set('travel', dragTravelParam);
|
||||
|
||||
fetch(apiUrl('hexagons', params), authHeaders({ signal: dragAbortRef.current.signal }))
|
||||
|
|
|
|||
|
|
@ -7,6 +7,8 @@ export interface Destination {
|
|||
slug: string;
|
||||
place_type: string;
|
||||
city?: string;
|
||||
lat: number;
|
||||
lon: number;
|
||||
}
|
||||
|
||||
/** Fetches all travel-time destinations for a mode once, with client-side caching. */
|
||||
|
|
|
|||
|
|
@ -189,22 +189,7 @@ export const STACKED_ENUM_GROUPS: Record<
|
|||
valueColors: ['#3b82f6', '#f59e0b'],
|
||||
},
|
||||
],
|
||||
Environment: [
|
||||
{
|
||||
label: 'Ground Risk',
|
||||
feature: 'Environmental risk',
|
||||
components: [
|
||||
'Collapsible deposits risk',
|
||||
'Compressible ground risk',
|
||||
'Landslide risk',
|
||||
'Running sand risk',
|
||||
'Shrink-swell risk',
|
||||
'Soluble rocks risk',
|
||||
],
|
||||
valueOrder: ['Low', 'Moderate', 'Significant'],
|
||||
valueColors: ['#22c55e', '#eab308', '#ef4444'],
|
||||
},
|
||||
],
|
||||
Environment: [],
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -181,6 +181,19 @@ const FEATURE_ICON_PATHS: Record<string, ReactNode> = {
|
|||
<path d="M6 12v5c0 2.5 3 4 6 4s6-1.5 6-4v-5" />
|
||||
</>
|
||||
),
|
||||
'Good+ primary schools within 2km': (
|
||||
<>
|
||||
<path d="M4 19V9l8-6 8 6v10" />
|
||||
<path d="M9 19v-6h6v6" />
|
||||
<line x1="4" y1="19" x2="20" y2="19" />
|
||||
</>
|
||||
),
|
||||
'Good+ secondary schools within 2km': (
|
||||
<>
|
||||
<path d="M22 10v6M2 10l10-5 10 5-10 5z" />
|
||||
<path d="M6 12v5c0 2.5 3 4 6 4s6-1.5 6-4v-5" />
|
||||
</>
|
||||
),
|
||||
|
||||
// ── Deprivation ──────────────────────────────
|
||||
'Income Score (rate)': (
|
||||
|
|
@ -443,52 +456,6 @@ const FEATURE_ICON_PATHS: Record<string, ReactNode> = {
|
|||
<line x1="12" y1="20" x2="12.01" y2="20" />
|
||||
</>
|
||||
),
|
||||
'Environmental risk': (
|
||||
<>
|
||||
<path d="M10.29 3.86L1.82 18a2 2 0 001.71 3h16.94a2 2 0 001.71-3L13.71 3.86a2 2 0 00-3.42 0z" />
|
||||
<line x1="12" y1="9" x2="12" y2="13" />
|
||||
<line x1="12" y1="17" x2="12.01" y2="17" />
|
||||
</>
|
||||
),
|
||||
'Collapsible deposits risk': (
|
||||
<>
|
||||
<polyline points="12 2 2 7 12 12 22 7 12 2" />
|
||||
<polyline points="2 17 12 22 22 17" />
|
||||
<polyline points="2 12 12 17 22 12" />
|
||||
</>
|
||||
),
|
||||
'Compressible ground risk': (
|
||||
<>
|
||||
<line x1="12" y1="2" x2="12" y2="22" />
|
||||
<polyline points="16 6 12 2 8 6" />
|
||||
<polyline points="16 18 12 22 8 18" />
|
||||
<line x1="4" y1="12" x2="20" y2="12" />
|
||||
</>
|
||||
),
|
||||
'Landslide risk': (
|
||||
<>
|
||||
<path d="M8 3l4 8 5-5 5 15H2L8 3z" />
|
||||
</>
|
||||
),
|
||||
'Running sand risk': (
|
||||
<>
|
||||
<path d="M2 6c2-1 4-1 6 0s4 1 6 0 4-1 6 0" />
|
||||
<path d="M2 12c2-1 4-1 6 0s4 1 6 0 4-1 6 0" />
|
||||
<path d="M2 18c2-1 4-1 6 0s4 1 6 0 4-1 6 0" />
|
||||
</>
|
||||
),
|
||||
'Shrink-swell risk': (
|
||||
<>
|
||||
<line x1="2" y1="12" x2="22" y2="12" />
|
||||
<polyline points="6 8 2 12 6 16" />
|
||||
<polyline points="18 8 22 12 18 16" />
|
||||
</>
|
||||
),
|
||||
'Soluble rocks risk': (
|
||||
<>
|
||||
<path d="M12 2.69l5.66 5.66a8 8 0 11-11.31 0z" />
|
||||
</>
|
||||
),
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -170,7 +170,7 @@ export function summarizeParams(queryString: string): string {
|
|||
const colonIdx = entry.indexOf(':');
|
||||
return colonIdx > 0 ? entry.substring(0, colonIdx) : entry;
|
||||
})
|
||||
.filter(Boolean);
|
||||
.filter((n) => n && n !== 'Listing status');
|
||||
if (filterNames.length > 0) {
|
||||
parts.push(
|
||||
filterNames.length <= 2 ? filterNames.join(', ') : `${filterNames.length} filters`
|
||||
|
|
@ -186,5 +186,13 @@ export function summarizeParams(queryString: string): string {
|
|||
}
|
||||
}
|
||||
|
||||
const ttParams = params.getAll('tt');
|
||||
if (ttParams.length > 0) {
|
||||
const count = ttParams.filter(Boolean).length;
|
||||
if (count > 0) {
|
||||
parts.push(`${count} travel time ${count === 1 ? 'destination' : 'destinations'}`);
|
||||
}
|
||||
}
|
||||
|
||||
return parts.length > 0 ? parts.join(' + ') : 'No filters';
|
||||
}
|
||||
|
|
|
|||
269
pipeline/check_travel_times.py
Normal file
269
pipeline/check_travel_times.py
Normal file
|
|
@ -0,0 +1,269 @@
|
|||
"""Find corrupted and duplicate travel-time parquet files.
|
||||
|
||||
A travel-time parquet file is considered corrupted when the R5 routing
|
||||
computation failed or was interrupted, leaving either zero rows or only
|
||||
the origin postcode. We detect this by comparing each file's row count
|
||||
against a per-mode threshold derived from the 5th-percentile of all files
|
||||
in that mode. Files at or below 1 row are always flagged.
|
||||
|
||||
Duplicates arise when places.parquet is rebuilt between R5 runs — each
|
||||
place gets a new numeric index prefix, so the skip-completed logic
|
||||
doesn't recognize previous results. --dedup keeps only the largest
|
||||
file per slug and removes the rest.
|
||||
|
||||
Usage:
|
||||
uv run python pipeline/check_travel_times.py [--travel-times property-data/travel-times]
|
||||
[--threshold-pct 5]
|
||||
[--delete]
|
||||
[--dedup]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
|
||||
@dataclass
class BadFile:
    """A travel-time parquet file singled out for reporting or deletion.

    The same record type is used for suspected-corrupt files and for
    redundant duplicates.
    """

    # Name of the mode sub-directory that holds the file.
    mode: str
    # File name inside the mode directory, including the ".parquet" suffix.
    filename: str
    # File stem with any leading "<digits>-" index prefix removed.
    slug: str
    # Row count of the file; -1 marks a file that could not be read.
    rows: int
|
||||
|
||||
|
||||
def scan_mode(mode_dir: Path, mode: str) -> tuple[list[tuple[str, str, int]], int]:
    """Count the rows of every ``*.parquet`` file directly inside ``mode_dir``.

    Args:
        mode_dir: Directory to list.
        mode: Mode name, used only in error messages.

    Returns:
        ``(entries, errors)``. ``entries`` holds one
        ``(filename, slug, row_count)`` tuple per parquet file, in sorted
        filename order; ``errors`` is the number of files that could not be
        read. Unreadable files stay in ``entries`` with a row count of ``-1``.
    """
    results: list[tuple[str, str, int]] = []
    failed = 0
    parquet_names = [n for n in sorted(os.listdir(mode_dir)) if n.endswith(".parquet")]
    for name in parquet_names:
        stem = name.removesuffix(".parquet")
        # Drop a leading numeric index ("000699-london-bridge" → "london-bridge").
        head, sep, tail = stem.partition("-")
        slug = tail if sep and head.isdigit() else stem
        try:
            # Lazy scan + pl.len() asks only for the row count.
            count = pl.scan_parquet(mode_dir / name).select(pl.len()).collect().item()
        except Exception as exc:
            print(f" ERROR reading {mode}/{name}: {exc}", file=sys.stderr)
            failed += 1
            count = -1
        results.append((name, slug, count))
    return results, failed
|
||||
|
||||
|
||||
def percentile(values: list[int], pct: float) -> float:
    """Return the ``pct``-th percentile of ``values`` using linear interpolation.

    Args:
        values: Samples in any order; a sorted copy is made internally.
        pct: Requested percentile. Values outside ``[0, 100]`` are clamped to
            that range, so an out-of-range command-line value yields the
            minimum or maximum instead of an IndexError or an extrapolated
            result.

    Returns:
        The interpolated percentile, or ``0.0`` when ``values`` is empty.
    """
    if not values:
        return 0.0
    ordered = sorted(values)
    pct = min(max(pct, 0.0), 100.0)
    # Fractional position of the percentile within the sorted samples.
    idx = (pct / 100) * (len(ordered) - 1)
    lo = int(idx)
    hi = min(lo + 1, len(ordered) - 1)
    frac = idx - lo
    return ordered[lo] + frac * (ordered[hi] - ordered[lo])
|
||||
|
||||
|
||||
def find_bad_files(
    base_dir: Path, threshold_pct: float
) -> tuple[list[BadFile], dict[str, dict]]:
    """Scan every mode directory and flag files with suspiciously few rows.

    Args:
        base_dir: Directory whose sub-directories are the travel-time modes.
        threshold_pct: Percentile of the readable row counts used as the
            per-mode cut-off.

    Returns:
        ``(bad, stats)``. ``bad`` lists every file whose row count is at or
        below its mode's threshold (unreadable files, recorded as ``-1``
        rows, are always included). ``stats`` maps each non-empty mode to a
        dict with the keys ``total``, ``errors``, ``bad``, ``threshold``,
        ``p5``, ``median``, ``min`` and ``max``.
    """
    bad: list[BadFile] = []
    stats: dict[str, dict] = {}

    modes = sorted(d for d in os.listdir(base_dir) if (base_dir / d).is_dir())

    for mode in modes:
        entries, errors = scan_mode(base_dir / mode, mode)
        if not entries:
            continue

        row_counts = [r for _, _, r in entries if r >= 0]
        if row_counts:
            p5 = percentile(row_counts, threshold_pct)
            median = percentile(row_counts, 50)
            lowest, highest = min(row_counts), max(row_counts)
        else:
            # Every file in this mode was unreadable. Keep going so they are
            # still flagged and the mode still shows up in the summary.
            p5 = median = 0.0
            lowest = highest = 0

        # Threshold: max of 1 and the chosen percentile — ensures we always
        # catch files with 0-1 rows even if p5 is 0 (e.g. walking mode).
        threshold = max(1, int(p5))

        mode_bad = [
            BadFile(mode=mode, filename=filename, slug=slug, rows=rows)
            for filename, slug, rows in entries
            if rows <= threshold
        ]
        bad.extend(mode_bad)

        stats[mode] = {
            "total": len(entries),
            "errors": errors,
            "bad": len(mode_bad),
            "threshold": threshold,
            "p5": p5,
            "median": median,
            "min": lowest,
            "max": highest,
        }

    return bad, stats
|
||||
|
||||
|
||||
def find_duplicates(base_dir: Path) -> tuple[list[BadFile], dict[str, dict]]:
    """Locate files that share a slug within a mode and mark all but the largest.

    Returns:
        ``(dupes, stats)``. ``dupes`` lists every file that is not the
        highest-row-count file for its slug. ``stats`` maps each non-empty
        mode to a dict with the keys ``total``, ``unique_slugs``,
        ``duped_slugs`` and ``removable``.
    """
    redundant: list[BadFile] = []
    summary: dict[str, dict] = {}

    mode_names = sorted(
        name for name in os.listdir(base_dir) if (base_dir / name).is_dir()
    )
    for mode in mode_names:
        entries, _ = scan_mode(base_dir / mode, mode)
        if not entries:
            continue

        # slug -> [(filename, rows), ...]
        by_slug: dict[str, list[tuple[str, int]]] = {}
        for filename, slug, rows in entries:
            by_slug.setdefault(slug, []).append((filename, rows))

        removable = 0
        duped = 0
        for slug, group in by_slug.items():
            if len(group) < 2:
                continue
            duped += 1
            # Largest file first; everything after it is a removable copy.
            group.sort(key=lambda item: item[1], reverse=True)
            extras = group[1:]
            redundant.extend(
                BadFile(mode=mode, filename=name, slug=slug, rows=count)
                for name, count in extras
            )
            removable += len(extras)

        summary[mode] = {
            "total": len(entries),
            "unique_slugs": len(by_slug),
            "duped_slugs": duped,
            "removable": removable,
        }

    return redundant, summary
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: report, and optionally delete, bad and duplicate files.

    Prints a per-mode row-count summary and the files flagged by
    ``find_bad_files``, deleting them when ``--delete`` is given. It then
    prints the duplicate report from ``find_duplicates`` and removes the
    redundant copies when ``--dedup`` is given. Exits with status 1 when the
    travel-times path is not a directory.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--travel-times",
        type=Path,
        default=Path("property-data/travel-times"),
        help="Path to travel-times directory",
    )
    cli.add_argument(
        "--threshold-pct",
        type=float,
        default=5,
        help="Percentile below which files are flagged (default: 5th)",
    )
    cli.add_argument(
        "--delete",
        action="store_true",
        help="Delete corrupted files (so R5 will recompute them)",
    )
    cli.add_argument(
        "--dedup",
        action="store_true",
        help="Remove duplicate files (keep largest per slug)",
    )
    opts = cli.parse_args()
    root = opts.travel_times

    if not root.is_dir():
        print(f"Error: {root} is not a directory", file=sys.stderr)
        sys.exit(1)

    # --- Corruption check ---
    corrupted, mode_stats = find_bad_files(root, opts.threshold_pct)

    print("=== Per-mode summary ===\n")
    print(f"{'Mode':<10} {'Total':>6} {'Bad':>5} {'Threshold':>10} {'Median':>8} {'Range':>20}")
    print("-" * 65)
    for mode in sorted(mode_stats):
        row = mode_stats[mode]
        span = f"{row['min']:,}–{row['max']:,}"
        print(
            f"{mode:<10} {row['total']:>6} {row['bad']:>5} {row['threshold']:>10,} "
            f"{row['median']:>8,.0f} {span:>20}"
        )

    if not corrupted:
        print("\nNo corrupted files found.")
    else:
        print(f"\n=== Corrupted files ({len(corrupted)} total) ===\n")
        last_mode = ""
        for item in sorted(corrupted, key=lambda b: (b.mode, b.rows, b.slug)):
            # Emit a heading each time the listing moves on to a new mode.
            if item.mode != last_mode:
                last_mode = item.mode
                print(f"\n {last_mode}/")
            label = f"{item.rows} rows" if item.rows >= 0 else "UNREADABLE"
            print(f" {item.filename} ({label})")

        if opts.delete:
            print(f"\nDeleting {len(corrupted)} corrupted files...")
            n_deleted = _delete_files(root, corrupted)
            print(f"Deleted {n_deleted}/{len(corrupted)} files.")
        else:
            print("\nRun with --delete to remove these files so R5 can recompute them.")

    # --- Dedup check ---
    duplicates, dupe_stats = find_duplicates(root)

    total_removable = sum(entry["removable"] for entry in dupe_stats.values())
    if total_removable <= 0:
        print("\nNo duplicates found.")
        return

    print(f"\n=== Duplicates ({total_removable} removable files) ===\n")
    print(f"{'Mode':<10} {'Total':>6} {'Unique':>7} {'Duped slugs':>12} {'Removable':>10}")
    print("-" * 50)
    for mode in sorted(dupe_stats):
        entry = dupe_stats[mode]
        if entry["removable"] > 0:
            print(
                f"{mode:<10} {entry['total']:>6} {entry['unique_slugs']:>7} "
                f"{entry['duped_slugs']:>12} {entry['removable']:>10}"
            )

    if not opts.dedup:
        print("\nRun with --dedup to remove duplicates (keeps largest per slug).")
        return

    # Exclude files already deleted by --delete
    already_gone = {(b.mode, b.filename) for b in corrupted} if opts.delete else set()
    pending = [d for d in duplicates if (d.mode, d.filename) not in already_gone]
    print(f"\nRemoving {len(pending)} duplicate files (keeping largest per slug)...")
    n_deleted = _delete_files(root, pending)
    print(f"Deleted {n_deleted}/{len(pending)} files.")
|
||||
|
||||
|
||||
def _delete_files(base_dir: Path, files: list[BadFile]) -> int:
|
||||
deleted = 0
|
||||
for bf in files:
|
||||
path = base_dir / bf.mode / bf.filename
|
||||
try:
|
||||
path.unlink()
|
||||
deleted += 1
|
||||
except OSError as exc:
|
||||
print(f" Failed to delete {path}: {exc}", file=sys.stderr)
|
||||
return deleted
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,44 +0,0 @@
|
|||
"""Download OS GeoSure ground stability data (5km hex grid).
|
||||
|
||||
Downloads the GB-Hex-5km-GeoSure dataset from Ordnance Survey as an ESRI
|
||||
Shapefile and extracts it.
|
||||
|
||||
Source: https://osdatahub.os.uk/downloads/open/GeoSure
|
||||
License: Open Government Licence v3.0
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from pipeline.utils import download, extract_zip
|
||||
|
||||
URL = "https://api.os.uk/downloads/v1/products/GB-Hex-5km-GeoSure/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect"
|
||||
|
||||
|
||||
def main() -> None:
    """Download the GeoSure archive and unpack it into the ``--output`` directory.

    The zip is fetched into a temporary directory that is discarded
    afterwards. The shapefiles found under the output directory are then
    listed on stdout.
    """
    cli = argparse.ArgumentParser(
        description="Download OS GeoSure ground stability data"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for extracted shapefile",
    )
    output_dir = cli.parse_args().output

    with tempfile.TemporaryDirectory() as cache_dir:
        archive = Path(cache_dir) / "geosure.zip"
        download(URL, archive, timeout=300)
        extract_zip(archive, output_dir)

    extracted = list(output_dir.rglob("*.shp"))
    print(f"Extracted {len(extracted)} shapefiles to {output_dir}")
    for shp_file in extracted:
        print(f" {shp_file.relative_to(output_dir)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
140
pipeline/download/median_age.py
Normal file
140
pipeline/download/median_age.py
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
"""Download Census 2021 median age by LSOA.
|
||||
|
||||
Downloads five-year age band counts (TS007A) from the NOMIS API, then computes
|
||||
the median age per LSOA using linear interpolation within the median class.
|
||||
|
||||
Source: NOMIS (ONS Census 2021 — TS007A dataset, NM_2020_1)
|
||||
License: Open Government Licence v3.0
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
|
||||
# NOMIS API query for Census 2021 TS007A (age by five-year bands) at
# LSOA 2021 level (geography TYPE151):
#   c2021_age_19=1..18  the 18 five-year bands (category 0 = Total is left out)
#   measures=20100      absolute counts
BASE_URL = "https://www.nomisweb.co.uk/api/v01/dataset/NM_2020_1.data.csv?date=latest&geography=TYPE151&c2021_age_19=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18&measures=20100&select=GEOGRAPHY_CODE,C2021_AGE_19_NAME,OBS_VALUE"
# Row count used to detect the last page and to advance the record offset.
PAGE_SIZE = 25000

# (lower_bound, width) for each five-year band, youngest first: 0-4, 5-9, …, 80-84, 85+.
# The final band (85+) is open-ended; for median purposes it is treated as 85-89.
AGE_BANDS = [(lower, 5) for lower in range(0, 90, 5)]
|
||||
|
||||
|
||||
def compute_median_age(counts: list[int]) -> float:
    """Compute the median age from five-year band counts by linear interpolation.

    Args:
        counts: Population per age band, in the same order as ``AGE_BANDS``.
            A missing (``None``) count is treated as zero so a gap in the
            source data does not raise.

    Returns:
        The interpolated median age, or NaN when the total population is zero.
    """
    cleaned = [c or 0 for c in counts]
    total = sum(cleaned)
    if total == 0:
        return float("nan")

    half = total / 2
    cumulative = 0
    for i, count in enumerate(cleaned):
        if cumulative + count >= half:
            lower_bound, width = AGE_BANDS[i]
            # Linear interpolation within the median band. count is non-zero
            # here: the running total was still below half before this band.
            return lower_bound + ((half - cumulative) / count) * width
        cumulative += count

    return float("nan")
|
||||
|
||||
|
||||
def download_and_convert(output_path: Path) -> None:
    """Download the TS007A age-band counts and write per-LSOA median ages.

    Pages through the NOMIS CSV endpoint, keeps geography codes starting with
    "E", pivots to one row per LSOA and writes a parquet file with the
    columns ``lsoa21`` and ``median_age`` (Float32, rounded to one decimal).

    Args:
        output_path: Destination parquet file; parent directories are created.

    Raises:
        RuntimeError: If the API returns no rows at all.
        ValueError: If the number of age-band columns does not match
            ``AGE_BANDS``, which would make the interpolation bounds wrong.
        httpx.HTTPStatusError: If any page request returns an error status.
    """
    print("Downloading Census 2021 age by five-year bands from NOMIS...")
    frames = []
    offset = 0
    while True:
        url = f"{BASE_URL}&recordoffset={offset}"
        response = httpx.get(url, follow_redirects=True, timeout=120)
        response.raise_for_status()
        if len(response.content) == 0:
            break
        chunk = pl.read_csv(BytesIO(response.content))
        if chunk.height == 0:
            break
        frames.append(chunk)
        print(f" Fetched {chunk.height} rows (offset={offset})")
        # A short page means there is nothing left to fetch.
        if chunk.height < PAGE_SIZE:
            break
        offset += PAGE_SIZE

    if not frames:
        raise RuntimeError("NOMIS returned no rows for the TS007A age-band query")

    df = pl.concat(frames)
    print(f"Total rows: {df.height}")

    # Filter to England only
    df = df.filter(pl.col("GEOGRAPHY_CODE").str.starts_with("E"))

    # Pivot: one row per LSOA, columns = age band names, values = counts
    pivoted = df.pivot(
        on="C2021_AGE_19_NAME",
        index="GEOGRAPHY_CODE",
        values="OBS_VALUE",
    )

    # Extract age band columns in order and compute median
    # NOMIS returns band names like "Aged 0 to 4 years", "Aged 85 years and over"
    band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]
    # Sort by the number that follows "Aged" in each band name
    band_cols.sort(key=lambda c: int(c.split()[1]))

    if len(band_cols) != len(AGE_BANDS):
        raise ValueError(
            f"Expected {len(AGE_BANDS)} age bands, got {len(band_cols)}: {band_cols}"
        )

    print(f"Age bands found: {len(band_cols)}")
    print(f" First: {band_cols[0]}")
    print(f" Last: {band_cols[-1]}")

    # Compute median age per LSOA
    rows = pivoted.select("GEOGRAPHY_CODE", *band_cols).to_dicts()
    medians = []
    for row in rows:
        counts = [row[col] for col in band_cols]
        median = compute_median_age(counts)
        medians.append({"lsoa21": row["GEOGRAPHY_CODE"], "median_age": round(median, 1)})

    result = pl.DataFrame(medians).with_columns(
        pl.col("median_age").cast(pl.Float32),
    )

    print(f"England LSOAs: {result.height}")
    print(f"Median age range: {result['median_age'].min()} - {result['median_age'].max()}")
    print(f"Mean of medians: {result['median_age'].mean():.1f}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse ``--output`` from the command line and build the median-age parquet there."""
    cli = argparse.ArgumentParser(
        description="Download Census 2021 median age by LSOA"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    destination = cli.parse_args().output
    download_and_convert(destination)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
218
pipeline/download/os_greenspace.py
Normal file
218
pipeline/download/os_greenspace.py
Normal file
|
|
@ -0,0 +1,218 @@
|
|||
"""Download OS Open Greenspace and extract access points.
|
||||
|
||||
Downloads the OS Open Greenspace dataset as ESRI Shapefile and extracts
|
||||
access point locations (park entrances). Each access point is tagged with
|
||||
its parent site's function type (e.g. Public Park Or Garden). Sites without
|
||||
access points fall back to polygon centroids.
|
||||
|
||||
Using access points rather than polygon centroids gives much more accurate
|
||||
distance calculations — a property next to Hyde Park won't show 400m just
|
||||
because the centroid is in the middle of the park.
|
||||
|
||||
Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
|
||||
License: Open Government Licence v3.0
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import shapefile as shp
|
||||
from pyproj import Transformer
|
||||
from shapely.geometry import shape as to_shapely
|
||||
|
||||
from pipeline.utils.download import download, extract_zip
|
||||
|
||||
URL = "https://api.os.uk/downloads/v1/products/OpenGreenspace/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect"
|
||||
|
||||
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
||||
|
||||
|
||||
def _find_field(field_names: list[str], *needles: str) -> int | None:
|
||||
"""Find the index of the first field whose lowercased name contains any needle."""
|
||||
for i, name in enumerate(field_names):
|
||||
lower = name.lower()
|
||||
for needle in needles:
|
||||
if needle in lower:
|
||||
return i
|
||||
return None
|
||||
|
||||
|
||||
def _read_site_functions(shp_path: Path) -> dict[str, str]:
    """Build a mapping from site ID → function type from the GreenspaceSite shapefile.

    Raises:
        ValueError: If the shapefile has no field matching "id" or "funct".
    """
    # Context manager so the shapefile's handles are closed once we are done.
    with shp.Reader(str(shp_path), encoding="latin-1") as reader:
        # reader.fields[0] is skipped; the remaining entries line up with record values.
        field_names = [f[0] for f in reader.fields[1:]]

        id_idx = _find_field(field_names, "id")
        func_idx = _find_field(field_names, "funct")
        if id_idx is None or func_idx is None:
            raise ValueError(f"Missing id/function fields. Available: {field_names}")

        site_funcs = {rec[id_idx]: rec[func_idx] for rec in reader.iterRecords()}

    print(f" Loaded {len(site_funcs):,} site function mappings")
    return site_funcs
|
||||
|
||||
|
||||
def _read_access_points(
    shp_path: Path, site_funcs: dict[str, str]
) -> tuple[list[float], list[float], list[str]]:
    """Read access points, tagging each with its parent site's function.

    Args:
        shp_path: Path to the AccessPoint shapefile.
        site_funcs: Site ID → function mapping used to label each point.

    Returns:
        Three parallel lists ``(lats, lngs, categories)`` in WGS84.

    Raises:
        ValueError: If no field linking a point to its parent site is found.
    """
    lats: list[float] = []
    lngs: list[float] = []
    categories: list[str] = []
    skipped = 0
    unusable = 0

    # Context manager so the shapefile's handles are closed once we are done.
    with shp.Reader(str(shp_path), encoding="latin-1") as reader:
        field_names = [f[0] for f in reader.fields[1:]]

        # The access point shapefile has a reference field linking to the parent site
        ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
        if ref_idx is None:
            raise ValueError(
                f"No site reference field found in access points. Available: {field_names}"
            )

        for sr in reader.shapeRecords():
            func = site_funcs.get(sr.record[ref_idx])
            if func is None:
                skipped += 1
                continue

            # Best effort: a point with empty or unparseable geometry is
            # dropped, but counted so the loss is visible in the log.
            try:
                geom = to_shapely(sr.shape.__geo_interface__)
                if geom.is_empty:
                    unusable += 1
                    continue
                lng, lat = _to_wgs84.transform(geom.x, geom.y)
            except Exception:
                unusable += 1
                continue

            lats.append(lat)
            lngs.append(lng)
            categories.append(func)

    if skipped:
        print(f" Skipped {skipped:,} access points with unknown site ID")
    if unusable:
        print(f" Dropped {unusable:,} access points with empty or invalid geometry")

    return lats, lngs, categories
|
||||
|
||||
|
||||
def _read_site_centroids(
    shp_path: Path, site_funcs: dict[str, str], covered_ids: set[str]
) -> tuple[list[float], list[float], list[str]]:
    """Read polygon centroids for sites that have no access points (fallback).

    Args:
        shp_path: Path to the GreenspaceSite shapefile.
        site_funcs: Accepted for signature symmetry; the function type is
            read from each record directly.
        covered_ids: Site IDs to leave out of the result.

    Returns:
        Three parallel lists ``(lats, lngs, categories)`` in WGS84. All three
        are empty when the id/function fields cannot be found.
    """
    lats: list[float] = []
    lngs: list[float] = []
    categories: list[str] = []

    # Context manager so the shapefile's handles are closed once we are done.
    with shp.Reader(str(shp_path), encoding="latin-1") as reader:
        field_names = [f[0] for f in reader.fields[1:]]
        id_idx = _find_field(field_names, "id")
        func_idx = _find_field(field_names, "funct")
        if id_idx is None or func_idx is None:
            return [], [], []

        for sr in reader.shapeRecords():
            if sr.record[id_idx] in covered_ids:
                continue

            func = sr.record[func_idx]
            # Best effort: empty, invalid or unparseable polygons are skipped.
            try:
                geom = to_shapely(sr.shape.__geo_interface__)
                if geom.is_empty or not geom.is_valid:
                    continue
                centroid = geom.centroid
                lng, lat = _to_wgs84.transform(centroid.x, centroid.y)
            except Exception:
                continue

            lats.append(lat)
            lngs.append(lng)
            categories.append(func)

    return lats, lngs, categories
|
||||
|
||||
|
||||
def download_greenspace(output: Path) -> None:
    """Download OS Open Greenspace and write its points of interest to parquet.

    Access points are the primary source. Sites that no access point refers
    to contribute their polygon centroid instead. The output has the columns
    ``lat``, ``lng`` and ``category``; a per-category count is printed at the
    end.

    Args:
        output: Destination parquet file; parent directories are created.

    Raises:
        FileNotFoundError: If the archive lacks a GreenspaceSite or
            AccessPoint shapefile.
    """
    output.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory() as cache_dir:
        zip_path = Path(cache_dir) / "greenspace.zip"
        extract_dir = Path(cache_dir) / "extracted"

        download(URL, zip_path, timeout=300)
        extract_zip(zip_path, extract_dir)

        # Find both shapefiles
        site_shps = list(extract_dir.rglob("*GreenspaceSite*.shp"))
        access_shps = list(extract_dir.rglob("*AccessPoint*.shp"))

        if not site_shps:
            raise FileNotFoundError("No GreenspaceSite shapefile found")
        if not access_shps:
            raise FileNotFoundError("No AccessPoint shapefile found")

        # Step 1: Build site ID → function mapping
        print(f"Reading {site_shps[0].name} for function types...")
        site_funcs = _read_site_functions(site_shps[0])

        # Step 2: Read access points (primary — park entrances)
        print(f"Reading {access_shps[0].name}...")
        ap_lats, ap_lngs, ap_cats = _read_access_points(access_shps[0], site_funcs)
        print(f" {len(ap_lats):,} access points loaded")

        # Step 3: Fall back to centroids for sites without any access points.
        # Collect every site ID referenced by an access-point record; the
        # reader is closed as soon as the IDs have been gathered.
        covered_ids = set()
        with shp.Reader(str(access_shps[0]), encoding="latin-1") as reader:
            field_names = [f[0] for f in reader.fields[1:]]
            ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
            if ref_idx is not None:
                covered_ids.update(rec[ref_idx] for rec in reader.iterRecords())

        print("Adding centroids for sites without access points...")
        fb_lats, fb_lngs, fb_cats = _read_site_centroids(
            site_shps[0], site_funcs, covered_ids
        )
        print(f" {len(fb_lats):,} centroid fallbacks added")

    lats = ap_lats + fb_lats
    lngs = ap_lngs + fb_lngs
    categories = ap_cats + fb_cats

    df = pl.DataFrame(
        {
            "lat": np.array(lats, dtype=np.float64),
            "lng": np.array(lngs, dtype=np.float64),
            "category": categories,
        }
    )

    df.write_parquet(output)
    size_mb = output.stat().st_size / (1024 * 1024)
    print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} points)")

    counts = df.group_by("category").len().sort("len", descending=True)
    for row in counts.iter_rows(named=True):
        print(f" {row['category']}: {row['len']:,}")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse ``--output`` from the command line and build the greenspace parquet there."""
    cli = argparse.ArgumentParser(
        description="Download OS Open Greenspace access points"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    destination = cli.parse_args().output
    download_greenspace(destination)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -12,7 +12,7 @@ from io import BytesIO
|
|||
from pathlib import Path
|
||||
|
||||
PROTOMAPS_BASE = "https://build.protomaps.com"
|
||||
UK_BBOX = "-10.5,49.5,2.5,61"
|
||||
UK_BBOX = "-10.5,49,5,61"
|
||||
MAX_AGE_DAYS = 14
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@ def main():
|
|||
"NUMBER_HABITABLE_ROOMS",
|
||||
"FLOOR_HEIGHT",
|
||||
"CONSTRUCTION_AGE_BAND",
|
||||
"TENURE",
|
||||
)
|
||||
.filter(pl.col("epc_address").is_not_null())
|
||||
.with_columns(
|
||||
|
|
@ -52,6 +53,7 @@ def main():
|
|||
epc_base.sort("INSPECTION_DATE", descending=True)
|
||||
.group_by("epc_address", "POSTCODE")
|
||||
.first()
|
||||
.drop("TENURE")
|
||||
)
|
||||
|
||||
# Events fork: detect renovation events between consecutive certificates
|
||||
|
|
@ -124,11 +126,29 @@ def main():
|
|||
print(f"Renovation events: {events.height} properties with events")
|
||||
print(event_counts)
|
||||
|
||||
# Left-join events back onto dedup EPC
|
||||
# Social tenure fork: flag properties that were ever social housing
|
||||
social_tenure = (
|
||||
epc_base.filter(
|
||||
pl.col("TENURE").str.to_lowercase().str.contains("social")
|
||||
)
|
||||
.select("epc_address", "POSTCODE")
|
||||
.unique()
|
||||
.with_columns(pl.lit("Yes").alias("was_council_house"))
|
||||
.collect()
|
||||
)
|
||||
print(f"Former council houses (EPC social tenure): {social_tenure.height}")
|
||||
|
||||
# Left-join events and social tenure back onto dedup EPC
|
||||
epc = epc.join(
|
||||
events.lazy(),
|
||||
on=["epc_address", "POSTCODE"],
|
||||
how="left",
|
||||
).join(
|
||||
social_tenure.lazy(),
|
||||
on=["epc_address", "POSTCODE"],
|
||||
how="left",
|
||||
).with_columns(
|
||||
pl.col("was_council_house").fill_null("No"),
|
||||
)
|
||||
|
||||
print("EPC dataset")
|
||||
|
|
|
|||
|
|
@ -52,20 +52,17 @@ _AREA_COLUMNS = [
|
|||
"Number of parks within 2km",
|
||||
"Train or tube stations within 1km",
|
||||
"Distance to nearest train or tube station (km)",
|
||||
"Distance to nearest park (km)",
|
||||
# Environment
|
||||
"Noise (dB)",
|
||||
"Max available download speed (Mbps)",
|
||||
# Schools
|
||||
"Good+ primary schools within 5km",
|
||||
"Good+ secondary schools within 5km",
|
||||
# GeoSure
|
||||
"Environmental risk",
|
||||
"Collapsible deposits risk",
|
||||
"Compressible ground risk",
|
||||
"Landslide risk",
|
||||
"Running sand risk",
|
||||
"Shrink-swell risk",
|
||||
"Soluble rocks risk",
|
||||
"Good+ primary schools within 2km",
|
||||
"Good+ secondary schools within 2km",
|
||||
# Demographics
|
||||
"Median age",
|
||||
]
|
||||
|
||||
|
||||
|
|
@ -79,9 +76,9 @@ def _build(
|
|||
noise_path: Path,
|
||||
school_proximity_path: Path,
|
||||
broadband_path: Path,
|
||||
geosure_path: Path,
|
||||
rental_prices_path: Path,
|
||||
lsoa_population_path: Path,
|
||||
median_age_path: Path,
|
||||
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
||||
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
|
||||
|
||||
|
|
@ -194,6 +191,9 @@ def _build(
|
|||
.alias("minor_crime_per_1k"),
|
||||
).drop("population")
|
||||
|
||||
median_age = pl.scan_parquet(median_age_path)
|
||||
wide = wide.join(median_age, on="lsoa21", how="left")
|
||||
|
||||
poi_counts = pl.scan_parquet(poi_proximity_path)
|
||||
wide = wide.join(poi_counts, on="postcode", how="left")
|
||||
|
||||
|
|
@ -239,9 +239,6 @@ def _build(
|
|||
)
|
||||
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
|
||||
|
||||
geosure = pl.scan_parquet(geosure_path)
|
||||
wide = wide.join(geosure, on="postcode", how="left")
|
||||
|
||||
# Derive property_type: prefer EPC data, fall back to price-paid.
|
||||
# For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
|
||||
bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
|
||||
|
|
@ -330,25 +327,23 @@ def _build(
|
|||
"parks_2km": "Number of parks within 2km",
|
||||
"train_tube_1km": "Train or tube stations within 1km",
|
||||
"train_tube_nearest_km": "Distance to nearest train or tube station (km)",
|
||||
"parks_nearest_km": "Distance to nearest park (km)",
|
||||
"latest_price": "Last known price",
|
||||
"number_habitable_rooms": "Number of bedrooms & living rooms",
|
||||
"noise_lden_db": "Noise (dB)",
|
||||
"good_primary_5km": "Good+ primary schools within 5km",
|
||||
"good_secondary_5km": "Good+ secondary schools within 5km",
|
||||
"good_primary_2km": "Good+ primary schools within 2km",
|
||||
"good_secondary_2km": "Good+ secondary schools within 2km",
|
||||
"max_download_speed": "Max available download speed (Mbps)",
|
||||
"serious_crime_avg_yr": "Serious crime (avg/yr)",
|
||||
"minor_crime_avg_yr": "Minor crime (avg/yr)",
|
||||
"serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
|
||||
"minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
|
||||
"environmental_risk": "Environmental risk",
|
||||
"collapsible_deposits_risk": "Collapsible deposits risk",
|
||||
"compressible_ground_risk": "Compressible ground risk",
|
||||
"landslide_risk": "Landslide risk",
|
||||
"running_sand_risk": "Running sand risk",
|
||||
"shrink_swell_risk": "Shrink-swell risk",
|
||||
"soluble_rocks_risk": "Soluble rocks risk",
|
||||
"median_monthly_rent": "Estimated monthly rent",
|
||||
"floor_height": "Interior height (m)",
|
||||
"was_council_house": "Former council house",
|
||||
"median_age": "Median age",
|
||||
}
|
||||
)
|
||||
)
|
||||
|
|
@ -416,12 +411,6 @@ def main():
|
|||
required=True,
|
||||
help="Broadband performance by output area parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--geosure",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="GeoSure ground stability parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rental-prices",
|
||||
type=Path,
|
||||
|
|
@ -434,6 +423,12 @@ def main():
|
|||
required=True,
|
||||
help="Census 2021 population by LSOA parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--median-age",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Census 2021 median age by LSOA parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-postcodes",
|
||||
type=Path,
|
||||
|
|
@ -458,9 +453,9 @@ def main():
|
|||
noise_path=args.noise,
|
||||
school_proximity_path=args.school_proximity,
|
||||
broadband_path=args.broadband,
|
||||
geosure_path=args.geosure,
|
||||
rental_prices_path=args.rental_prices,
|
||||
lsoa_population_path=args.lsoa_population,
|
||||
median_age_path=args.median_age,
|
||||
)
|
||||
|
||||
print(f"\nPostcode columns: {postcode_df.columns}")
|
||||
|
|
|
|||
|
|
@ -13,7 +13,6 @@ from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_
|
|||
POI_GROUPS_2KM = {
|
||||
"restaurants": ["Restaurant", "Fast Food"],
|
||||
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
|
||||
"parks": ["Park"],
|
||||
}
|
||||
|
||||
# Train/tube stations counted at 1km radius
|
||||
|
|
@ -21,11 +20,18 @@ TRAIN_TUBE_GROUP = {
|
|||
"train_tube": ["Metro or Tram stop", "Rail station"],
|
||||
}
|
||||
|
||||
# Groups for which to compute distance to nearest POI
|
||||
# Groups for which to compute distance to nearest POI (from filtered POIs)
|
||||
DISTANCE_GROUPS = {
|
||||
"train_tube": ["Metro or Tram stop", "Rail station"],
|
||||
}
|
||||
|
||||
# OS Open Greenspace function types used for park counts and distance calculation.
|
||||
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
|
||||
# of green spaces that are only mapped as polygons in OSM.
|
||||
GREENSPACE_PARK_FUNCTIONS = {
|
||||
"parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
|
||||
}
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
|
|
@ -37,6 +43,12 @@ def main():
|
|||
parser.add_argument(
|
||||
"--pois", type=Path, required=True, help="Filtered POIs parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--greenspace",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="OS Open Greenspace centroids parquet",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet path"
|
||||
)
|
||||
|
|
@ -60,11 +72,25 @@ def main():
|
|||
postcodes, pois, groups=TRAIN_TUBE_GROUP, radius_km=1
|
||||
)
|
||||
|
||||
# Distance to nearest train/tube station
|
||||
# Distance to nearest train/tube station (from filtered POIs)
|
||||
distances = min_distance_per_postcode(postcodes, pois, groups=DISTANCE_GROUPS)
|
||||
|
||||
# Park counts and distances from OS Open Greenspace
|
||||
greenspace = pl.read_parquet(args.greenspace)
|
||||
park_counts_2km = count_pois_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
|
||||
)
|
||||
park_distances = min_distance_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS
|
||||
)
|
||||
|
||||
# Join all results on postcode
|
||||
result = counts_2km.join(counts_1km, on="postcode").join(distances, on="postcode")
|
||||
result = (
|
||||
counts_2km.join(counts_1km, on="postcode")
|
||||
.join(distances, on="postcode")
|
||||
.join(park_counts_2km, on="postcode")
|
||||
.join(park_distances, on="postcode")
|
||||
)
|
||||
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
|
|
|
|||
|
|
@ -60,9 +60,14 @@ def main():
|
|||
# Load all postcodes for proximity counting
|
||||
postcodes = arcgis.rename({"lng": "lon"})
|
||||
|
||||
result = count_pois_per_postcode(
|
||||
counts_5km = count_pois_per_postcode(
|
||||
postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
|
||||
)
|
||||
counts_2km = count_pois_per_postcode(
|
||||
postcodes, schools, radius_km=2, groups=SCHOOL_GROUPS
|
||||
)
|
||||
|
||||
result = counts_5km.join(counts_2km, on="postcode")
|
||||
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
|
|
|
|||
|
|
@ -1,130 +0,0 @@
|
|||
"""Spatial-join GeoSure 5km hex grid risk data to postcode centroids.
|
||||
|
||||
Reads six ESRI Shapefiles (one per hazard type), converts polygons from
|
||||
BNG to WGS84, and queries postcode centroids against them using an STRtree.
|
||||
Outputs a postcode-level parquet with six risk columns plus an overall max.
|
||||
|
||||
Source: Ordnance Survey GeoSure (Open Data)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import shapefile as shp
|
||||
from pyproj import Transformer
|
||||
from shapely import STRtree, points
|
||||
from shapely.geometry import shape as to_shapely
|
||||
|
||||
_GEOSURE_RISKS = [
|
||||
("CollapsibleDeposits", "collapsible_deposits_risk"),
|
||||
("CompressibleGround", "compressible_ground_risk"),
|
||||
("Landslides", "landslide_risk"),
|
||||
("RunningSand", "running_sand_risk"),
|
||||
("ShrinkSwell", "shrink_swell_risk"),
|
||||
("SolubleRocks", "soluble_rocks_risk"),
|
||||
]
|
||||
|
||||
|
||||
def transform(geosure_dir: Path, arcgis_path: Path) -> pl.DataFrame:
    """Attach GeoSure hazard classes to every postcode centroid.

    For each (key, column) pair in ``_GEOSURE_RISKS`` the first shapefile found
    under ``geosure_dir`` matching ``*_GS_<key>_*.shp`` is read, its polygon
    rings are reprojected from EPSG:27700 to EPSG:4326, and each postcode point
    is given the highest ``CLASS`` value among the polygons it intersects.

    Args:
        geosure_dir: Directory searched recursively for the hazard shapefiles.
        arcgis_path: Parquet file providing the ``pcds``, ``lat`` and ``long``
            columns.

    Returns:
        A DataFrame with one row per input postcode: a ``postcode`` column, one
        string column per hazard, and ``environmental_risk`` (the row-wise
        maximum of the hazard classes). Classes 1/2/3 are rendered as
        "Low"/"Moderate"/"Significant"; every other value becomes null.
    """
    bng_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)

    print("Loading postcode centroids for GeoSure join...")
    centroids = pl.read_parquet(arcgis_path, columns=["pcds", "lat", "long"])
    # Points are (lon, lat) to match the always_xy axis order of the polygons.
    pts = points(
        np.column_stack([centroids["long"].to_numpy(), centroids["lat"].to_numpy()])
    )

    result = pl.DataFrame({"postcode": centroids["pcds"]})
    risk_cols = []

    for risk_key, col_name in _GEOSURE_RISKS:
        risk_cols.append(col_name)

        shp_files = list(geosure_dir.rglob(f"*_GS_{risk_key}_*.shp"))
        if not shp_files:
            # Keep the column present (all null) so the output schema is stable.
            print(f" Warning: No shapefile found for {risk_key}")
            result = result.with_columns(pl.lit(None).cast(pl.UInt8).alias(col_name))
            continue

        print(f" Reading {shp_files[0].name}...")
        reader = shp.Reader(str(shp_files[0]))

        hex_polys = []
        hex_classes = []
        for record in reader.shapeRecords():
            geo = record.shape.__geo_interface__
            reprojected = []
            for ring in geo["coordinates"]:
                eastings, northings = zip(*ring)
                ring_lons, ring_lats = bng_to_wgs84.transform(
                    list(eastings), list(northings)
                )
                reprojected.append(list(zip(ring_lons, ring_lats)))
            hex_polys.append(to_shapely({**geo, "coordinates": reprojected}))
            hex_classes.append(int(record.record["CLASS"]))

        classes_arr = np.array(hex_classes, dtype=np.uint8)

        print(f" Querying {len(pts)} postcodes against {len(hex_polys)} hexagons...")
        pt_idx, hex_idx = STRtree(hex_polys).query(pts, predicate="intersects")

        # Unmatched points stay at 0; points hitting several polygons keep the
        # largest class because maximum.at accumulates over repeated indices.
        risk_values = np.zeros(len(pts), dtype=np.uint8)
        np.maximum.at(risk_values, pt_idx, classes_arr[hex_idx])

        result = result.with_columns(pl.Series(col_name, risk_values))

    # Overall environmental risk = max across all hazard columns.
    result = result.with_columns(
        pl.max_horizontal(*risk_cols).alias("environmental_risk")
    )

    # 0 (no match) -> null, 1/2/3 -> Low/Moderate/Significant.
    label_map = {1: "Low", 2: "Moderate", 3: "Significant"}
    result = result.with_columns(
        [
            pl.col(name)
            .replace_strict(label_map, default=None, return_dtype=pl.Utf8)
            .alias(name)
            for name in risk_cols + ["environmental_risk"]
        ]
    )

    print(f" GeoSure join complete: {result.height} postcodes")
    return result
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the GeoSure postcode join.

    Parses ``--geosure``, ``--arcgis`` and ``--output`` (all required paths),
    runs ``transform`` and writes the result as a zstd-compressed parquet file,
    creating the output's parent directory if needed.
    """
    parser = argparse.ArgumentParser(
        description="Spatial-join GeoSure ground stability data to postcode centroids"
    )
    # All three options share the same shape: a required Path with a help text.
    cli_options = (
        ("--geosure", "GeoSure shapefile directory"),
        ("--arcgis", "ArcGIS postcode data parquet (for lat/lon coordinates)"),
        ("--output", "Output parquet file path"),
    )
    for flag, help_text in cli_options:
        parser.add_argument(flag, type=Path, required=True, help=help_text)
    args = parser.parse_args()

    joined = transform(args.geosure, args.arcgis)

    destination = args.output
    destination.parent.mkdir(parents=True, exist_ok=True)
    joined.write_parquet(destination, compression="zstd")

    size_mb = destination.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -3,6 +3,8 @@ package propertymap;
|
|||
import com.conveyal.r5.transit.TransportNetwork;
|
||||
import org.duckdb.DuckDBConnection;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.DirectoryStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
|
|
@ -132,11 +134,15 @@ public class App {
|
|||
Path modeDir = outDir.resolve(mode);
|
||||
Files.createDirectories(modeDir);
|
||||
|
||||
// Scan existing slugs once (O(directory)) instead of per-origin stat calls.
|
||||
// This matches by slug regardless of numeric prefix, so re-indexed places.parquet
|
||||
// won't cause duplicate computation.
|
||||
Set<String> existingSlugs = skipCompleted ? scanExistingSlugs(modeDir) : Set.of();
|
||||
|
||||
List<Integer> remaining = new ArrayList<>();
|
||||
for (int idx : originIndices) {
|
||||
if (skipCompleted) {
|
||||
Path f = modeDir.resolve(originFilename(idx, originNames[idx]));
|
||||
if (Files.exists(f) && Files.size(f) > 0) continue;
|
||||
if (skipCompleted && existingSlugs.contains(slugFromName(originNames[idx]))) {
|
||||
continue;
|
||||
}
|
||||
remaining.add(idx);
|
||||
}
|
||||
|
|
@ -255,10 +261,38 @@ public class App {
|
|||
|
||||
/** Build a filename from index + place name (index prefix prevents collisions after sanitization). */
|
||||
private static String originFilename(int index, String name) {
|
||||
String safe = name.toLowerCase()
|
||||
return String.format("%06d-%s.parquet", index, slugFromName(name));
|
||||
}
|
||||
|
||||
/** Slugify a place name: lowercase, strip non-alphanumeric (except spaces/hyphens), collapse whitespace. */
|
||||
private static String slugFromName(String name) {
|
||||
return name.toLowerCase()
|
||||
.replaceAll("[^a-z0-9 -]", "")
|
||||
.replaceAll("\\s+", "-");
|
||||
return String.format("%06d-%s.parquet", index, safe);
|
||||
}
|
||||
|
||||
/**
|
||||
* Scan a mode directory for existing non-empty parquet files, returning the set of slugs
|
||||
* (filenames with numeric prefix stripped). This allows resume to work across places.parquet
|
||||
* rebuilds where indices change but slugs stay the same.
|
||||
*/
|
||||
private static Set<String> scanExistingSlugs(Path modeDir) throws IOException {
|
||||
Set<String> slugs = new HashSet<>();
|
||||
if (!Files.isDirectory(modeDir)) return slugs;
|
||||
try (DirectoryStream<Path> stream = Files.newDirectoryStream(modeDir, "*.parquet")) {
|
||||
for (Path p : stream) {
|
||||
if (Files.size(p) > 0) {
|
||||
String stem = p.getFileName().toString().replace(".parquet", "");
|
||||
int dash = stem.indexOf('-');
|
||||
if (dash > 0 && stem.substring(0, dash).chars().allMatch(Character::isDigit)) {
|
||||
slugs.add(stem.substring(dash + 1));
|
||||
} else {
|
||||
slugs.add(stem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return slugs;
|
||||
}
|
||||
|
||||
private static String requiredArg(String[] args, String name) {
|
||||
|
|
|
|||
|
|
@ -46,6 +46,12 @@ export class ScreenshotCache {
|
|||
normalized.poi = pois.join(',');
|
||||
}
|
||||
|
||||
// Sort travel time entries
|
||||
const tt = params.getAll('tt').sort();
|
||||
if (tt.length > 0) {
|
||||
normalized.tt = tt.join(',');
|
||||
}
|
||||
|
||||
if (params.get('tab')) {
|
||||
normalized.tab = params.get('tab')!;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,8 +41,8 @@ app.get('/screenshot', async (req, res) => {
|
|||
qs.set(key, val);
|
||||
}
|
||||
}
|
||||
// Repeated params: filter, poi
|
||||
for (const key of ['filter', 'poi']) {
|
||||
// Repeated params: filter, poi, tt (travel time)
|
||||
for (const key of ['filter', 'poi', 'tt']) {
|
||||
const val = req.query[key];
|
||||
if (typeof val === 'string' && val) {
|
||||
qs.append(key, val);
|
||||
|
|
|
|||
|
|
@ -354,6 +354,45 @@ pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool)
|
|||
let mut p1 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 1.0);
|
||||
let mut p99 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 99.0);
|
||||
|
||||
// Iterative refinement for outlier-dominated distributions.
|
||||
// When extreme outliers (e.g. 317M sqm from web scraping) dominate the range,
|
||||
// the uniform histogram puts all real data in one bin, making percentile
|
||||
// estimation useless. Zoom into the estimated data region and recompute.
|
||||
let mut refined_counts = prelim_counts;
|
||||
let mut refined_count = count;
|
||||
let mut refined_min = min;
|
||||
let mut refined_max = max;
|
||||
for _ in 0..3 {
|
||||
let iqr = p99 - p1;
|
||||
if iqr <= 0.0 || (refined_max - refined_min) <= 5.0 * iqr {
|
||||
break;
|
||||
}
|
||||
let new_min = (p1 - iqr).max(min);
|
||||
let new_max = p99 + iqr;
|
||||
if new_max <= new_min {
|
||||
break;
|
||||
}
|
||||
let bin_width = (new_max - new_min) / HISTOGRAM_BINS as f32;
|
||||
let mut counts = vec![0u64; HISTOGRAM_BINS];
|
||||
let mut cnt = 0usize;
|
||||
for &value in vals {
|
||||
if value.is_finite() && value >= new_min && value <= new_max {
|
||||
let bin = ((value - new_min) / bin_width) as usize;
|
||||
counts[bin.min(HISTOGRAM_BINS - 1)] += 1;
|
||||
cnt += 1;
|
||||
}
|
||||
}
|
||||
if cnt == 0 {
|
||||
break;
|
||||
}
|
||||
p1 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 1.0);
|
||||
p99 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 99.0);
|
||||
refined_counts = counts;
|
||||
refined_count = cnt;
|
||||
refined_min = new_min;
|
||||
refined_max = new_max;
|
||||
}
|
||||
|
||||
// For integer-binned features, snap p1/p99 to integer boundaries
|
||||
// so each middle bin is exactly 1 unit wide.
|
||||
if integer_bins {
|
||||
|
|
@ -411,24 +450,34 @@ pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool)
|
|||
}
|
||||
|
||||
let histogram = Histogram {
|
||||
min,
|
||||
max,
|
||||
min: refined_min,
|
||||
max: refined_max,
|
||||
p1,
|
||||
p99,
|
||||
counts,
|
||||
};
|
||||
|
||||
// Compute slider bounds
|
||||
// Compute slider bounds (use refined histogram for accurate percentiles)
|
||||
let (slider_min, slider_max) = match bounds {
|
||||
Bounds::Fixed {
|
||||
min: fmin,
|
||||
max: fmax,
|
||||
} => (*fmin, *fmax),
|
||||
Bounds::Percentile { low, high } => {
|
||||
let p_low =
|
||||
percentile_from_uniform_histogram(count, min, max, &prelim_counts, *low as f32);
|
||||
let p_high =
|
||||
percentile_from_uniform_histogram(count, min, max, &prelim_counts, *high as f32);
|
||||
let p_low = percentile_from_uniform_histogram(
|
||||
refined_count,
|
||||
refined_min,
|
||||
refined_max,
|
||||
&refined_counts,
|
||||
*low as f32,
|
||||
);
|
||||
let p_high = percentile_from_uniform_histogram(
|
||||
refined_count,
|
||||
refined_min,
|
||||
refined_max,
|
||||
&refined_counts,
|
||||
*high as f32,
|
||||
);
|
||||
(p_low, p_high)
|
||||
}
|
||||
};
|
||||
|
|
@ -1402,4 +1451,47 @@ mod tests {
|
|||
assert_eq!(stats.histogram.max, 30.0);
|
||||
assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 3);
|
||||
}
|
||||
|
||||
#[test]
fn extreme_outlier_does_not_destroy_quantization() {
    // Floor-area-like sample: 10k values cycling through 50..=199 sqm, followed
    // by a single 317M value standing in for a bad web-scraped record.
    let mut data: Vec<f32> = Vec::with_capacity(10_001);
    for i in 0..10_000 {
        data.push(50.0 + (i % 150) as f32);
    }
    data.push(317_000_000.0);

    let stats = compute_feature_stats(&data, &make_percentile_bounds(0.0, 98.0), false);

    // The reported histogram range must have been narrowed well below the outlier.
    let hist_max = stats.histogram.max;
    assert!(
        hist_max < 1_000_000.0,
        "histogram.max should be refined, got {}",
        hist_max,
    );

    // p1 has to land in the real data, not somewhere in the millions.
    let p1 = stats.histogram.p1;
    assert!(p1 < 100.0, "p1 should be near real data, got {}", p1,);

    // The slider's lower bound follows the real data as well.
    let slider_min = stats.slider_min;
    assert!(
        slider_min < 100.0,
        "slider_min should be near real data, got {}",
        slider_min,
    );

    // Quantizing against histogram.min/max must leave a usable range.
    let qmin = stats.histogram.min;
    let qrange = stats.histogram.max - stats.histogram.min;
    assert!(qrange > 0.0 && qrange < 1_000_000.0);

    // A typical floor area (100 sqm) should be distinguishable from the minimum.
    let normalized = (100.0 - qmin) / qrange;
    let encoded = (normalized * QUANT_SCALE).round() as u16;
    assert!(
        encoded > 100,
        "100 sqm should encode to a meaningful u16 value, got {}",
        encoded,
    );
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,9 +68,9 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
|
|||
features: &[
|
||||
FeatureConfig {
|
||||
name: "Last known price",
|
||||
bounds: Bounds::Percentile {
|
||||
low: 0.0,
|
||||
high: 98.0,
|
||||
bounds: Bounds::Fixed {
|
||||
min: 0.0,
|
||||
max: 2_500_000.0,
|
||||
},
|
||||
step: 10000.0,
|
||||
description: "Most recent sale price from the Land Registry",
|
||||
|
|
@ -79,15 +79,15 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
|
|||
prefix: "£",
|
||||
suffix: "",
|
||||
raw: false,
|
||||
absolute: false,
|
||||
absolute: true,
|
||||
modes: &["historical"],
|
||||
linked: "",
|
||||
},
|
||||
FeatureConfig {
|
||||
name: "Estimated current price",
|
||||
bounds: Bounds::Percentile {
|
||||
low: 0.0,
|
||||
high: 98.0,
|
||||
bounds: Bounds::Fixed {
|
||||
min: 0.0,
|
||||
max: 2_500_000.0,
|
||||
},
|
||||
step: 10000.0,
|
||||
description: "Inflation-adjusted estimate of the current property value",
|
||||
|
|
@ -96,7 +96,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
|
|||
prefix: "£",
|
||||
suffix: "",
|
||||
raw: false,
|
||||
absolute: false,
|
||||
absolute: true,
|
||||
modes: &["historical"],
|
||||
linked: "Asking price",
|
||||
},
|
||||
|
|
@ -252,9 +252,9 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
|
|||
},
|
||||
FeatureConfig {
|
||||
name: "Asking price",
|
||||
bounds: Bounds::Percentile {
|
||||
low: 0.0,
|
||||
high: 98.0,
|
||||
bounds: Bounds::Fixed {
|
||||
min: 0.0,
|
||||
max: 2_500_000.0,
|
||||
},
|
||||
step: 10000.0,
|
||||
description: "Listed asking price for properties currently for sale",
|
||||
|
|
@ -263,7 +263,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
|
|||
prefix: "£",
|
||||
suffix: "",
|
||||
raw: false,
|
||||
absolute: false,
|
||||
absolute: true,
|
||||
modes: &["buy"],
|
||||
linked: "Estimated current price",
|
||||
},
|
||||
|
|
@ -430,6 +430,40 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
|
|||
modes: &[],
|
||||
linked: "",
|
||||
},
|
||||
FeatureConfig {
|
||||
name: "Good+ primary schools within 2km",
|
||||
bounds: Bounds::Fixed {
|
||||
min: 0.0,
|
||||
max: 10.0,
|
||||
},
|
||||
step: 1.0,
|
||||
description: "Primary schools rated Good or Outstanding by Ofsted within walking distance",
|
||||
detail: "Number of state-funded primary schools within 2km that have a current Ofsted rating of Good or Outstanding. Based on the latest inspection outcomes dataset. Schools that have not yet been inspected are excluded.",
|
||||
source: "ofsted",
|
||||
prefix: "",
|
||||
suffix: "",
|
||||
raw: false,
|
||||
absolute: false,
|
||||
modes: &[],
|
||||
linked: "",
|
||||
},
|
||||
FeatureConfig {
|
||||
name: "Good+ secondary schools within 2km",
|
||||
bounds: Bounds::Fixed {
|
||||
min: 0.0,
|
||||
max: 5.0,
|
||||
},
|
||||
step: 1.0,
|
||||
description: "Secondary schools rated Good or Outstanding by Ofsted within walking distance",
|
||||
detail: "Number of state-funded secondary schools within 2km that have a current Ofsted rating of Good or Outstanding. Based on the latest inspection outcomes dataset. Schools that have not yet been inspected are excluded.",
|
||||
source: "ofsted",
|
||||
prefix: "",
|
||||
suffix: "",
|
||||
raw: false,
|
||||
absolute: false,
|
||||
modes: &[],
|
||||
linked: "",
|
||||
},
|
||||
],
|
||||
},
|
||||
FeatureGroup {
|
||||
|
|
@ -949,6 +983,23 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
|
|||
modes: &[],
|
||||
linked: "",
|
||||
},
|
||||
FeatureConfig {
|
||||
name: "Median age",
|
||||
bounds: Bounds::Percentile {
|
||||
low: 2.0,
|
||||
high: 98.0,
|
||||
},
|
||||
step: 0.5,
|
||||
description: "Median age of the local population",
|
||||
detail: "From the 2021 Census (TS007A). Median age of usual residents in the LSOA, computed by linear interpolation from five-year age band counts. Areas with younger populations tend to be urban, university towns, or have more families; older medians are typical in rural and coastal areas.",
|
||||
source: "census-2021",
|
||||
prefix: "",
|
||||
suffix: " years",
|
||||
raw: false,
|
||||
absolute: false,
|
||||
modes: &[],
|
||||
linked: "",
|
||||
},
|
||||
],
|
||||
},
|
||||
FeatureGroup {
|
||||
|
|
@ -996,8 +1047,8 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
|
|||
},
|
||||
step: 1.0,
|
||||
description: "Number of parks and green spaces within 2km",
|
||||
detail: "Count of parks, gardens, nature reserves, and other green spaces within a 2km radius of the property's postcode centroid. Derived from OpenStreetMap POI data.",
|
||||
source: "osm-pois",
|
||||
detail: "Count of public parks, gardens, playing fields, and play spaces with at least one entrance within a 2km radius of the property's postcode centroid. Derived from the OS Open Greenspace dataset (Ordnance Survey), using park entrance locations for accurate proximity matching.",
|
||||
source: "os-open-greenspace",
|
||||
prefix: "",
|
||||
suffix: "",
|
||||
raw: false,
|
||||
|
|
@ -1005,6 +1056,23 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
|
|||
modes: &[],
|
||||
linked: "",
|
||||
},
|
||||
FeatureConfig {
|
||||
name: "Distance to nearest park (km)",
|
||||
bounds: Bounds::Percentile {
|
||||
low: 2.0,
|
||||
high: 98.0,
|
||||
},
|
||||
step: 0.1,
|
||||
description: "Distance to the closest park or green space",
|
||||
detail: "Straight-line distance in kilometres from the property's postcode centroid to the nearest park entrance. Covers public parks, gardens, playing fields, and play spaces. Derived from the OS Open Greenspace dataset (Ordnance Survey), using access point locations rather than polygon centroids for accuracy — so properties bordering a large park correctly show a short distance.",
|
||||
source: "os-open-greenspace",
|
||||
prefix: "",
|
||||
suffix: " km",
|
||||
raw: false,
|
||||
absolute: false,
|
||||
modes: &[],
|
||||
linked: "",
|
||||
},
|
||||
],
|
||||
},
|
||||
FeatureGroup {
|
||||
|
|
@ -1056,6 +1124,13 @@ pub static ENUM_FEATURE_GROUPS: &[EnumFeatureGroup] = &[
|
|||
detail: "From HM Land Registry Price Paid data and EPC certificates. Detached, Semi-Detached, Terraced (includes all terrace sub-types), Flats/Maisonettes, or Other (bungalows, park homes, etc.).",
|
||||
source: "price-paid",
|
||||
},
|
||||
EnumFeatureConfig {
|
||||
name: "Former council house",
|
||||
order: Some(&["Yes", "No"]),
|
||||
description: "Whether the property was ever recorded as social housing",
|
||||
detail: "Derived from the TENURE field in Energy Performance Certificate data. If any EPC certificate for this property recorded the tenure as social rental, it indicates the property was council or housing-association stock at the time of that inspection. Properties that were later sold (e.g. via Right to Buy) retain this flag.",
|
||||
source: "epc",
|
||||
},
|
||||
EnumFeatureConfig {
|
||||
name: "Current energy rating",
|
||||
order: Some(&["A", "B", "C", "D", "E", "F", "G"]),
|
||||
|
|
@ -1082,55 +1157,6 @@ pub static ENUM_FEATURE_GROUPS: &[EnumFeatureGroup] = &[
|
|||
detail: "Maximum available fixed broadband download speed in Megabits per second, from Ofcom's Connected Nations 2025 report. Measured at Output Area level and represents the maximum speed available from any provider, not actual achieved speeds. Tiers: 10 = basic, 30 = superfast (SFBB), 100 = ultrafast 100Mbit, 300 = ultrafast (UFBB), 1000 = gigabit.",
|
||||
source: "broadband",
|
||||
},
|
||||
EnumFeatureConfig {
|
||||
name: "Environmental risk",
|
||||
order: Some(&["Low", "Moderate", "Significant"]),
|
||||
description: "Highest ground stability risk across all six hazard types",
|
||||
detail: "Overall ground stability risk for the area, taken as the maximum across all six GeoSure hazard categories (collapsible deposits, compressible ground, landslides, running sand, shrink-swell, and soluble rocks). From Ordnance Survey GeoSure data on a 5km hex grid.",
|
||||
source: "geosure",
|
||||
},
|
||||
EnumFeatureConfig {
|
||||
name: "Collapsible deposits risk",
|
||||
order: Some(&["Low", "Moderate", "Significant"]),
|
||||
description: "Risk of ground collapse from natural underground cavities",
|
||||
detail: "From OS GeoSure. Indicates the likelihood of ground collapse due to natural cavities formed by dissolution of soluble rocks or the collapse of old mines and natural pipes. Rated on a 5km hex grid across Great Britain.",
|
||||
source: "geosure",
|
||||
},
|
||||
EnumFeatureConfig {
|
||||
name: "Compressible ground risk",
|
||||
order: Some(&["Low", "Moderate", "Significant"]),
|
||||
description: "Risk of ground compression causing subsidence",
|
||||
detail: "From OS GeoSure. Indicates the potential for ground to compress under loading, which can cause gradual settlement or subsidence of buildings and infrastructure. Typically associated with soft clay, silt, or peat deposits.",
|
||||
source: "geosure",
|
||||
},
|
||||
EnumFeatureConfig {
|
||||
name: "Landslide risk",
|
||||
order: Some(&["Low", "Moderate", "Significant"]),
|
||||
description: "Risk of landslide or slope instability",
|
||||
detail: "From OS GeoSure. Indicates the susceptibility of the ground to landslides and slope instability. Based on slope angle, geology, and historical landslide records.",
|
||||
source: "geosure",
|
||||
},
|
||||
EnumFeatureConfig {
|
||||
name: "Running sand risk",
|
||||
order: Some(&["Low", "Moderate", "Significant"]),
|
||||
description: "Risk of sand becoming fluid when saturated",
|
||||
detail: "From OS GeoSure. Indicates the potential for fine-grained sand to behave like a fluid when saturated with water, which can affect excavations and foundations.",
|
||||
source: "geosure",
|
||||
},
|
||||
EnumFeatureConfig {
|
||||
name: "Shrink-swell risk",
|
||||
order: Some(&["Low", "Moderate", "Significant"]),
|
||||
description: "Risk of clay shrinking and swelling with moisture changes",
|
||||
detail: "From OS GeoSure. Indicates the potential for clay-rich soils to shrink when dry and swell when wet, causing ground movement that can damage buildings and infrastructure. One of the most common causes of subsidence in the UK.",
|
||||
source: "geosure",
|
||||
},
|
||||
EnumFeatureConfig {
|
||||
name: "Soluble rocks risk",
|
||||
order: Some(&["Low", "Moderate", "Significant"]),
|
||||
description: "Risk of sinkholes from dissolution of soluble rocks",
|
||||
detail: "From OS GeoSure. Indicates the potential for soluble rocks (limestone, chalk, gypsum) to dissolve, creating underground voids that can lead to sinkholes and ground subsidence.",
|
||||
source: "geosure",
|
||||
},
|
||||
],
|
||||
},
|
||||
];
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ pub async fn og_middleware(request: Request, next: Next) -> Response {
|
|||
let response = next.run(request).await;
|
||||
|
||||
// Only inject OG tags into SPA HTML responses, not proxied PocketBase responses
|
||||
if path.starts_with("/pb/") || path.starts_with("/api/") {
|
||||
if path.starts_with("/pb/") || path.starts_with("/api/") || path.starts_with("/s/") {
|
||||
return response;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@ use tracing::{info, warn};
|
|||
use crate::auth::OptionalUser;
|
||||
use crate::consts::{AI_FILTERS_MAX_TOKENS, AI_FILTERS_TEMPERATURE, AI_FILTERS_WEEKLY_TOKEN_LIMIT};
|
||||
use crate::data::slugify;
|
||||
use crate::data::travel_time::TravelData;
|
||||
use crate::parsing::{parse_filters, row_passes_filters};
|
||||
use crate::pocketbase::{get_superuser_token, log_ai_query};
|
||||
use crate::routes::{FeatureInfo, FeaturesResponse};
|
||||
use crate::state::{AppState, SharedState};
|
||||
|
|
@ -62,6 +64,8 @@ pub struct AiFiltersResponse {
|
|||
notes: String,
|
||||
/// The listing mode used for this response (historical/buy/rent)
|
||||
listing_type: String,
|
||||
/// Number of properties matching the proposed filters (excludes travel time)
|
||||
match_count: usize,
|
||||
}
|
||||
|
||||
/// Strip markdown code fences (```json ... ``` or ``` ... ```) from LLM output.
|
||||
|
|
@ -162,6 +166,76 @@ fn execute_destination_search(state: &AppState, query: &str, mode: &str) -> Valu
|
|||
matches.truncate(10);
|
||||
|
||||
if matches.is_empty() {
|
||||
// Check if the query matched a city that lacks its own travel data.
|
||||
// If so, return nearby stations within that city as suggestions.
|
||||
let matched_city_name: Option<&str> =
|
||||
pd.name_lower
|
||||
.iter()
|
||||
.enumerate()
|
||||
.find_map(|(idx, name_lower)| {
|
||||
let words_match = query_words.iter().all(|word| name_lower.contains(word));
|
||||
let slug = slugify(&pd.name[idx]);
|
||||
let slug_match =
|
||||
slug.contains(&query_slug) || query_slug.contains(&slug);
|
||||
if (words_match || slug_match) && pd.type_rank[idx] == 0 {
|
||||
Some(pd.name[idx].as_str())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
if let Some(city_name) = matched_city_name {
|
||||
let city_lower = city_name.to_lowercase();
|
||||
let mut city_matches: Vec<(usize, String, u8, u32)> = pd
|
||||
.city
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(idx, city_opt)| {
|
||||
let city = city_opt.as_deref()?;
|
||||
if city.to_lowercase() != city_lower {
|
||||
return None;
|
||||
}
|
||||
let slug = slugify(&pd.name[idx]);
|
||||
if slug_set.contains(&slug) {
|
||||
Some((idx, slug, pd.type_rank[idx], pd.population[idx]))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
city_matches.sort_unstable_by(|a, b| a.2.cmp(&b.2).then(b.3.cmp(&a.3)));
|
||||
city_matches.truncate(10);
|
||||
|
||||
if !city_matches.is_empty() {
|
||||
let results: Vec<Value> = city_matches
|
||||
.into_iter()
|
||||
.map(|(idx, slug, ..)| {
|
||||
json!({
|
||||
"name": pd.name[idx],
|
||||
"slug": slug,
|
||||
"place_type": pd.place_type.get(idx).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
info!(
|
||||
query = query,
|
||||
city = city_name,
|
||||
results = results.len(),
|
||||
"Destination search fell back to city stations"
|
||||
);
|
||||
|
||||
return json!({
|
||||
"results": results,
|
||||
"message": format!(
|
||||
"No travel data for '{}' directly. Pick one of these nearby stations:",
|
||||
city_name
|
||||
)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
query = query,
|
||||
mode = mode,
|
||||
|
|
@ -377,8 +451,8 @@ pub fn build_system_prompt(
|
|||
{\"name\": \"Serious crime (avg/yr)\", \"bound\": \"max\", \"value\": 20}, \
|
||||
{\"name\": \"Minor crime (avg/yr)\", \"bound\": \"max\", \"value\": 50}, \
|
||||
{\"name\": \"Noise (dB)\", \"bound\": \"max\", \"value\": 55}, \
|
||||
{\"name\": \"Good+ primary schools within 5km\", \"bound\": \"min\", \"value\": 5}, \
|
||||
{\"name\": \"Good+ secondary schools within 5km\", \"bound\": \"min\", \"value\": 2}, \
|
||||
{\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}, \
|
||||
{\"name\": \"Good+ secondary schools within 2km\", \"bound\": \"min\", \"value\": 1}, \
|
||||
{\"name\": \"Number of parks within 2km\", \"bound\": \"min\", \"value\": 3}], \
|
||||
\"enum_filters\": [], \"travel_time_filters\": [], \"notes\": \"\"}"
|
||||
.to_string(),
|
||||
|
|
@ -416,8 +490,8 @@ pub fn build_system_prompt(
|
|||
Output: {\"numeric_filters\": [\
|
||||
{\"name\": \"Total floor area (sqm)\", \"bound\": \"min\", \"value\": 100}, \
|
||||
{\"name\": \"Number of bedrooms & living rooms\", \"bound\": \"min\", \"value\": 5}, \
|
||||
{\"name\": \"Good+ primary schools within 5km\", \"bound\": \"min\", \"value\": 5}, \
|
||||
{\"name\": \"Good+ secondary schools within 5km\", \"bound\": \"min\", \"value\": 2}], \
|
||||
{\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}, \
|
||||
{\"name\": \"Good+ secondary schools within 2km\", \"bound\": \"min\", \"value\": 1}], \
|
||||
\"enum_filters\": [{\"name\": \"Property type\", \
|
||||
\"values\": [\"Detached\", \"Semi-Detached\"]}], \
|
||||
\"travel_time_filters\": [{\"mode\": \"car\", \"slug\": \"manchester\", \
|
||||
|
|
@ -441,7 +515,7 @@ pub fn build_system_prompt(
|
|||
"\nUser: \"3 bed house to buy under 500k with good schools\"\n\
|
||||
Output: {\"listing_type\": \"buy\", \
|
||||
\"numeric_filters\": [{\"name\": \"Asking price\", \"bound\": \"max\", \"value\": 500000}, \
|
||||
{\"name\": \"Good+ primary schools within 5km\", \"bound\": \"min\", \"value\": 5}], \
|
||||
{\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}], \
|
||||
\"enum_filters\": [{\"name\": \"Property type\", \
|
||||
\"values\": [\"Detached\", \"Semi-Detached\", \"Terraced\"]}], \
|
||||
\"travel_time_filters\": [], \
|
||||
|
|
@ -556,8 +630,124 @@ async fn update_ai_usage(state: &AppState, user_id: &str, tokens_used: u64, week
|
|||
}
|
||||
}
|
||||
|
||||
/// Maximum number of round trips (function calls + retries) before giving up.
|
||||
const MAX_TOOL_ROUNDS: usize = 5;
|
||||
/// Convert validated filter JSON back to the `;;`-separated filter string format
|
||||
/// that `parse_filters` expects.
|
||||
///
|
||||
/// Numeric: `{"name": [min, max]}` → `name:min:max`
|
||||
/// Enum: `{"name": ["val1", "val2"]}` → `name:val1|val2`
|
||||
fn filters_to_filter_string(filters: &Value) -> String {
|
||||
let obj = match filters.as_object() {
|
||||
Some(obj) => obj,
|
||||
None => return String::new(),
|
||||
};
|
||||
|
||||
let mut parts = Vec::new();
|
||||
for (name, value) in obj {
|
||||
if let Some(arr) = value.as_array() {
|
||||
if arr.len() == 2 && arr[0].is_number() && arr[1].is_number() {
|
||||
let min = arr[0].as_f64().unwrap_or(0.0);
|
||||
let max = arr[1].as_f64().unwrap_or(0.0);
|
||||
parts.push(format!("{name}:{min}:{max}"));
|
||||
} else if !arr.is_empty() && arr[0].is_string() {
|
||||
let values: Vec<&str> = arr.iter().filter_map(|v| v.as_str()).collect();
|
||||
if !values.is_empty() {
|
||||
parts.push(format!("{name}:{}", values.join("|")));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
parts.join(";;")
|
||||
}
|
||||
|
||||
/// Count how many rows in the property dataset pass the given property filters
|
||||
/// AND travel time filters. Travel time data is loaded from the TravelTimeStore
|
||||
/// and checked per-postcode (same logic as hexagons.rs).
|
||||
fn count_matching_rows(
|
||||
state: &AppState,
|
||||
filters: &Value,
|
||||
travel_time_filters: &[TravelTimeFilter],
|
||||
) -> usize {
|
||||
let filter_str = filters_to_filter_string(filters);
|
||||
|
||||
let quant = state.data.quant_ref();
|
||||
let (parsed_filters, parsed_enum_filters) = if filter_str.is_empty() {
|
||||
(Vec::new(), Vec::new())
|
||||
} else {
|
||||
match parse_filters(
|
||||
Some(&filter_str),
|
||||
&state.feature_name_to_index,
|
||||
&state.data.enum_values,
|
||||
&quant,
|
||||
) {
|
||||
Ok(f) => f,
|
||||
Err(err) => {
|
||||
warn!("Failed to parse filters for match count: {err}");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Load travel time data for each filter entry
|
||||
let travel_data: Vec<(TravelData, Option<f32>, Option<f32>)> = travel_time_filters
|
||||
.iter()
|
||||
.filter_map(|ttf| {
|
||||
let data = state.travel_time_store.get(&ttf.mode, &ttf.slug).ok()?;
|
||||
Some((data, ttf.min, ttf.max))
|
||||
})
|
||||
.collect();
|
||||
let has_travel = !travel_data.is_empty();
|
||||
|
||||
let feature_data = &state.data.feature_data;
|
||||
let num_features = state.data.num_features;
|
||||
let num_rows = state.data.lat.len();
|
||||
let (pc_interner, pc_keys) = state.data.postcode_parts();
|
||||
|
||||
let mut count = 0usize;
|
||||
for row in 0..num_rows {
|
||||
if !row_passes_filters(
|
||||
row,
|
||||
&parsed_filters,
|
||||
&parsed_enum_filters,
|
||||
feature_data,
|
||||
num_features,
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if has_travel {
|
||||
let postcode = pc_interner.resolve(&pc_keys[row]);
|
||||
let mut passes_travel = true;
|
||||
for (data, fmin, fmax) in &travel_data {
|
||||
let pass = if let Some(mins) = data.get(postcode).map(|r| r.minutes as f32) {
|
||||
fmin.map_or(true, |min| mins >= min)
|
||||
&& fmax.map_or(true, |max| mins <= max)
|
||||
} else {
|
||||
false // no travel data → postcode not reachable
|
||||
};
|
||||
if !pass {
|
||||
passes_travel = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if !passes_travel {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
count += 1;
|
||||
}
|
||||
|
||||
count
|
||||
}
|
||||
|
||||
/// Budget limits for the Gemini conversation loop. Separate counters prevent
|
||||
/// tool calls (destination searches) from starving JSON retries or zero-match
|
||||
/// refinements.
///
/// Tool calls allowed per request. Once the running tool-call count exceeds
/// this, the loop stops executing tools and instead tells the model to output
/// its best JSON.
|
||||
const MAX_TOOL_CALLS: usize = 4;
|
||||
/// Retries shared by empty-text responses and JSON parse failures. Exceeding
/// it makes the handler return a `BAD_GATEWAY` error.
const MAX_RETRIES: usize = 3;
|
||||
/// Times the model is asked to relax filters that matched 0 properties. Once
/// exceeded, the current filters are returned with `match_count: 0` and a
/// note suggesting the user relax constraints.
const MAX_REFINEMENTS: u32 = 3;
|
||||
/// Upper bound on iterations of the conversation loop, regardless of which of
/// the individual budgets above is being consumed.
const MAX_TOTAL_ROUNDS: usize = 10;
|
||||
|
||||
pub async fn post_ai_filters(
|
||||
State(shared): State<Arc<SharedState>>,
|
||||
|
|
@ -631,9 +821,12 @@ pub async fn post_ai_filters(
|
|||
})];
|
||||
|
||||
let mut total_tokens_accumulated: u64 = 0;
|
||||
let mut tool_call_count = 0usize;
|
||||
let mut retry_count = 0usize;
|
||||
let mut refinement_attempts = 0u32;
|
||||
|
||||
// Function calling loop: model may call search_destinations, we execute and feed back
|
||||
for round in 0..MAX_TOOL_ROUNDS {
|
||||
for round in 0..MAX_TOTAL_ROUNDS {
|
||||
let body = json!({
|
||||
"systemInstruction": {
|
||||
"parts": [{ "text": state.ai_filters_system_prompt }]
|
||||
|
|
@ -686,7 +879,18 @@ pub async fn post_ai_filters(
|
|||
let fn_name = fc.get("name").and_then(|n| n.as_str()).unwrap_or("");
|
||||
let fn_args = fc.get("args").cloned().unwrap_or(json!({}));
|
||||
|
||||
info!(function = fn_name, round = round, "AI called tool");
|
||||
tool_call_count += 1;
|
||||
info!(function = fn_name, round = round, tool_call = tool_call_count, "AI called tool");
|
||||
|
||||
if tool_call_count > MAX_TOOL_CALLS {
|
||||
warn!("Tool call budget exhausted, forcing text output");
|
||||
contents.push(candidate.clone());
|
||||
contents.push(json!({
|
||||
"role": "user",
|
||||
"parts": [{ "text": "Tool call limit reached. Output your best JSON now using the destinations you already found. Do not call any more tools." }]
|
||||
}));
|
||||
continue;
|
||||
}
|
||||
|
||||
let fn_result = if fn_name == "search_destinations" {
|
||||
let query = fn_args.get("query").and_then(|q| q.as_str()).unwrap_or("");
|
||||
|
|
@ -724,8 +928,11 @@ pub async fn post_ai_filters(
|
|||
let text = text.trim();
|
||||
|
||||
if text.is_empty() {
|
||||
warn!("Gemini returned empty text content (round {})", round);
|
||||
// Retry by continuing the loop
|
||||
retry_count += 1;
|
||||
warn!("Gemini returned empty text content (round {}, retry {})", round, retry_count);
|
||||
if retry_count > MAX_RETRIES {
|
||||
return Err((StatusCode::BAD_GATEWAY, "AI returned empty responses".into()));
|
||||
}
|
||||
contents.push(candidate.clone());
|
||||
contents.push(json!({
|
||||
"role": "user",
|
||||
|
|
@ -737,8 +944,11 @@ pub async fn post_ai_filters(
|
|||
let raw: Value = match serde_json::from_str(text) {
|
||||
Ok(val) => val,
|
||||
Err(err) => {
|
||||
warn!(error = %err, round = round, "Failed to parse Gemini JSON output, retrying");
|
||||
// Ask the model to fix its output
|
||||
retry_count += 1;
|
||||
warn!(error = %err, round = round, retry = retry_count, "Failed to parse Gemini JSON output");
|
||||
if retry_count > MAX_RETRIES {
|
||||
return Err((StatusCode::BAD_GATEWAY, "AI returned invalid JSON".into()));
|
||||
}
|
||||
contents.push(candidate.clone());
|
||||
contents.push(json!({
|
||||
"role": "user",
|
||||
|
|
@ -776,6 +986,65 @@ pub async fn post_ai_filters(
|
|||
map.insert("Listing status".to_string(), json!([listing_value]));
|
||||
}
|
||||
|
||||
// Count matching properties and refine if too restrictive
|
||||
let match_count = count_matching_rows(&state, &filters, &travel_time_filters);
|
||||
info!(match_count = match_count, round = round, "AI filter match count");
|
||||
|
||||
if match_count == 0 {
|
||||
refinement_attempts += 1;
|
||||
let total_rows = state.data.lat.len();
|
||||
info!(
|
||||
attempt = refinement_attempts,
|
||||
"0 matches out of {total_rows} — asking AI to relax filters"
|
||||
);
|
||||
|
||||
if refinement_attempts > MAX_REFINEMENTS {
|
||||
warn!("Refinement budget exhausted, returning filters with 0 matches");
|
||||
let new_total = tokens_used + total_tokens_accumulated;
|
||||
update_ai_usage(&state, &user.id, new_total, current_week).await;
|
||||
counter!("ai_tokens_total").increment(total_tokens_accumulated);
|
||||
counter!("ai_requests_total", "status" => "zero_matches").increment(1);
|
||||
|
||||
let notes = if notes.is_empty() {
|
||||
"No properties match these filters. Try relaxing some constraints.".to_string()
|
||||
} else {
|
||||
format!("{}. No properties match — try relaxing some constraints.", notes)
|
||||
};
|
||||
|
||||
return Ok(Json(AiFiltersResponse {
|
||||
filters,
|
||||
travel_time_filters,
|
||||
notes,
|
||||
listing_type: listing_type.to_string(),
|
||||
match_count: 0,
|
||||
}));
|
||||
}
|
||||
|
||||
let feedback = match refinement_attempts {
|
||||
1 => format!(
|
||||
"Your proposed filters matched 0 properties out of {total_rows} total. \
|
||||
The combination is too restrictive. Please widen some numeric ranges \
|
||||
or add more enum values while keeping the user's intent. \
|
||||
Output the adjusted JSON."
|
||||
),
|
||||
2 => format!(
|
||||
"Still 0 matches out of {total_rows}. Please widen ranges further. \
|
||||
Output the adjusted JSON."
|
||||
),
|
||||
_ => format!(
|
||||
"Still 0 matches out of {total_rows}. Please remove additional filters \
|
||||
until some properties match, keeping the user's core priority. \
|
||||
Output the adjusted JSON."
|
||||
),
|
||||
};
|
||||
contents.push(candidate.clone());
|
||||
contents.push(json!({
|
||||
"role": "user",
|
||||
"parts": [{ "text": feedback }]
|
||||
}));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Update usage with total accumulated tokens
|
||||
let new_total = tokens_used + total_tokens_accumulated;
|
||||
update_ai_usage(&state, &user.id, new_total, current_week).await;
|
||||
|
|
@ -810,13 +1079,14 @@ pub async fn post_ai_filters(
|
|||
travel_time_filters,
|
||||
notes,
|
||||
listing_type: listing_type.to_string(),
|
||||
match_count,
|
||||
}));
|
||||
}
|
||||
|
||||
// Exhausted tool rounds without getting a final text response
|
||||
// Exhausted total round budget without getting a valid response
|
||||
warn!(
|
||||
"AI exhausted {} tool-calling rounds without final response",
|
||||
MAX_TOOL_ROUNDS
|
||||
"AI exhausted {} total rounds without final response (tools={}, retries={}, refinements={})",
|
||||
MAX_TOTAL_ROUNDS, tool_call_count, retry_count, refinement_attempts
|
||||
);
|
||||
Err((
|
||||
StatusCode::BAD_GATEWAY,
|
||||
|
|
@ -902,8 +1172,10 @@ fn validate_travel_time_filters(raw: &Value, state: &AppState) -> Vec<TravelTime
|
|||
fn validate_and_convert(raw: &Value, features: &FeaturesResponse, listing_type: &str) -> Value {
|
||||
let mut result = serde_json::Map::new();
|
||||
|
||||
// Build lookup maps from feature metadata, filtering by listing mode
|
||||
let mut numeric_features: rustc_hash::FxHashMap<&str, (f32, f32)> =
|
||||
// Build lookup maps from feature metadata, filtering by listing mode.
|
||||
// Store both slider bounds (min/max from percentiles) and true data bounds
|
||||
// (histogram.min/max) so one-sided AI filters use the full data range.
|
||||
let mut numeric_features: rustc_hash::FxHashMap<&str, (f32, f32, f32, f32)> =
|
||||
rustc_hash::FxHashMap::default();
|
||||
let mut enum_features: rustc_hash::FxHashMap<&str, &[String]> =
|
||||
rustc_hash::FxHashMap::default();
|
||||
|
|
@ -915,12 +1187,14 @@ fn validate_and_convert(raw: &Value, features: &FeaturesResponse, listing_type:
|
|||
name,
|
||||
min,
|
||||
max,
|
||||
histogram,
|
||||
modes,
|
||||
..
|
||||
} => {
|
||||
// Only include features valid for the chosen listing mode
|
||||
if modes.is_empty() || modes.contains(&listing_type) {
|
||||
numeric_features.insert(name, (*min, *max));
|
||||
numeric_features
|
||||
.insert(name, (*min, *max, histogram.min, histogram.max));
|
||||
}
|
||||
}
|
||||
FeatureInfo::Enum { name, values, .. } => {
|
||||
|
|
@ -933,32 +1207,37 @@ fn validate_and_convert(raw: &Value, features: &FeaturesResponse, listing_type:
|
|||
}
|
||||
}
|
||||
|
||||
// Process numeric filters — each sets one bound (min or max)
|
||||
// Process numeric filters — each sets one bound (min or max).
|
||||
// The unset side uses the true data min/max (from histogram), not
|
||||
// the slider bounds (percentile-based), so a "max" filter for crime
|
||||
// produces [0, value] rather than [2nd-percentile, value].
|
||||
if let Some(arr) = raw.get("numeric_filters").and_then(|val| val.as_array()) {
|
||||
for item in arr {
|
||||
let name = match item.get("name").and_then(|val| val.as_str()) {
|
||||
Some(name) => name,
|
||||
None => continue,
|
||||
};
|
||||
let (feat_min, feat_max) = match numeric_features.get(name) {
|
||||
Some(range) => *range,
|
||||
None => continue,
|
||||
};
|
||||
let (slider_min, slider_max, data_min, data_max) =
|
||||
match numeric_features.get(name) {
|
||||
Some(range) => *range,
|
||||
None => continue,
|
||||
};
|
||||
let bound = match item.get("bound").and_then(|val| val.as_str()) {
|
||||
Some(b) => b,
|
||||
None => continue,
|
||||
};
|
||||
// Clamp value to true data range (not slider range)
|
||||
let value = match item.get("value").and_then(|val| val.as_f64()) {
|
||||
Some(v) => v.max(feat_min as f64).min(feat_max as f64) as f32,
|
||||
Some(v) => v.max(data_min as f64).min(data_max as f64) as f32,
|
||||
None => continue,
|
||||
};
|
||||
let (filter_min, filter_max) = match bound {
|
||||
"min" => (value, feat_max),
|
||||
"max" => (feat_min, value),
|
||||
"min" => (value, data_max),
|
||||
"max" => (data_min, value),
|
||||
_ => continue,
|
||||
};
|
||||
// Only include if range is narrower than full range
|
||||
if filter_min > feat_min || filter_max < feat_max {
|
||||
// Only include if range is narrower than full slider range
|
||||
if filter_min > slider_min || filter_max < slider_max {
|
||||
result.insert(name.to_string(), json!([filter_min, filter_max]));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use axum::extract::{Path, State};
|
||||
use axum::http::StatusCode;
|
||||
use axum::response::{IntoResponse, Redirect, Response};
|
||||
use axum::http::{header, StatusCode};
|
||||
use axum::response::{Html, IntoResponse, Response};
|
||||
use axum::Json;
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
|
@ -139,7 +139,38 @@ pub async fn get_short_url(
|
|||
|
||||
match params {
|
||||
Some(params) => {
|
||||
Redirect::temporary(&format!("/dashboard?{params}")).into_response()
|
||||
let redirect_url = format!("/dashboard?{params}");
|
||||
let og_image_url = format!(
|
||||
"{}/api/screenshot?og=1&{params}",
|
||||
state.public_url
|
||||
);
|
||||
let og_url = format!("{}/s/{code}", state.public_url);
|
||||
let og_title = "Perfect Postcode \u{2014} Every neighbourhood in England";
|
||||
let og_description = "Explore property prices, energy ratings, crime stats, school ratings, and more across England on one interactive map.";
|
||||
|
||||
let html = format!(
|
||||
r#"<!DOCTYPE html>
|
||||
<html><head>
|
||||
<meta charset="utf-8" />
|
||||
<meta property="og:title" content="{og_title}" />
|
||||
<meta property="og:description" content="{og_description}" />
|
||||
<meta property="og:type" content="website" />
|
||||
<meta property="og:url" content="{og_url}" />
|
||||
<meta property="og:image" content="{og_image_url}" />
|
||||
<meta property="og:image:width" content="1200" />
|
||||
<meta property="og:image:height" content="630" />
|
||||
<meta name="twitter:card" content="summary_large_image" />
|
||||
<meta name="twitter:title" content="{og_title}" />
|
||||
<meta name="twitter:description" content="{og_description}" />
|
||||
<meta http-equiv="refresh" content="0;url={redirect_url}" />
|
||||
<title>{og_title}</title>
|
||||
</head><body></body></html>"#
|
||||
);
|
||||
(
|
||||
[(header::CACHE_CONTROL, "public, max-age=86400")],
|
||||
Html(html),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
None => StatusCode::NOT_FOUND.into_response(),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,6 +17,8 @@ pub struct DestinationResult {
|
|||
place_type: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
city: Option<String>,
|
||||
lat: f32,
|
||||
lon: f32,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
|
|
@ -76,6 +78,8 @@ pub async fn get_travel_destinations(
|
|||
slug,
|
||||
place_type: pd.place_type.get(idx).to_string(),
|
||||
city: pd.city[idx].clone(),
|
||||
lat: pd.lat[idx],
|
||||
lon: pd.lon[idx],
|
||||
})
|
||||
.collect();
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue