diff --git a/Makefile.data b/Makefile.data index 21d7b6e..a37c45f 100644 --- a/Makefile.data +++ b/Makefile.data @@ -37,8 +37,6 @@ NAPTAN := $(DATA_DIR)/naptan.parquet BROADBAND := $(DATA_DIR)/broadband.parquet SCHOOL_PROX := $(DATA_DIR)/school_proximity.parquet RENTAL := $(DATA_DIR)/rental_prices.parquet -GEOSURE_DIR := $(DATA_DIR)/geosure -GEOSURE := $(DATA_DIR)/geosure.parquet INSPIRE_DIR := $(DATA_DIR)/inspire OA_BOUNDARIES := $(DATA_DIR)/oa_boundaries.gpkg UPRN_LOOKUP := $(DATA_DIR)/uprn_lookup.parquet @@ -46,16 +44,17 @@ PC_BOUNDARIES := $(MANUAL_DATA)/postcode_boundaries TRANSIT_DIR := $(DATA_DIR)/transit TRANSIT_STAMP := $(TRANSIT_DIR)/.done GREENSPACE := $(DATA_DIR)/greenspace_water.parquet +OS_GREENSPACE := $(DATA_DIR)/os_greenspace.parquet PBF := $(DATA_DIR)/england-latest.osm.pbf PLACES := $(DATA_DIR)/places.parquet LISTINGS_BUY := $(DATA_DIR)/online_listings_buy.parquet LISTINGS_RENT := $(DATA_DIR)/online_listings_rent.parquet LSOA_POP := $(DATA_DIR)/lsoa_population.parquet +MEDIAN_AGE := $(DATA_DIR)/median_age.parquet ENGLAND_BOUNDARY := $(DATA_DIR)/england_boundary.geojson RM_OUTCODES := frontend/src/lib/rightmove-outcodes.json # Sentinel files for directory targets (Make doesn't track directories well) -GEOSURE_STAMP := $(GEOSURE_DIR)/.done INSPIRE_STAMP := $(INSPIRE_DIR)/.done PMTILES_VERSION := 1.22.3 @@ -65,10 +64,10 @@ PMTILES_VERSION := 1.22.3 .PHONY: prepare merge tiles \ download-arcgis download-price-paid download-deprivation download-ethnicity \ download-naptan download-pois download-ofsted download-broadband download-rental-prices \ - download-postcodes download-geosure download-noise download-inspire \ - download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places download-lsoa-population download-england-boundary download-rightmove-outcodes \ + download-postcodes download-noise download-inspire \ + download-oa-boundaries download-uprn-lookup download-transit-network 
download-greenspace download-os-greenspace download-pbf download-places download-lsoa-population download-median-age download-england-boundary download-rightmove-outcodes \ transform-pois transform-epc-pp transform-crime transform-poi-proximity \ - transform-school-proximity transform-geosure transform-postcode-boundaries \ + transform-school-proximity transform-postcode-boundaries \ generate-postcode-boundaries prepare: $(PRICES_STAMP) @@ -83,7 +82,6 @@ download-pois: $(POIS_RAW) download-ofsted: $(OFSTED) download-broadband: $(BROADBAND) download-postcodes: $(POSTCODES) -download-geosure: $(GEOSURE_STAMP) download-rental-prices: $(RENTAL) download-noise: $(NOISE) download-inspire: $(INSPIRE_STAMP) @@ -91,9 +89,11 @@ download-oa-boundaries: $(OA_BOUNDARIES) download-uprn-lookup: $(UPRN_LOOKUP) download-transit-network: $(TRANSIT_STAMP) download-greenspace: $(GREENSPACE) +download-os-greenspace: $(OS_GREENSPACE) download-pbf: $(PBF) download-places: $(PLACES) download-lsoa-population: $(LSOA_POP) +download-median-age: $(MEDIAN_AGE) download-england-boundary: $(ENGLAND_BOUNDARY) download-rightmove-outcodes: $(RM_OUTCODES) transform-pois: $(POIS_FILTERED) @@ -101,7 +101,6 @@ transform-epc-pp: $(EPC_PP) transform-crime: $(CRIME) transform-poi-proximity: $(POI_PROXIMITY) transform-school-proximity: $(SCHOOL_PROX) -transform-geosure: $(GEOSURE) transform-postcode-boundaries: $(PC_BOUNDARIES) generate-postcode-boundaries: $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP) uv run python -m pipeline.transform.postcode_boundaries \ @@ -158,10 +157,6 @@ $(BROADBAND): $(POSTCODES): uv run python -m pipeline.download.postcodes --output $@ -$(GEOSURE_STAMP): - uv run python -m pipeline.download.geosure --output $(GEOSURE_DIR) - @touch $@ - $(NOISE): $(ARCGIS) uv run python -m pipeline.download.noise --arcgis $(ARCGIS) --output $@ @@ -185,12 +180,19 @@ $(RENTAL): $(GREENSPACE): $(PBF) uv run python -m pipeline.download.greenspace_water --output $@ --pbf $(PBF) +$(OS_GREENSPACE): + 
uv run python -m pipeline.download.os_greenspace --output $@ + $(PLACES): $(PBF) $(ENGLAND_BOUNDARY) uv run python -m pipeline.download.places --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY) $(LSOA_POP): uv run python -m pipeline.download.lsoa_population --output $@ + +$(MEDIAN_AGE): + uv run python -m pipeline.download.median_age --output $@ + $(ENGLAND_BOUNDARY): uv run python -m pipeline.download.england_boundary --output $@ @@ -216,15 +218,12 @@ $(CRIME): fi uv run python -m pipeline.transform.crime --input $(CRIME_DIR) --output $@ -$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) - uv run python -m pipeline.transform.poi_proximity --arcgis $(ARCGIS) --pois $(POIS_FILTERED) --output $@ +$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) $(OS_GREENSPACE) + uv run python -m pipeline.transform.poi_proximity --arcgis $(ARCGIS) --pois $(POIS_FILTERED) --greenspace $(OS_GREENSPACE) --output $@ $(SCHOOL_PROX): $(OFSTED) $(ARCGIS) uv run python -m pipeline.transform.school_proximity --ofsted $(OFSTED) --arcgis $(ARCGIS) --output $@ -$(GEOSURE): $(GEOSURE_STAMP) $(ARCGIS) - uv run python -m pipeline.transform.transform_geosure --geosure $(GEOSURE_DIR) --arcgis $(ARCGIS) --output $@ - # Postcode boundaries require manual generation — fail with instructions $(PC_BOUNDARIES): @echo "" @@ -243,7 +242,7 @@ $(PC_BOUNDARIES): # ── Final merge → postcode.parquet + properties.parquet ────────────────────── $(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \ - $(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(GEOSURE) $(RENTAL) $(LSOA_POP) + $(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(RENTAL) $(LSOA_POP) $(MEDIAN_AGE) uv run python -m pipeline.transform.merge \ --epc-pp $(EPC_PP) \ --arcgis $(ARCGIS) \ @@ -254,9 +253,9 @@ $(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \ --noise $(NOISE) \ --school-proximity $(SCHOOL_PROX) \ --broadband $(BROADBAND) \ - --geosure $(GEOSURE) \ --rental-prices $(RENTAL) \ --lsoa-population $(LSOA_POP) 
\ + --median-age $(MEDIAN_AGE) \ --output-postcodes $(POSTCODES_PQ) \ --output-properties $(PROPERTIES_PQ) @touch $@ diff --git a/finder/constants.py b/finder/constants.py index f10d5da..d9ab10c 100644 --- a/finder/constants.py +++ b/finder/constants.py @@ -9,6 +9,12 @@ DELAY_BETWEEN_OUTCODES = 0.5 MAX_RETRIES = 3 RETRY_BASE_DELAY = 2.0 GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index +MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsing errors +# Rent sanity bounds (monthly). Rents outside this range are nulled out — they are +# almost always total-stay pricing (e.g. "Golf Open 2026" short lets), annual rents +# mislabelled as monthly, or data errors. +MIN_RENT_MONTHLY = 50 # below £50/month is implausible for any UK property +MAX_RENT_MONTHLY = 25_000 # above £25k/month covers ultra-prime London; higher is suspect SEED = 42 CHECKPOINT_INTERVAL = int(os.environ.get("CHECKPOINT_INTERVAL", "900")) # seconds diff --git a/finder/homecouk.py b/finder/homecouk.py index fc18fdf..f9e290f 100644 --- a/finder/homecouk.py +++ b/finder/homecouk.py @@ -15,6 +15,7 @@ from constants import ( HOMECOUK_API_BASE, HOMECOUK_BASE, HOMECOUK_PER_PAGE, + MAX_BEDROOMS, PROPERTY_TYPE_MAP, RETRY_BASE_DELAY, ) @@ -25,6 +26,7 @@ from metrics import ( homecouk_requests_total, ) from spatial import PostcodeSpatialIndex +from transform import validate_floor_area log = logging.getLogger("homecouk") @@ -216,10 +218,57 @@ def parse_floor_area(description: str | None) -> float | None: m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE) if m: sqft = float(m.group(1).replace(",", "")) - return round(sqft * 0.092903, 1) + return validate_floor_area(round(sqft * 0.092903, 1)) m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE) if m: - return round(float(m.group(1).replace(",", "")), 1) + return validate_floor_area(round(float(m.group(1).replace(",", "")), 1)) + return None + + +def parse_tenure(prop: dict) -> str | 
None: + """Extract tenure from home.co.uk property data. + + Checks multiple sources in priority order: + 1. Dedicated 'tenure' or 'tenure_type' field in the API response + 2. Free-text search in the description for 'freehold' / 'leasehold' + 3. Free-text search in features lists + + home.co.uk aggregates listings from estate agents, so tenure is often + embedded in the description text rather than a structured field. + """ + # 1. Check dedicated tenure fields (in case the API adds them) + for key in ("tenure", "tenure_type", "tenureType"): + val = prop.get(key) + if val and isinstance(val, str): + lower = val.lower().strip() + if "leasehold" in lower: + return "Leasehold" + if "freehold" in lower: + return "Freehold" + + # 2. Check description text — estate agents often include tenure here + description = prop.get("description") or "" + if description: + lower_desc = description.lower() + if re.search(r"\bleasehold\b", lower_desc): + return "Leasehold" + if re.search(r"\bfreehold\b", lower_desc): + # Matches "Freehold" and "Share of Freehold" (both = freehold ownership) + return "Freehold" + + # 3. 
Check features / key_features lists if present + for key in ("features", "key_features", "keyFeatures"): + features = prop.get(key) + if features and isinstance(features, list): + for feat in features: + if not isinstance(feat, str): + continue + lower_feat = feat.lower() + if "leasehold" in lower_feat: + return "Leasehold" + if "freehold" in lower_feat: + return "Freehold" + return None @@ -267,7 +316,7 @@ def transform_property( return None price = prop.get("price") or prop.get("latest_price") - if not price: + if not price or int(price) <= 0: return None # Home.co.uk provides postcodes directly, but fall back to spatial index @@ -278,8 +327,16 @@ def transform_property( log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng) return None - bedrooms = prop.get("bedrooms", 0) or 0 - bathrooms = prop.get("bathrooms", 0) or 0 + raw_beds = prop.get("bedrooms", 0) or 0 + raw_baths = prop.get("bathrooms", 0) or 0 + bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0 + bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0 + if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: + log.warning( + "home.co.uk %s: implausible beds=%d baths=%d (capped to 0)", + prop.get("listing_id") or prop.get("property_id") or "?", + raw_beds, raw_baths, + ) listing_type = prop.get("listing_property_type") or prop.get("property_type") or "" address = prop.get("display_address") or prop.get("address") or "" @@ -304,7 +361,7 @@ def transform_property( "lat": lat, "Postcode": postcode, "Address per Property Register": address, - "Leasehold/Freehold": None, # not available from home.co.uk + "Leasehold/Freehold": parse_tenure(prop), "Property type": map_property_type(listing_type), "Property sub-type": listing_type or "Unknown", "price": int(price), diff --git a/finder/openrent.py b/finder/openrent.py index 791e79c..ce27fe8 100644 --- a/finder/openrent.py +++ b/finder/openrent.py @@ -34,6 +34,7 @@ from playwright.sync_api import sync_playwright from constants import ( 
DELAY_BETWEEN_PAGES, + MAX_BEDROOMS, OPENRENT_BASE, PROPERTY_TYPE_MAP, RETRY_BASE_DELAY, @@ -45,6 +46,7 @@ from metrics import ( openrent_requests_total, ) from spatial import PostcodeSpatialIndex +from transform import validate_floor_area log = logging.getLogger("openrent") @@ -607,10 +609,10 @@ def parse_floor_area(description: str | None) -> float | None: m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE) if m: sqft = float(m.group(1).replace(",", "")) - return round(sqft * 0.092903, 1) + return validate_floor_area(round(sqft * 0.092903, 1)) m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE) if m: - return round(float(m.group(1).replace(",", "")), 1) + return validate_floor_area(round(float(m.group(1).replace(",", "")), 1)) return None @@ -651,7 +653,7 @@ def transform_property( lat = detail.get("lat") or search_data.get("lat") lng = detail.get("lng") or search_data.get("lng") price = detail.get("price") or search_data.get("price") - if not price: + if not price or int(price) <= 0: return None frequency = search_data.get("frequency", "monthly") @@ -701,8 +703,15 @@ def transform_property( log.debug("No postcode for property — skipping") return None - bedrooms = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0 - bathrooms = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0 + raw_beds = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0 + raw_baths = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0 + bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0 + bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0 + if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: + log.warning( + "OpenRent %s: implausible beds=%d baths=%d (capped to 0)", + search_data.get("id", "?"), raw_beds, raw_baths, + ) # Title: prefer detail page (has h1 with full title) title = detail.get("title") or search_data.get("title", "") @@ -746,6 +755,9 @@ def transform_property( "lat": 
lat, "Postcode": postcode, "Address per Property Register": address, + # OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a + # property ownership concept that doesn't apply to rental listings. The + # landlord's tenure is not shown on OpenRent listing pages. "Leasehold/Freehold": None, "Property type": map_property_type(property_type), "Property sub-type": property_type or "Unknown", diff --git a/finder/storage.py b/finder/storage.py index 9854188..4ab685f 100644 --- a/finder/storage.py +++ b/finder/storage.py @@ -4,6 +4,7 @@ from pathlib import Path import polars as pl +from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY from transform import normalize_price log = logging.getLogger("rightmove") @@ -18,6 +19,30 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None: log.warning("No properties to write to %s", path) return + # Sanitize bedroom/bathroom counts — values above MAX_BEDROOMS are + # almost certainly prices or other numeric fields mis-parsed as bedrooms. 
+ bad_count = 0 + for p in properties: + for key in ("Bedrooms", "Bathrooms"): + val = p.get(key, 0) or 0 + if val > MAX_BEDROOMS: + bad_count += 1 + p[key] = None + # Recompute derived field after sanitization + beds = p.get("Bedrooms") + baths = p.get("Bathrooms") + if beds is None or baths is None: + p["Number of bedrooms & living rooms"] = None + else: + p["Number of bedrooms & living rooms"] = beds + baths + + if bad_count: + log.warning( + "Sanitized %d properties with bedroom/bathroom counts > %d (set to null)", + bad_count, + MAX_BEDROOMS, + ) + # Parse first_visible_date to datetime listing_dates = [] for p in properties: @@ -36,15 +61,33 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None: listing_dates.append(None) # Derive asking price / asking rent based on channel + # Zero prices indicate parsing failures or POA/auction listings — treat as null if channel == "buy": - asking_prices = [p["price"] for p in properties] + asking_prices = [p["price"] if p["price"] > 0 else None for p in properties] asking_rents = [None] * len(properties) listing_statuses = ["For sale"] * len(properties) else: asking_prices = [None] * len(properties) - asking_rents = [ - normalize_price(p["price"], p["price_frequency"]) for p in properties - ] + # Normalize to monthly, then apply sanity bounds. Rents outside + # [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always total-stay + # pricing (short lets), annual rents mislabelled as monthly, or £0 + # placeholders — null them out rather than polluting aggregates. 
+ rent_outliers = 0 + asking_rents = [] + for p in properties: + monthly = normalize_price(p["price"], p["price_frequency"]) + if monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY: + rent_outliers += 1 + asking_rents.append(None) + else: + asking_rents.append(monthly) + if rent_outliers: + log.warning( + "Nulled %d rent outliers outside [£%d, £%d]/month", + rent_outliers, + MIN_RENT_MONTHLY, + MAX_RENT_MONTHLY, + ) listing_statuses = ["For rent"] * len(properties) df = pl.DataFrame( diff --git a/finder/transform.py b/finder/transform.py index 143d07a..1027220 100644 --- a/finder/transform.py +++ b/finder/transform.py @@ -1,12 +1,31 @@ import logging import re -from constants import PROPERTY_TYPE_MAP, RIGHTMOVE_BASE +from constants import MAX_BEDROOMS, PROPERTY_TYPE_MAP, RIGHTMOVE_BASE from spatial import PostcodeSpatialIndex log = logging.getLogger("rightmove") +# Maximum plausible floor area for a residential property listing (sqm). +# ~21,500 sq ft — covers even the largest UK mansions. +MAX_FLOOR_AREA_SQM = 2000.0 + + +def validate_floor_area(sqm: float | None) -> float | None: + """Validate a floor area value. Returns None for nonsensical values. + + Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM, + which catches parsing errors where prices or other large numbers are + mistakenly extracted as floor area from free-text descriptions or DOM text. + """ + if sqm is None: + return None + if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM: + return None + return sqm + + def parse_display_size(display_size: str | None) -> float | None: """Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm.""" if not display_size: @@ -15,11 +34,11 @@ def parse_display_size(display_size: str | None) -> float | None: m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE) if m: sqft = float(m.group(1).replace(",", "")) - return round(sqft * 0.092903, 1) + return validate_floor_area(round(sqft * 0.092903, 1)) # Try sq. m. 
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE) if m: - return round(float(m.group(1).replace(",", "")), 1) + return validate_floor_area(round(float(m.group(1).replace(",", "")), 1)) return None @@ -92,19 +111,34 @@ def transform_property( price_obj = prop.get("price", {}) amount = price_obj.get("amount") - if amount is None: + if not amount: return None frequency = price_obj.get("frequency", "") - price = normalize_price(int(amount), frequency) + # Store raw price — normalization to monthly happens once in storage.py + price = int(amount) + if price <= 0: + return None display_prices = price_obj.get("displayPrices", []) price_qualifier = ( display_prices[0].get("displayPriceQualifier", "") if display_prices else "" ) + # POA / Auction listings have unreliable prices — treat as no price + pq_lower = price_qualifier.lower() + if "poa" in pq_lower or "auction" in pq_lower: + return None + sub_type = prop.get("propertySubType", "") - bedrooms = prop.get("bedrooms", 0) or 0 - bathrooms = prop.get("bathrooms", 0) or 0 + raw_beds = prop.get("bedrooms", 0) or 0 + raw_baths = prop.get("bathrooms", 0) or 0 + bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0 + bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0 + if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: + log.warning( + "Rightmove %s: implausible beds=%d baths=%d (capped to 0)", + prop.get("id", "?"), raw_beds, raw_baths, + ) key_features = [ kf.get("description", "") diff --git a/finder/zoopla.py b/finder/zoopla.py index 59372ad..f7a7bec 100644 --- a/finder/zoopla.py +++ b/finder/zoopla.py @@ -26,9 +26,10 @@ import logging import re import time -from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE +from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped from spatial import PostcodeSpatialIndex +from transform import 
validate_floor_area log = logging.getLogger("zoopla") @@ -94,15 +95,16 @@ _EXTRACT_LISTINGS_JS = r"""() => { const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i); let tenure = ''; - if (/freehold/i.test(text)) tenure = 'Freehold'; - else if (/leasehold/i.test(text)) tenure = 'Leasehold'; + if (/leasehold/i.test(text)) tenure = 'Leasehold'; + else if (/freehold/i.test(text)) tenure = 'Freehold'; results.push({ id, url: href.replace(window.location.origin, ''), price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, - beds: bedsMatch ? parseInt(bedsMatch[1]) : null, - baths: bathsMatch ? parseInt(bathsMatch[1]) : null, - receptions: recMatch ? parseInt(recMatch[1]) : null, + price_text: priceText.trim(), + beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null, + baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null, + receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null, floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, address, tenure, }); @@ -137,7 +139,9 @@ _EXTRACT_LISTINGS_JS = r"""() => { const text = card.innerText || ''; const lines = text.split('\n').map(l => l.trim()).filter(Boolean); - const priceMatch = text.match(/\u00a3([\d,]+)/); + const priceEl2 = card.querySelector('[data-testid="listing-price"]'); + const priceText2 = priceEl2 ? 
priceEl2.innerText : text; + const priceMatch = priceText2.match(/\u00a3([\d,]+)/); const bedsMatch = text.match(/(\d+)\s*beds?/i); const bathsMatch = text.match(/(\d+)\s*baths?/i); const recMatch = text.match(/(\d+)\s*reception/i); @@ -153,15 +157,16 @@ _EXTRACT_LISTINGS_JS = r"""() => { } let tenure = ''; - if (/freehold/i.test(text)) tenure = 'Freehold'; - else if (/leasehold/i.test(text)) tenure = 'Leasehold'; + if (/leasehold/i.test(text)) tenure = 'Leasehold'; + else if (/freehold/i.test(text)) tenure = 'Freehold'; results.push({ id, url: href.replace(window.location.origin, ''), price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, - beds: bedsMatch ? parseInt(bedsMatch[1]) : null, - baths: bathsMatch ? parseInt(bathsMatch[1]) : null, - receptions: recMatch ? parseInt(recMatch[1]) : null, + price_text: priceText2.trim(), + beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null, + baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null, + receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null, floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, address, tenure, }); @@ -597,6 +602,21 @@ def _map_property_type(raw_type: str | None) -> str: return "Other" +def _detect_rent_frequency(price_text: str) -> str: + """Detect rent frequency from Zoopla price text. + + Zoopla price elements contain text like '£1,500 pcm', '£350 pw', + '£18,000 pa'. Defaults to 'monthly' if no frequency indicator found. 
+ """ + # Earliest whole-token indicator wins (the price is the first £ amount in this text); + # bare "pa"/"pw" substring tests misfire on words like "spacious". No indicator → monthly. + m = re.search(r"(?<![a-z])(?:(pcm|pw|pa)|per\s+(month|week|annum|year))(?![a-z0-9])|/\s*(m|w|y)(?:onth|th|eek|k|ear|r)?(?![a-z0-9])", price_text.lower()) + unit = next(filter(None, m.groups())) if m else "" + if unit in ("pw", "week", "w"): + return "weekly" + return "yearly" if unit in ("pa", "annum", "year", "y") else "monthly" + + def transform_property( raw: dict, channel: str, @@ -608,7 +628,7 @@ def transform_property( Zoopla search cards do not include coordinates, so we resolve lat/lng from postcodes extracted from the address text.""" price = raw.get("price") - if not price: + if not price or int(price) <= 0: return None address = raw.get("address", "") @@ -647,21 +667,35 @@ def transform_property( if not (49 <= lat <= 56 and -7 <= lng <= 2): return None - bedrooms = raw.get("beds") or 0 - bathrooms = raw.get("baths") or 0 + raw_beds = raw.get("beds") or 0 + raw_baths = raw.get("baths") or 0 + bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0 + bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0 + if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: + log.warning( + "Zoopla %s: implausible beds=%d baths=%d (capped to 0)", + raw.get("id", "?"), raw_beds, raw_baths, + ) receptions = raw.get("receptions") or 0 # Floor area: convert sq ft to sq m floor_area_sqm = None sqft = raw.get("floor_area_sqft") if sqft: - floor_area_sqm = round(sqft * 0.092903, 1) + floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1)) listing_id = raw.get("id", "") listing_url = raw.get("url", "") if listing_url and not listing_url.startswith("http"): listing_url = ZOOPLA_BASE + listing_url + # Detect rent frequency from price text (e.g. 
"£1,500 pcm" vs "£350 pw") + if channel == "BUY": + frequency = "" + else: + price_text = raw.get("price_text", "") + frequency = _detect_rent_frequency(price_text) + return { "id": f"zp_{listing_id}", "Bedrooms": bedrooms, @@ -675,7 +709,7 @@ def transform_property( "Property type": "Other", # Not reliably extractable from Zoopla search cards "Property sub-type": "", "price": int(price), - "price_frequency": "" if channel == "BUY" else "monthly", + "price_frequency": frequency, "Price qualifier": "", "Total floor area (sqm)": floor_area_sqm, "Listing URL": listing_url, diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index 1c2f0f4..5e6c5b3 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -24,6 +24,7 @@ import { useSavedProperties } from './hooks/useSavedProperties'; declare global { interface Window { __screenshot_ready?: boolean; + __map_idle?: boolean; } } diff --git a/frontend/src/components/learn/LearnPage.tsx b/frontend/src/components/learn/LearnPage.tsx index d186f80..8adea94 100644 --- a/frontend/src/components/learn/LearnPage.tsx +++ b/frontend/src/components/learn/LearnPage.tsx @@ -69,6 +69,14 @@ const DATA_SOURCES = [ url: 'https://download.geofabrik.de/europe/great-britain-latest.osm.pbf', license: 'Open Data Commons Open Database License (ODbL)', }, + { + id: 'os-open-greenspace', + name: 'OS Open Greenspace', + origin: 'Ordnance Survey', + use: 'Authoritative green space boundaries for Great Britain, including public parks, gardens, playing fields, and play spaces. 
Polygon centroids are used for park proximity counts and distance-to-nearest-park calculations.', + url: 'https://osdatahub.os.uk/downloads/open/OpenGreenspace', + license: 'Open Government Licence v3.0', + }, { id: 'naptan', name: 'NaPTAN (Public Transport Stops)', @@ -101,14 +109,6 @@ const DATA_SOURCES = [ url: 'https://www.ofcom.org.uk/phones-and-broadband/coverage-and-speeds/connected-nations-20252/data-downloads-2025', license: 'Open Government Licence v3.0', }, - { - id: 'geosure', - name: 'GeoSure Ground Stability', - origin: 'Ordnance Survey', - use: 'Ground stability hazard ratings on a 5km hex grid covering Great Britain. Six risk categories (collapsible deposits, compressible ground, landslides, running sand, shrink-swell, and soluble rocks) rated Low, Moderate, or Significant. Spatial-joined to postcodes via centroid intersection.', - url: 'https://osdatahub.os.uk/downloads/open/GeoSure', - license: 'Open Government Licence v3.0', - }, { id: 'council-tax', name: 'Council Tax Levels 2025-26', diff --git a/frontend/src/components/map/AiFilterInput.tsx b/frontend/src/components/map/AiFilterInput.tsx index 29de7a8..0850e26 100644 --- a/frontend/src/components/map/AiFilterInput.tsx +++ b/frontend/src/components/map/AiFilterInput.tsx @@ -1,6 +1,7 @@ import { memo, useState, useCallback, useEffect, useRef } from 'react'; import { SpinnerIcon } from '../ui/icons/SpinnerIcon'; import { SparklesIcon } from '../ui/icons/SparklesIcon'; +import { ChevronIcon } from '../ui/icons/ChevronIcon'; import type { AiFilterErrorType } from '../../hooks/useAiFilters'; const EXAMPLE_QUERIES = [ @@ -13,6 +14,7 @@ const LOADING_MESSAGES = [ 'Analysing your query...', 'Searching for destinations...', 'Generating filters...', + 'Refining results...', ]; /** Cycle through loading messages to show progress. 
*/ @@ -28,9 +30,11 @@ function useLoadingMessage(loading: boolean): string { // Advance message every 1.5s timerRef.current = setTimeout(() => setIndex(1), 1500); const t2 = setTimeout(() => setIndex(2), 3500); + const t3 = setTimeout(() => setIndex(3), 5500); return () => { clearTimeout(timerRef.current); clearTimeout(t2); + clearTimeout(t3); }; }, [loading]); @@ -62,18 +66,45 @@ export default memo(function AiFilterInput({ const [expanded, setExpanded] = useState(false); const loadingMessage = useLoadingMessage(loading); const containerRef = useRef(null); + const textareaRef = useRef(null); + + const queryRef = useRef(query); + queryRef.current = query; useEffect(() => { if (!expanded || loading) return; const handler = (e: MouseEvent) => { if (containerRef.current && !containerRef.current.contains(e.target as Node)) { - setExpanded(false); + if (!queryRef.current.trim()) setExpanded(false); } }; document.addEventListener('mousedown', handler); return () => document.removeEventListener('mousedown', handler); }, [expanded, loading]); + const resizeTextarea = useCallback(() => { + const ta = textareaRef.current; + if (!ta) return; + ta.style.height = 'auto'; + ta.style.height = `${ta.scrollHeight}px`; + }, []); + + const handleKeyDown = useCallback( + (e: React.KeyboardEvent) => { + if (e.key === 'Enter' && !e.shiftKey) { + e.preventDefault(); + const trimmed = query.trim(); + if (!trimmed || loading) return; + if (!isLoggedIn) { + onLoginRequired(); + return; + } + onSubmit(trimmed); + } + }, + [query, loading, isLoggedIn, onLoginRequired, onSubmit] + ); + const handleSubmit = useCallback( (e: React.FormEvent) => { e.preventDefault(); @@ -129,14 +160,27 @@ export default memo(function AiFilterInput({ describe what you're looking for + -
- +