From 3a3e249bdd34fb84aae440b130636ed809f1a8da Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Wed, 25 Mar 2026 08:06:05 +0000 Subject: [PATCH] finder improvements --- finder/constants.py | 6 ++++ finder/homecouk.py | 69 +++++++++++++++++++++++++++++++++++++++++---- finder/openrent.py | 22 +++++++++++---- finder/storage.py | 51 ++++++++++++++++++++++++++++++--- finder/transform.py | 48 ++++++++++++++++++++++++++----- finder/zoopla.py | 68 +++++++++++++++++++++++++++++++++----------- 6 files changed, 225 insertions(+), 39 deletions(-) diff --git a/finder/constants.py b/finder/constants.py index f10d5da..d9ab10c 100644 --- a/finder/constants.py +++ b/finder/constants.py @@ -9,6 +9,12 @@ DELAY_BETWEEN_OUTCODES = 0.5 MAX_RETRIES = 3 RETRY_BASE_DELAY = 2.0 GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index +MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsing errors +# Rent sanity bounds (monthly). Rents outside this range are nulled out — they are +# almost always total-stay pricing (e.g. "Golf Open 2026" short lets), annual rents +# mislabelled as monthly, or data errors. +MIN_RENT_MONTHLY = 50 # below £50/month is implausible for any UK property +MAX_RENT_MONTHLY = 25_000 # above £25k/month covers ultra-prime London; higher is suspect SEED = 42 CHECKPOINT_INTERVAL = int(os.environ.get("CHECKPOINT_INTERVAL", "900")) # seconds diff --git a/finder/homecouk.py b/finder/homecouk.py index fc18fdf..f9e290f 100644 --- a/finder/homecouk.py +++ b/finder/homecouk.py @@ -15,6 +15,7 @@ from constants import ( HOMECOUK_API_BASE, HOMECOUK_BASE, HOMECOUK_PER_PAGE, + MAX_BEDROOMS, PROPERTY_TYPE_MAP, RETRY_BASE_DELAY, ) @@ -25,6 +26,7 @@ from metrics import ( homecouk_requests_total, ) from spatial import PostcodeSpatialIndex +from transform import validate_floor_area log = logging.getLogger("homecouk") @@ -216,10 +218,57 @@ def parse_floor_area(description: str | None) -> float | None: m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE) if m: sqft = float(m.group(1).replace(",", "")) - return round(sqft * 0.092903, 1) + return validate_floor_area(round(sqft * 0.092903, 1)) m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE) if m: - return round(float(m.group(1).replace(",", "")), 1) + return validate_floor_area(round(float(m.group(1).replace(",", "")), 1)) + return None + + +def parse_tenure(prop: dict) -> str | None: + """Extract tenure from home.co.uk property data. + + Checks multiple sources in priority order: + 1. Dedicated 'tenure' or 'tenure_type' field in the API response + 2. Free-text search in the description for 'freehold' / 'leasehold' + 3. Free-text search in features lists + + home.co.uk aggregates listings from estate agents, so tenure is often + embedded in the description text rather than a structured field. + """ + # 1. Check dedicated tenure fields (in case the API adds them) + for key in ("tenure", "tenure_type", "tenureType"): + val = prop.get(key) + if val and isinstance(val, str): + lower = val.lower().strip() + if "leasehold" in lower: + return "Leasehold" + if "freehold" in lower: + return "Freehold" + + # 2. Check description text — estate agents often include tenure here + description = prop.get("description") or "" + if description: + lower_desc = description.lower() + if re.search(r"\bleasehold\b", lower_desc): + return "Leasehold" + if re.search(r"\bfreehold\b", lower_desc): + # Matches "Freehold" and "Share of Freehold" (both = freehold ownership) + return "Freehold" + + # 3. Check features / key_features lists if present + for key in ("features", "key_features", "keyFeatures"): + features = prop.get(key) + if features and isinstance(features, list): + for feat in features: + if not isinstance(feat, str): + continue + lower_feat = feat.lower() + if "leasehold" in lower_feat: + return "Leasehold" + if "freehold" in lower_feat: + return "Freehold" + return None @@ -267,7 +316,7 @@ def transform_property( return None price = prop.get("price") or prop.get("latest_price") - if not price: + if not price or int(price) <= 0: return None # Home.co.uk provides postcodes directly, but fall back to spatial index @@ -278,8 +327,16 @@ def transform_property( log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng) return None - bedrooms = prop.get("bedrooms", 0) or 0 - bathrooms = prop.get("bathrooms", 0) or 0 + raw_beds = prop.get("bedrooms", 0) or 0 + raw_baths = prop.get("bathrooms", 0) or 0 + bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0 + bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0 + if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: + log.warning( + "home.co.uk %s: implausible beds=%d baths=%d (capped to 0)", + prop.get("listing_id") or prop.get("property_id") or "?", + raw_beds, raw_baths, + ) listing_type = prop.get("listing_property_type") or prop.get("property_type") or "" address = prop.get("display_address") or prop.get("address") or "" @@ -304,7 +361,7 @@ def transform_property( "lat": lat, "Postcode": postcode, "Address per Property Register": address, - "Leasehold/Freehold": None, # not available from home.co.uk + "Leasehold/Freehold": parse_tenure(prop), "Property type": map_property_type(listing_type), "Property sub-type": listing_type or "Unknown", "price": int(price), diff --git a/finder/openrent.py b/finder/openrent.py index 791e79c..ce27fe8 100644 --- a/finder/openrent.py +++ b/finder/openrent.py @@ -34,6 +34,7 @@ from playwright.sync_api import sync_playwright from constants import ( DELAY_BETWEEN_PAGES, + MAX_BEDROOMS, OPENRENT_BASE, PROPERTY_TYPE_MAP, RETRY_BASE_DELAY, @@ -45,6 +46,7 @@ from metrics import ( openrent_requests_total, ) from spatial import PostcodeSpatialIndex +from transform import validate_floor_area log = logging.getLogger("openrent") @@ -607,10 +609,10 @@ def parse_floor_area(description: str | None) -> float | None: m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE) if m: sqft = float(m.group(1).replace(",", "")) - return round(sqft * 0.092903, 1) + return validate_floor_area(round(sqft * 0.092903, 1)) m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE) if m: - return round(float(m.group(1).replace(",", "")), 1) + return validate_floor_area(round(float(m.group(1).replace(",", "")), 1)) return None @@ -651,7 +653,7 @@ def transform_property( lat = detail.get("lat") or search_data.get("lat") lng = detail.get("lng") or search_data.get("lng") price = detail.get("price") or search_data.get("price") - if not price: + if not price or int(price) <= 0: return None frequency = search_data.get("frequency", "monthly") @@ -701,8 +703,15 @@ def transform_property( log.debug("No postcode for property — skipping") return None - bedrooms = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0 - bathrooms = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0 + raw_beds = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0 + raw_baths = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0 + bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0 + bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0 + if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: + log.warning( + "OpenRent %s: implausible beds=%d baths=%d (capped to 0)", + search_data.get("id", "?"), raw_beds, raw_baths, + ) # Title: prefer detail page (has h1 with full title) title = detail.get("title") or search_data.get("title", "") @@ -746,6 +755,9 @@ def transform_property( "lat": lat, "Postcode": postcode, "Address per Property Register": address, + # OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a + # property ownership concept that doesn't apply to rental listings. The + # landlord's tenure is not shown on OpenRent listing pages. "Leasehold/Freehold": None, "Property type": map_property_type(property_type), "Property sub-type": property_type or "Unknown", diff --git a/finder/storage.py b/finder/storage.py index 9854188..4ab685f 100644 --- a/finder/storage.py +++ b/finder/storage.py @@ -4,6 +4,7 @@ from pathlib import Path import polars as pl +from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY from transform import normalize_price log = logging.getLogger("rightmove") @@ -18,6 +19,30 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None: log.warning("No properties to write to %s", path) return + # Sanitize bedroom/bathroom counts — values above MAX_BEDROOMS are + # almost certainly prices or other numeric fields mis-parsed as bedrooms. + bad_count = 0 + for p in properties: + for key in ("Bedrooms", "Bathrooms"): + val = p.get(key, 0) or 0 + if val > MAX_BEDROOMS: + bad_count += 1 + p[key] = None + # Recompute derived field after sanitization + beds = p.get("Bedrooms") + baths = p.get("Bathrooms") + if beds is None or baths is None: + p["Number of bedrooms & living rooms"] = None + else: + p["Number of bedrooms & living rooms"] = beds + baths + + if bad_count: + log.warning( + "Sanitized %d properties with bedroom/bathroom counts > %d (set to null)", + bad_count, + MAX_BEDROOMS, + ) + # Parse first_visible_date to datetime listing_dates = [] for p in properties: @@ -36,15 +61,33 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None: listing_dates.append(None) # Derive asking price / asking rent based on channel + # Zero prices indicate parsing failures or POA/auction listings — treat as null if channel == "buy": - asking_prices = [p["price"] for p in properties] + asking_prices = [p["price"] if p["price"] > 0 else None for p in properties] asking_rents = [None] * len(properties) listing_statuses = ["For sale"] * len(properties) else: asking_prices = [None] * len(properties) - asking_rents = [ - normalize_price(p["price"], p["price_frequency"]) for p in properties - ] + # Normalize to monthly, then apply sanity bounds. Rents outside + # [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always total-stay + # pricing (short lets), annual rents mislabelled as monthly, or £0 + # placeholders — null them out rather than polluting aggregates. + rent_outliers = 0 + asking_rents = [] + for p in properties: + monthly = normalize_price(p["price"], p["price_frequency"]) + if monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY: + rent_outliers += 1 + asking_rents.append(None) + else: + asking_rents.append(monthly) + if rent_outliers: + log.warning( + "Nulled %d rent outliers outside [£%d, £%d]/month", + rent_outliers, + MIN_RENT_MONTHLY, + MAX_RENT_MONTHLY, + ) listing_statuses = ["For rent"] * len(properties) df = pl.DataFrame( diff --git a/finder/transform.py b/finder/transform.py index 143d07a..1027220 100644 --- a/finder/transform.py +++ b/finder/transform.py @@ -1,12 +1,31 @@ import logging import re -from constants import PROPERTY_TYPE_MAP, RIGHTMOVE_BASE +from constants import MAX_BEDROOMS, PROPERTY_TYPE_MAP, RIGHTMOVE_BASE from spatial import PostcodeSpatialIndex log = logging.getLogger("rightmove") +# Maximum plausible floor area for a residential property listing (sqm). +# ~21,500 sq ft — covers even the largest UK mansions. +MAX_FLOOR_AREA_SQM = 2000.0 + + +def validate_floor_area(sqm: float | None) -> float | None: + """Validate a floor area value. Returns None for nonsensical values. + + Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM, + which catches parsing errors where prices or other large numbers are + mistakenly extracted as floor area from free-text descriptions or DOM text. + """ + if sqm is None: + return None + if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM: + return None + return sqm + + def parse_display_size(display_size: str | None) -> float | None: """Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm.""" if not display_size: @@ -15,11 +34,11 @@ def parse_display_size(display_size: str | None) -> float | None: m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE) if m: sqft = float(m.group(1).replace(",", "")) - return round(sqft * 0.092903, 1) + return validate_floor_area(round(sqft * 0.092903, 1)) # Try sq. m. m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE) if m: - return round(float(m.group(1).replace(",", "")), 1) + return validate_floor_area(round(float(m.group(1).replace(",", "")), 1)) return None @@ -92,19 +111,34 @@ def transform_property( price_obj = prop.get("price", {}) amount = price_obj.get("amount") - if amount is None: + if not amount: return None frequency = price_obj.get("frequency", "") - price = normalize_price(int(amount), frequency) + # Store raw price — normalization to monthly happens once in storage.py + price = int(amount) + if price <= 0: + return None display_prices = price_obj.get("displayPrices", []) price_qualifier = ( display_prices[0].get("displayPriceQualifier", "") if display_prices else "" ) + # POA / Auction listings have unreliable prices — treat as no price + pq_lower = price_qualifier.lower() + if "poa" in pq_lower or "auction" in pq_lower: + return None + sub_type = prop.get("propertySubType", "") - bedrooms = prop.get("bedrooms", 0) or 0 - bathrooms = prop.get("bathrooms", 0) or 0 + raw_beds = prop.get("bedrooms", 0) or 0 + raw_baths = prop.get("bathrooms", 0) or 0 + bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0 + bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0 + if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: + log.warning( + "Rightmove %s: implausible beds=%d baths=%d (capped to 0)", + prop.get("id", "?"), raw_beds, raw_baths, + ) key_features = [ kf.get("description", "") diff --git a/finder/zoopla.py b/finder/zoopla.py index 59372ad..f7a7bec 100644 --- a/finder/zoopla.py +++ b/finder/zoopla.py @@ -26,9 +26,10 @@ import logging import re import time -from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE +from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped from spatial import PostcodeSpatialIndex +from transform import validate_floor_area log = logging.getLogger("zoopla") @@ -94,15 +95,16 @@ _EXTRACT_LISTINGS_JS = r"""() => { const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i); let tenure = ''; - if (/freehold/i.test(text)) tenure = 'Freehold'; - else if (/leasehold/i.test(text)) tenure = 'Leasehold'; + if (/leasehold/i.test(text)) tenure = 'Leasehold'; + else if (/freehold/i.test(text)) tenure = 'Freehold'; results.push({ id, url: href.replace(window.location.origin, ''), price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, - beds: bedsMatch ? parseInt(bedsMatch[1]) : null, - baths: bathsMatch ? parseInt(bathsMatch[1]) : null, - receptions: recMatch ? parseInt(recMatch[1]) : null, + price_text: priceText.trim(), + beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null, + baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null, + receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null, floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, address, tenure, }); @@ -137,7 +139,9 @@ _EXTRACT_LISTINGS_JS = r"""() => { const text = card.innerText || ''; const lines = text.split('\n').map(l => l.trim()).filter(Boolean); - const priceMatch = text.match(/\u00a3([\d,]+)/); + const priceEl2 = card.querySelector('[data-testid="listing-price"]'); + const priceText2 = priceEl2 ? priceEl2.innerText : text; + const priceMatch = priceText2.match(/\u00a3([\d,]+)/); const bedsMatch = text.match(/(\d+)\s*beds?/i); const bathsMatch = text.match(/(\d+)\s*baths?/i); const recMatch = text.match(/(\d+)\s*reception/i); @@ -153,15 +157,16 @@ _EXTRACT_LISTINGS_JS = r"""() => { } let tenure = ''; - if (/freehold/i.test(text)) tenure = 'Freehold'; - else if (/leasehold/i.test(text)) tenure = 'Leasehold'; + if (/leasehold/i.test(text)) tenure = 'Leasehold'; + else if (/freehold/i.test(text)) tenure = 'Freehold'; results.push({ id, url: href.replace(window.location.origin, ''), price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, - beds: bedsMatch ? parseInt(bedsMatch[1]) : null, - baths: bathsMatch ? parseInt(bathsMatch[1]) : null, - receptions: recMatch ? parseInt(recMatch[1]) : null, + price_text: priceText2.trim(), + beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null, + baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null, + receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null, floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, address, tenure, }); @@ -597,6 +602,21 @@ def _map_property_type(raw_type: str | None) -> str: return "Other" +def _detect_rent_frequency(price_text: str) -> str: + """Detect rent frequency from Zoopla price text. + + Zoopla price elements contain text like '£1,500 pcm', '£350 pw', + '£18,000 pa'. Defaults to 'monthly' if no frequency indicator found. + """ + lower = price_text.lower() + if "pw" in lower or "per week" in lower or "/w" in lower: + return "weekly" + if "pa" in lower or "per annum" in lower or "/y" in lower or "per year" in lower: + return "yearly" + # pcm, per month, /m, or no indicator — default monthly + return "monthly" + + def transform_property( raw: dict, channel: str, @@ -608,7 +628,7 @@ def transform_property( Zoopla search cards do not include coordinates, so we resolve lat/lng from postcodes extracted from the address text.""" price = raw.get("price") - if not price: + if not price or int(price) <= 0: return None address = raw.get("address", "") @@ -647,21 +667,35 @@ def transform_property( if not (49 <= lat <= 56 and -7 <= lng <= 2): return None - bedrooms = raw.get("beds") or 0 - bathrooms = raw.get("baths") or 0 + raw_beds = raw.get("beds") or 0 + raw_baths = raw.get("baths") or 0 + bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0 + bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0 + if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: + log.warning( + "Zoopla %s: implausible beds=%d baths=%d (capped to 0)", + raw.get("id", "?"), raw_beds, raw_baths, + ) receptions = raw.get("receptions") or 0 # Floor area: convert sq ft to sq m floor_area_sqm = None sqft = raw.get("floor_area_sqft") if sqft: - floor_area_sqm = round(sqft * 0.092903, 1) + floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1)) listing_id = raw.get("id", "") listing_url = raw.get("url", "") if listing_url and not listing_url.startswith("http"): listing_url = ZOOPLA_BASE + listing_url + # Detect rent frequency from price text (e.g. "£1,500 pcm" vs "£350 pw") + if channel == "BUY": + frequency = "" + else: + price_text = raw.get("price_text", "") + frequency = _detect_rent_frequency(price_text) + return { "id": f"zp_{listing_id}", "Bedrooms": bedrooms, @@ -675,7 +709,7 @@ def transform_property( "Property type": "Other", # Not reliably extractable from Zoopla search cards "Property sub-type": "", "price": int(price), - "price_frequency": "" if channel == "BUY" else "monthly", + "price_frequency": frequency, "Price qualifier": "", "Total floor area (sqm)": floor_area_sqm, "Listing URL": listing_url,