finder improvements

This commit is contained in:
Andras Schmelczer 2026-03-25 08:06:05 +00:00
parent 30055ab870
commit 3a3e249bdd
6 changed files with 225 additions and 39 deletions

View file

@ -9,6 +9,12 @@ DELAY_BETWEEN_OUTCODES = 0.5
# Retry policy shared by the scrapers.
# NOTE(review): RETRY_BASE_DELAY is presumably in seconds and used as a
# backoff base — confirm against the retry loops that import it.
MAX_RETRIES = 3
RETRY_BASE_DELAY = 2.0
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
# Upper bound for plausible room counts. The scrapers apply this same cap to
# bathroom counts as well as bedrooms, and the Zoopla in-page extraction
# script hard-codes the literal 20 — keep the two in sync when changing it.
MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsing errors
# Rent sanity bounds (monthly). Rents outside this range are nulled out — they are
# almost always total-stay pricing (e.g. "Golf Open 2026" short lets), annual rents
# mislabelled as monthly, or data errors.
MIN_RENT_MONTHLY = 50 # below £50/month is implausible for any UK property
MAX_RENT_MONTHLY = 25_000 # £25k/month covers ultra-prime London; anything higher is suspect
SEED = 42
# Overridable through the CHECKPOINT_INTERVAL environment variable (default
# "900"). int() raises ValueError if the variable holds a non-integer string.
CHECKPOINT_INTERVAL = int(os.environ.get("CHECKPOINT_INTERVAL", "900")) # seconds

View file

@ -15,6 +15,7 @@ from constants import (
HOMECOUK_API_BASE,
HOMECOUK_BASE,
HOMECOUK_PER_PAGE,
MAX_BEDROOMS,
PROPERTY_TYPE_MAP,
RETRY_BASE_DELAY,
)
@ -25,6 +26,7 @@ from metrics import (
homecouk_requests_total,
)
from spatial import PostcodeSpatialIndex
from transform import validate_floor_area
log = logging.getLogger("homecouk")
@ -216,10 +218,57 @@ def parse_floor_area(description: str | None) -> float | None:
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
if m:
sqft = float(m.group(1).replace(",", ""))
return round(sqft * 0.092903, 1)
return validate_floor_area(round(sqft * 0.092903, 1))
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
if m:
return round(float(m.group(1).replace(",", "")), 1)
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
return None
def parse_tenure(prop: dict) -> str | None:
"""Extract tenure from home.co.uk property data.
Checks multiple sources in priority order:
1. Dedicated 'tenure' or 'tenure_type' field in the API response
2. Free-text search in the description for 'freehold' / 'leasehold'
3. Free-text search in features lists
home.co.uk aggregates listings from estate agents, so tenure is often
embedded in the description text rather than a structured field.
"""
# 1. Check dedicated tenure fields (in case the API adds them)
for key in ("tenure", "tenure_type", "tenureType"):
val = prop.get(key)
if val and isinstance(val, str):
lower = val.lower().strip()
if "leasehold" in lower:
return "Leasehold"
if "freehold" in lower:
return "Freehold"
# 2. Check description text — estate agents often include tenure here
description = prop.get("description") or ""
if description:
lower_desc = description.lower()
if re.search(r"\bleasehold\b", lower_desc):
return "Leasehold"
if re.search(r"\bfreehold\b", lower_desc):
# Matches "Freehold" and "Share of Freehold" (both = freehold ownership)
return "Freehold"
# 3. Check features / key_features lists if present
for key in ("features", "key_features", "keyFeatures"):
features = prop.get(key)
if features and isinstance(features, list):
for feat in features:
if not isinstance(feat, str):
continue
lower_feat = feat.lower()
if "leasehold" in lower_feat:
return "Leasehold"
if "freehold" in lower_feat:
return "Freehold"
return None
@ -267,7 +316,7 @@ def transform_property(
return None
price = prop.get("price") or prop.get("latest_price")
if not price:
if not price or int(price) <= 0:
return None
# Home.co.uk provides postcodes directly, but fall back to spatial index
@ -278,8 +327,16 @@ def transform_property(
log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
return None
bedrooms = prop.get("bedrooms", 0) or 0
bathrooms = prop.get("bathrooms", 0) or 0
raw_beds = prop.get("bedrooms", 0) or 0
raw_baths = prop.get("bathrooms", 0) or 0
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
log.warning(
"home.co.uk %s: implausible beds=%d baths=%d (capped to 0)",
prop.get("listing_id") or prop.get("property_id") or "?",
raw_beds, raw_baths,
)
listing_type = prop.get("listing_property_type") or prop.get("property_type") or ""
address = prop.get("display_address") or prop.get("address") or ""
@ -304,7 +361,7 @@ def transform_property(
"lat": lat,
"Postcode": postcode,
"Address per Property Register": address,
"Leasehold/Freehold": None, # not available from home.co.uk
"Leasehold/Freehold": parse_tenure(prop),
"Property type": map_property_type(listing_type),
"Property sub-type": listing_type or "Unknown",
"price": int(price),

View file

@ -34,6 +34,7 @@ from playwright.sync_api import sync_playwright
from constants import (
DELAY_BETWEEN_PAGES,
MAX_BEDROOMS,
OPENRENT_BASE,
PROPERTY_TYPE_MAP,
RETRY_BASE_DELAY,
@ -45,6 +46,7 @@ from metrics import (
openrent_requests_total,
)
from spatial import PostcodeSpatialIndex
from transform import validate_floor_area
log = logging.getLogger("openrent")
@ -607,10 +609,10 @@ def parse_floor_area(description: str | None) -> float | None:
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
if m:
sqft = float(m.group(1).replace(",", ""))
return round(sqft * 0.092903, 1)
return validate_floor_area(round(sqft * 0.092903, 1))
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
if m:
return round(float(m.group(1).replace(",", "")), 1)
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
return None
@ -651,7 +653,7 @@ def transform_property(
lat = detail.get("lat") or search_data.get("lat")
lng = detail.get("lng") or search_data.get("lng")
price = detail.get("price") or search_data.get("price")
if not price:
if not price or int(price) <= 0:
return None
frequency = search_data.get("frequency", "monthly")
@ -701,8 +703,15 @@ def transform_property(
log.debug("No postcode for property — skipping")
return None
bedrooms = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
bathrooms = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
raw_beds = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
raw_baths = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
log.warning(
"OpenRent %s: implausible beds=%d baths=%d (capped to 0)",
search_data.get("id", "?"), raw_beds, raw_baths,
)
# Title: prefer detail page (has h1 with full title)
title = detail.get("title") or search_data.get("title", "")
@ -746,6 +755,9 @@ def transform_property(
"lat": lat,
"Postcode": postcode,
"Address per Property Register": address,
# OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a
# property ownership concept that doesn't apply to rental listings. The
# landlord's tenure is not shown on OpenRent listing pages.
"Leasehold/Freehold": None,
"Property type": map_property_type(property_type),
"Property sub-type": property_type or "Unknown",

View file

@ -4,6 +4,7 @@ from pathlib import Path
import polars as pl
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
from transform import normalize_price
log = logging.getLogger("rightmove")
@ -18,6 +19,30 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
log.warning("No properties to write to %s", path)
return
# Sanitize bedroom/bathroom counts — values above MAX_BEDROOMS are
# almost certainly prices or other numeric fields mis-parsed as bedrooms.
bad_count = 0
for p in properties:
for key in ("Bedrooms", "Bathrooms"):
val = p.get(key, 0) or 0
if val > MAX_BEDROOMS:
bad_count += 1
p[key] = None
# Recompute derived field after sanitization
beds = p.get("Bedrooms")
baths = p.get("Bathrooms")
if beds is None or baths is None:
p["Number of bedrooms & living rooms"] = None
else:
p["Number of bedrooms & living rooms"] = beds + baths
if bad_count:
log.warning(
"Sanitized %d properties with bedroom/bathroom counts > %d (set to null)",
bad_count,
MAX_BEDROOMS,
)
# Parse first_visible_date to datetime
listing_dates = []
for p in properties:
@ -36,15 +61,33 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
listing_dates.append(None)
# Derive asking price / asking rent based on channel
# Zero prices indicate parsing failures or POA/auction listings — treat as null
if channel == "buy":
asking_prices = [p["price"] for p in properties]
asking_prices = [p["price"] if p["price"] > 0 else None for p in properties]
asking_rents = [None] * len(properties)
listing_statuses = ["For sale"] * len(properties)
else:
asking_prices = [None] * len(properties)
asking_rents = [
normalize_price(p["price"], p["price_frequency"]) for p in properties
]
# Normalize to monthly, then apply sanity bounds. Rents outside
# [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always total-stay
# pricing (short lets), annual rents mislabelled as monthly, or £0
# placeholders — null them out rather than polluting aggregates.
rent_outliers = 0
asking_rents = []
for p in properties:
monthly = normalize_price(p["price"], p["price_frequency"])
if monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY:
rent_outliers += 1
asking_rents.append(None)
else:
asking_rents.append(monthly)
if rent_outliers:
log.warning(
"Nulled %d rent outliers outside [£%d, £%d]/month",
rent_outliers,
MIN_RENT_MONTHLY,
MAX_RENT_MONTHLY,
)
listing_statuses = ["For rent"] * len(properties)
df = pl.DataFrame(

View file

@ -1,12 +1,31 @@
import logging
import re
from constants import PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
from constants import MAX_BEDROOMS, PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
from spatial import PostcodeSpatialIndex
log = logging.getLogger("rightmove")
# Maximum plausible floor area for a residential property listing (sqm).
# ~21,500 sq ft — covers even the largest UK mansions.
MAX_FLOOR_AREA_SQM = 2000.0
def validate_floor_area(sqm: float | None) -> float | None:
"""Validate a floor area value. Returns None for nonsensical values.
Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM,
which catches parsing errors where prices or other large numbers are
mistakenly extracted as floor area from free-text descriptions or DOM text.
"""
if sqm is None:
return None
if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM:
return None
return sqm
def parse_display_size(display_size: str | None) -> float | None:
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
if not display_size:
@ -15,11 +34,11 @@ def parse_display_size(display_size: str | None) -> float | None:
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
if m:
sqft = float(m.group(1).replace(",", ""))
return round(sqft * 0.092903, 1)
return validate_floor_area(round(sqft * 0.092903, 1))
# Try sq. m.
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
if m:
return round(float(m.group(1).replace(",", "")), 1)
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
return None
@ -92,19 +111,34 @@ def transform_property(
price_obj = prop.get("price", {})
amount = price_obj.get("amount")
if amount is None:
if not amount:
return None
frequency = price_obj.get("frequency", "")
price = normalize_price(int(amount), frequency)
# Store raw price — normalization to monthly happens once in storage.py
price = int(amount)
if price <= 0:
return None
display_prices = price_obj.get("displayPrices", [])
price_qualifier = (
display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
)
# POA / Auction listings have unreliable prices — treat as no price
pq_lower = price_qualifier.lower()
if "poa" in pq_lower or "auction" in pq_lower:
return None
sub_type = prop.get("propertySubType", "")
bedrooms = prop.get("bedrooms", 0) or 0
bathrooms = prop.get("bathrooms", 0) or 0
raw_beds = prop.get("bedrooms", 0) or 0
raw_baths = prop.get("bathrooms", 0) or 0
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
log.warning(
"Rightmove %s: implausible beds=%d baths=%d (capped to 0)",
prop.get("id", "?"), raw_beds, raw_baths,
)
key_features = [
kf.get("description", "")

View file

@ -26,9 +26,10 @@ import logging
import re
import time
from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
from spatial import PostcodeSpatialIndex
from transform import validate_floor_area
log = logging.getLogger("zoopla")
@ -94,15 +95,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
let tenure = '';
if (/freehold/i.test(text)) tenure = 'Freehold';
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
if (/leasehold/i.test(text)) tenure = 'Leasehold';
else if (/freehold/i.test(text)) tenure = 'Freehold';
results.push({
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
receptions: recMatch ? parseInt(recMatch[1]) : null,
price_text: priceText.trim(),
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
address, tenure,
});
@ -137,7 +139,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
const text = card.innerText || '';
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
const priceMatch = text.match(/\u00a3([\d,]+)/);
const priceEl2 = card.querySelector('[data-testid="listing-price"]');
const priceText2 = priceEl2 ? priceEl2.innerText : text;
const priceMatch = priceText2.match(/\u00a3([\d,]+)/);
const bedsMatch = text.match(/(\d+)\s*beds?/i);
const bathsMatch = text.match(/(\d+)\s*baths?/i);
const recMatch = text.match(/(\d+)\s*reception/i);
@ -153,15 +157,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
}
let tenure = '';
if (/freehold/i.test(text)) tenure = 'Freehold';
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
if (/leasehold/i.test(text)) tenure = 'Leasehold';
else if (/freehold/i.test(text)) tenure = 'Freehold';
results.push({
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
receptions: recMatch ? parseInt(recMatch[1]) : null,
price_text: priceText2.trim(),
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
address, tenure,
});
@ -597,6 +602,21 @@ def _map_property_type(raw_type: str | None) -> str:
return "Other"
# Frequency indicators recognised in Zoopla price text. Abbreviations and
# "per ..." phrases must stand alone as words so that ordinary card text such
# as "spacious", "apartment" or "park" is not mistaken for "pa".
_RENT_FREQUENCY_RE = re.compile(
    r"(?<![a-z])(?:"
    r"(?P<weekly>pw|per\s+week)"
    r"|(?P<yearly>pa|per\s+annum|per\s+year)"
    r"|(?P<monthly>pcm|per\s+(?:calendar\s+)?month)"
    r")(?![a-z])"
    r"|/\s*(?:(?P<weekly_slash>w)|(?P<yearly_slash>y)|(?P<monthly_slash>m))"
)


def _detect_rent_frequency(price_text: str) -> str:
    """Detect the rent frequency that belongs to the first price in the text.

    Zoopla price elements contain text like '£1,500 pcm', '£350 pw' or
    '£18,000 pa', and may show several figures at once
    ('£1,500 pcm £346 pw'). The scraped price is the first '£' amount, so the
    frequency returned is the first indicator that follows that amount.

    Args:
        price_text: Text of the price element, or the whole card text when
            the price element could not be located. ``None`` and ``""`` are
            treated as having no indicator.

    Returns:
        ``"weekly"``, ``"yearly"`` or ``"monthly"``. Defaults to
        ``"monthly"`` when no frequency indicator is found.
    """
    text = (price_text or "").lower()

    # Anchor the search just after the first price so that a secondary
    # figure (e.g. the weekly equivalent of a monthly rent) cannot win.
    amount = re.search(r"£\s*[\d,]+(?:\.\d+)?", text)
    start = amount.end() if amount else 0

    # Fall back to the whole text if nothing follows the price.
    match = _RENT_FREQUENCY_RE.search(text, start) or _RENT_FREQUENCY_RE.search(text)
    if match is None:
        # pcm, per month, /m, or no indicator — default monthly
        return "monthly"

    # Group names are "<frequency>" or "<frequency>_slash".
    return match.lastgroup.partition("_")[0]
def transform_property(
raw: dict,
channel: str,
@ -608,7 +628,7 @@ def transform_property(
Zoopla search cards do not include coordinates, so we resolve lat/lng
from postcodes extracted from the address text."""
price = raw.get("price")
if not price:
if not price or int(price) <= 0:
return None
address = raw.get("address", "")
@ -647,21 +667,35 @@ def transform_property(
if not (49 <= lat <= 56 and -7 <= lng <= 2):
return None
bedrooms = raw.get("beds") or 0
bathrooms = raw.get("baths") or 0
raw_beds = raw.get("beds") or 0
raw_baths = raw.get("baths") or 0
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
log.warning(
"Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
raw.get("id", "?"), raw_beds, raw_baths,
)
receptions = raw.get("receptions") or 0
# Floor area: convert sq ft to sq m
floor_area_sqm = None
sqft = raw.get("floor_area_sqft")
if sqft:
floor_area_sqm = round(sqft * 0.092903, 1)
floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1))
listing_id = raw.get("id", "")
listing_url = raw.get("url", "")
if listing_url and not listing_url.startswith("http"):
listing_url = ZOOPLA_BASE + listing_url
# Detect rent frequency from price text (e.g. "£1,500 pcm" vs "£350 pw")
if channel == "BUY":
frequency = ""
else:
price_text = raw.get("price_text", "")
frequency = _detect_rent_frequency(price_text)
return {
"id": f"zp_{listing_id}",
"Bedrooms": bedrooms,
@ -675,7 +709,7 @@ def transform_property(
"Property type": "Other", # Not reliably extractable from Zoopla search cards
"Property sub-type": "",
"price": int(price),
"price_frequency": "" if channel == "BUY" else "monthly",
"price_frequency": frequency,
"Price qualifier": "",
"Total floor area (sqm)": floor_area_sqm,
"Listing URL": listing_url,