finder improvements

This commit is contained in:
Andras Schmelczer 2026-03-25 08:06:05 +00:00
parent 30055ab870
commit 3a3e249bdd
6 changed files with 225 additions and 39 deletions

View file

@ -26,9 +26,10 @@ import logging
import re
import time
from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
from spatial import PostcodeSpatialIndex
from transform import validate_floor_area
log = logging.getLogger("zoopla")
@ -94,15 +95,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
let tenure = '';
if (/freehold/i.test(text)) tenure = 'Freehold';
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
if (/leasehold/i.test(text)) tenure = 'Leasehold';
else if (/freehold/i.test(text)) tenure = 'Freehold';
results.push({
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
receptions: recMatch ? parseInt(recMatch[1]) : null,
price_text: priceText.trim(),
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
address, tenure,
});
@ -137,7 +139,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
const text = card.innerText || '';
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
const priceMatch = text.match(/\u00a3([\d,]+)/);
const priceEl2 = card.querySelector('[data-testid="listing-price"]');
const priceText2 = priceEl2 ? priceEl2.innerText : text;
const priceMatch = priceText2.match(/\u00a3([\d,]+)/);
const bedsMatch = text.match(/(\d+)\s*beds?/i);
const bathsMatch = text.match(/(\d+)\s*baths?/i);
const recMatch = text.match(/(\d+)\s*reception/i);
@ -153,15 +157,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
}
let tenure = '';
if (/freehold/i.test(text)) tenure = 'Freehold';
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
if (/leasehold/i.test(text)) tenure = 'Leasehold';
else if (/freehold/i.test(text)) tenure = 'Freehold';
results.push({
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
receptions: recMatch ? parseInt(recMatch[1]) : null,
price_text: priceText2.trim(),
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
address, tenure,
});
@ -597,6 +602,21 @@ def _map_property_type(raw_type: str | None) -> str:
return "Other"
# Whole-token rent frequency markers. Each named group is the value returned
# by _detect_rent_frequency. Alphabetic markers must not sit inside a longer
# word, so "pa" in "spacious"/"apartment"/"park" is not read as "per annum".
_RENT_FREQUENCY_RE = re.compile(
    r"""
    (?:
        (?P<weekly>  (?<![a-z])(?:p/?w | per\s+week)
                   | /\s*w(?:k|eek)? )
      | (?P<yearly>  (?<![a-z])(?:p/?a | per\s+annum | per\s+year)
                   | /\s*y(?:r|ear)? )
      | (?P<monthly> (?<![a-z])(?:p/?c?m | per\s+(?:calendar\s+)?month)
                   | /\s*m(?:o|th|onth)? )
    )
    (?![a-z])   # the marker must end the word ("pa" yes, "park" no)
    """,
    re.VERBOSE,
)


def _detect_rent_frequency(price_text: str) -> str:
    """Detect rent frequency from Zoopla price text.

    Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
    '£18,000 pa'. The text may also carry more than one amount (for example
    a monthly figure followed by its weekly equivalent) or, when the price
    element is missing, the whole card text.

    The scraper takes the price from the first '£' amount in this text, so
    the frequency returned is the first marker found at or after the first
    '£' sign; that keeps the frequency paired with the stored price.

    Args:
        price_text: Raw price text scraped from a listing card. ``None`` and
            the empty string are accepted.

    Returns:
        ``"weekly"``, ``"yearly"`` or ``"monthly"``. Defaults to
        ``"monthly"`` if no frequency indicator is found.
    """
    lower = (price_text or "").lower()
    # Start at the first pound sign so words before the price are ignored;
    # if there is no pound sign at all, scan the whole text.
    start = max(lower.find("\u00a3"), 0)
    match = _RENT_FREQUENCY_RE.search(lower, start)
    if match is None:
        # pcm is Zoopla's usual unit, so an unmarked price is treated as monthly.
        return "monthly"
    # Only the three frequency groups capture, so lastgroup names the hit.
    return match.lastgroup
def transform_property(
raw: dict,
channel: str,
@ -608,7 +628,7 @@ def transform_property(
Zoopla search cards do not include coordinates, so we resolve lat/lng
from postcodes extracted from the address text."""
price = raw.get("price")
if not price:
if not price or int(price) <= 0:
return None
address = raw.get("address", "")
@ -647,21 +667,35 @@ def transform_property(
if not (49 <= lat <= 56 and -7 <= lng <= 2):
return None
bedrooms = raw.get("beds") or 0
bathrooms = raw.get("baths") or 0
raw_beds = raw.get("beds") or 0
raw_baths = raw.get("baths") or 0
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
log.warning(
"Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
raw.get("id", "?"), raw_beds, raw_baths,
)
receptions = raw.get("receptions") or 0
# Floor area: convert sq ft to sq m
floor_area_sqm = None
sqft = raw.get("floor_area_sqft")
if sqft:
floor_area_sqm = round(sqft * 0.092903, 1)
floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1))
listing_id = raw.get("id", "")
listing_url = raw.get("url", "")
if listing_url and not listing_url.startswith("http"):
listing_url = ZOOPLA_BASE + listing_url
# Detect rent frequency from price text (e.g. "£1,500 pcm" vs "£350 pw")
if channel == "BUY":
frequency = ""
else:
price_text = raw.get("price_text", "")
frequency = _detect_rent_frequency(price_text)
return {
"id": f"zp_{listing_id}",
"Bedrooms": bedrooms,
@ -675,7 +709,7 @@ def transform_property(
"Property type": "Other", # Not reliably extractable from Zoopla search cards
"Property sub-type": "",
"price": int(price),
"price_frequency": "" if channel == "BUY" else "monthly",
"price_frequency": frequency,
"Price qualifier": "",
"Total floor area (sqm)": floor_area_sqm,
"Listing URL": listing_url,