all is well
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 7m0s
CI / Check (push) Failing after 7m9s

This commit is contained in:
Andras Schmelczer 2026-05-17 17:20:19 +01:00
parent eac1bd0d13
commit 2f149503bb
53 changed files with 1543 additions and 354 deletions

View file

@ -24,7 +24,7 @@ import time
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from spatial import PostcodeSpatialIndex
from transform import normalize_sub_type, validate_floor_area
from transform import normalize_sub_type, parse_int_value, validate_floor_area
log = logging.getLogger("zoopla")
@ -106,7 +106,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
const bedsMatch = text.match(/(\d+)\s*beds?/i);
const bathsMatch = text.match(/(\d+)\s*baths?/i);
const recMatch = text.match(/(\d+)\s*reception/i);
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
let tenure = '';
if (/leasehold/i.test(text)) tenure = 'Leasehold';
@ -141,7 +142,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
address, tenure, property_type,
});
}
@ -181,7 +183,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
const bedsMatch = text.match(/(\d+)\s*beds?/i);
const bathsMatch = text.match(/(\d+)\s*baths?/i);
const recMatch = text.match(/(\d+)\s*reception/i);
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
let address = '';
for (const line of lines) {
@ -225,7 +228,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
address, tenure, property_type,
});
}
@ -611,7 +615,22 @@ def _map_property_type(raw_type: str | None) -> str:
return canonical
# Keyword fallback
lower = raw_type.lower()
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
excluded_flat_like = (
"block of apartment",
"house of multiple occupation",
"private halls",
"retirement",
"serviced apartment",
)
if any(term in lower for term in excluded_flat_like):
return "Other"
if (
"flat" in lower
or "apartment" in lower
or "maisonette" in lower
or "studio" in lower
or "penthouse" in lower
):
return "Flats/Maisonettes"
if "semi" in lower and "detach" in lower:
return "Semi-Detached"
@ -634,8 +653,8 @@ def transform_property(
Zoopla search cards do not include coordinates, so we resolve lat/lng
from postcodes extracted from the address text."""
price = raw.get("price")
if not price or int(price) <= 0:
price = parse_int_value(raw.get("price"))
if not price or price <= 0:
return None
address = raw.get("address", "")
@ -670,10 +689,10 @@ def transform_property(
if not (49 <= lat <= 56 and -7 <= lng <= 2):
return None
raw_beds = raw.get("beds") or 0
raw_baths = raw.get("baths") or 0
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
raw_beds = parse_int_value(raw.get("beds")) or 0
raw_baths = parse_int_value(raw.get("baths")) or 0
bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
log.warning(
"Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
@ -683,9 +702,13 @@ def transform_property(
# Floor area: prefer a listed sq m value; otherwise convert sq ft to sq m
floor_area_sqm = None
sqft = raw.get("floor_area_sqft")
if sqft:
floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1))
raw_sqm = raw.get("floor_area_sqm")
if raw_sqm:
floor_area_sqm = validate_floor_area(round(float(raw_sqm), 1))
else:
sqft = raw.get("floor_area_sqft")
if sqft:
floor_area_sqm = validate_floor_area(round(float(sqft) * 0.092903, 1))
listing_id = raw.get("id", "")
listing_url = raw.get("url", "")
@ -704,7 +727,7 @@ def transform_property(
"Leasehold/Freehold": raw.get("tenure") or None,
"Property type": _map_property_type(raw.get("property_type")),
"Property sub-type": normalize_sub_type(raw.get("property_type")),
"price": int(price),
"price": price,
"price_frequency": "",
"Price qualifier": "",
"Total floor area (sqm)": floor_area_sqm,
@ -760,7 +783,18 @@ def search_outcode(
properties = []
dropped = 0
for raw in raw_listings:
transformed = transform_property(raw, pc_index, pc_coords, search_outcode=outcode)
try:
transformed = transform_property(
raw, pc_index, pc_coords, search_outcode=outcode
)
except Exception as exc:
log.warning(
"Zoopla %s property %s failed to transform: %s",
outcode,
raw.get("id", "?"),
exc,
)
transformed = None
if transformed:
properties.append(transformed)
else: