all is well
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 7m0s
CI / Check (push) Failing after 7m9s

This commit is contained in:
Andras Schmelczer 2026-05-17 17:20:19 +01:00
parent eac1bd0d13
commit 2f149503bb
53 changed files with 1543 additions and 354 deletions

View file

@ -82,7 +82,7 @@ PROPERTY_TYPE_MAP = {
"Farm / Barn": "Other",
"Farm House": "Other",
"House": "Detached",
"House of Multiple Occupation": "Flats/Maisonettes",
"House of Multiple Occupation": "Other",
"House Share": "Other",
"Not Specified": "Other",
"Chalet": "Other",
@ -90,15 +90,15 @@ PROPERTY_TYPE_MAP = {
"Coach House": "Other",
"Character Property": "Other",
"Cluster House": "Other",
"Retirement Property": "Flats/Maisonettes",
"Retirement Property": "Other",
"Parking": "Other",
"Plot": "Other",
"Garages": "Other",
"Mews": "Terraced",
"Property": "Other",
"Flat Share": "Other",
"Block of Apartments": "Flats/Maisonettes",
"Private Halls": "Flats/Maisonettes",
"Block of Apartments": "Other",
"Private Halls": "Other",
"Terraced Bungalow": "Terraced",
"Equestrian Facility": "Other",
"Ground Maisonette": "Flats/Maisonettes",
@ -107,13 +107,13 @@ PROPERTY_TYPE_MAP = {
"Farm Land": "Other",
"House Boat": "Other",
"Barn": "Other",
"Serviced Apartments": "Flats/Maisonettes",
"Serviced Apartments": "Other",
# Space-separated variants (from home.co.uk underscore/hyphen normalization)
"Semi Detached": "Semi-Detached",
"Semi Detached Bungalow": "Semi-Detached",
"End Of Terrace": "Terraced",
"End Terrace": "Terraced",
"Block Of Apartments": "Flats/Maisonettes",
"Block Of Apartments": "Other",
# Lowercase variants (from home.co.uk / Rightmove APIs)
"house": "Detached",
"bungalow": "Other",
@ -121,7 +121,7 @@ PROPERTY_TYPE_MAP = {
"land": "Other",
"other": "Other",
"not-specified": "Other",
"retirement-property": "Flats/Maisonettes",
"retirement-property": "Other",
"equestrian-facility": "Other",
"flat": "Flats/Maisonettes",
"detached": "Detached",

View file

@ -19,7 +19,12 @@ from constants import (
RETRY_BASE_DELAY,
)
from spatial import PostcodeSpatialIndex
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
from transform import (
normalize_postcode,
normalize_sub_type,
parse_int_value,
validate_floor_area,
)
log = logging.getLogger("homecouk")
@ -170,11 +175,19 @@ def parse_floor_area(description: str | None) -> float | None:
"""Try to extract floor area from description text like '789 sq.ft.' or '73 sq.m.'."""
if not description:
return None
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
m = re.search(
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))",
description,
re.IGNORECASE,
)
if m:
sqft = float(m.group(1).replace(",", ""))
return validate_floor_area(round(sqft * 0.092903, 1))
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
m = re.search(
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))",
description,
re.IGNORECASE,
)
if m:
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
return None
@ -237,6 +250,15 @@ def map_property_type(raw_type: str | None) -> str:
# Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc.
# Try common patterns
lower = raw_type.lower()
excluded_flat_like = (
"block of apartment",
"house of multiple occupation",
"private halls",
"retirement",
"serviced apartment",
)
if any(term in lower for term in excluded_flat_like):
return "Other"
if (
"flat" in lower
or "apartment" in lower
@ -269,8 +291,10 @@ def transform_property(
log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
return None
price = prop.get("price") or prop.get("latest_price")
if not price or int(price) <= 0:
price = parse_int_value(prop.get("price")) or parse_int_value(
prop.get("latest_price")
)
if not price or price <= 0:
return None
# Home.co.uk provides postcodes directly, but fall back to spatial index
@ -281,10 +305,10 @@ def transform_property(
log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
return None
raw_beds = prop.get("bedrooms", 0) or 0
raw_baths = prop.get("bathrooms", 0) or 0
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
raw_beds = parse_int_value(prop.get("bedrooms")) or 0
raw_baths = parse_int_value(prop.get("bathrooms")) or 0
bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
log.warning(
"home.co.uk %s: implausible beds=%d baths=%d (capped to 0)",
@ -318,7 +342,7 @@ def transform_property(
"Leasehold/Freehold": parse_tenure(prop),
"Property type": map_property_type(listing_type),
"Property sub-type": normalize_sub_type(listing_type),
"price": int(price),
"price": price,
"price_frequency": "",
"Price qualifier": price_qualifier,
"Total floor area (sqm)": parse_floor_area(prop.get("description")),
@ -362,7 +386,16 @@ def search_outcode(
break
for prop in raw_props:
transformed = transform_property(prop, pc_index)
try:
transformed = transform_property(prop, pc_index)
except Exception as exc:
log.warning(
"home.co.uk %s property %s failed to transform: %s",
outcode,
prop.get("listing_id") or prop.get("property_id") or "?",
exc,
)
continue
if transformed:
properties.append(transformed)
if max_properties is not None and len(properties) >= max_properties:

63
finder/listing_filters.py Normal file
View file

@ -0,0 +1,63 @@
"""Shared target filters for manual buy-listing scrapes."""
import math
from typing import Any
# Exclusive upper bound on asking price: the strict filter rejects price >= this.
BUY_MAX_PRICE = 1_000_000
# Inclusive bedroom range accepted by the strict filter.
BUY_MIN_BEDROOMS = 2
BUY_MAX_BEDROOMS = 5
# Bathroom counts are matched by exact membership, not as a range.
BUY_ALLOWED_BATHROOMS = frozenset({2, 3})
# Inclusive floor-area range in square metres.
BUY_MIN_FLOOR_AREA_SQM = 90.0
BUY_MAX_FLOOR_AREA_SQM = 170.0
# Canonical "Property type" values that pass the strict filter.
BUY_PROPERTY_TYPES = frozenset({"Flats/Maisonettes"})
# The same floor-area bounds in whole square feet (0.092903 sqm per sq ft),
# rounded to the nearest integer.
BUY_MIN_FLOOR_AREA_SQFT = round(BUY_MIN_FLOOR_AREA_SQM / 0.092903)
BUY_MAX_FLOOR_AREA_SQFT = round(BUY_MAX_FLOOR_AREA_SQM / 0.092903)
def _number(value: Any) -> float | None:
if value is None:
return None
try:
number = float(value)
except (TypeError, ValueError):
return None
if not math.isfinite(number):
return None
return number
def _int(value: Any) -> int | None:
    """Return *value* as an int only when it is a whole number.

    The value is first coerced with ``_number``; anything that fails that
    coercion, or that carries a fractional part, yields None rather than
    being truncated.
    """
    parsed = _number(value)
    if parsed is not None and parsed.is_integer():
        return int(parsed)
    return None
def matches_strict_buy_listing_filter(prop: dict) -> bool:
    """Return True when *prop* satisfies every strict buy-listing criterion.

    Checks, in order: price (read from "price" when that key is present,
    otherwise from "Asking price") must be positive and below BUY_MAX_PRICE;
    "Bedrooms" must be a whole number within the inclusive bedroom range;
    "Property type" must be one of BUY_PROPERTY_TYPES; "Bathrooms" must be a
    whole number in BUY_ALLOWED_BATHROOMS; "Total floor area (sqm)" must be
    present and within the inclusive floor-area range. A missing or
    non-numeric field fails the filter.
    """
    # The presence of the "price" key decides which field is read, even if
    # its value turns out to be unusable.
    raw_price = prop.get("price") if "price" in prop else prop.get("Asking price")
    price = _number(raw_price)
    if price is None or not 0 < price < BUY_MAX_PRICE:
        return False

    bedrooms = _int(prop.get("Bedrooms"))
    if bedrooms is None or not BUY_MIN_BEDROOMS <= bedrooms <= BUY_MAX_BEDROOMS:
        return False

    if prop.get("Property type") not in BUY_PROPERTY_TYPES:
        return False

    # None (missing / non-integer) is never a member of the allowed set.
    if _int(prop.get("Bathrooms")) not in BUY_ALLOWED_BATHROOMS:
        return False

    area = _number(prop.get("Total floor area (sqm)"))
    return (
        area is not None
        and BUY_MIN_FLOOR_AREA_SQM <= area <= BUY_MAX_FLOOR_AREA_SQM
    )

View file

@ -10,6 +10,15 @@ from constants import (
TYPEAHEAD_URL,
)
from http_client import fetch_with_retry
from listing_filters import (
BUY_ALLOWED_BATHROOMS,
BUY_MAX_BEDROOMS,
BUY_MAX_FLOOR_AREA_SQFT,
BUY_MAX_PRICE,
BUY_MIN_BEDROOMS,
BUY_MIN_FLOOR_AREA_SQFT,
matches_strict_buy_listing_filter,
)
from spatial import PostcodeSpatialIndex
from transform import transform_property
@ -22,12 +31,23 @@ outcode_cache: dict[str, str] = {}
# Requesting index >= 1008 returns HTTP 400.
_MAX_INDEX = 1008
# Property type filters for splitting overcapped searches. Each sub-query
# gets its own 1008 cap, so we can recover listings beyond the unfiltered limit.
_PROPERTY_TYPES = [
"detached", "semi-detached", "terraced", "flat",
"bungalow", "park-home", "land",
]
# Query parameters applied to every buy search so filtering happens at the
# source. Values are derived from the shared listing_filters constants and
# sent as strings.
_BASE_BUY_SEARCH_PARAMS = {
    "propertyTypes": "flat",
    "minBedrooms": str(BUY_MIN_BEDROOMS),
    "maxBedrooms": str(BUY_MAX_BEDROOMS),
    # Sent as a min/max range spanning the allowed bathroom counts.
    "minBathrooms": str(min(BUY_ALLOWED_BATHROOMS)),
    "maxBathrooms": str(max(BUY_ALLOWED_BATHROOMS)),
    # Size bounds use the square-foot versions of the floor-area limits.
    "minSize": str(BUY_MIN_FLOOR_AREA_SQFT),
    "maxSize": str(BUY_MAX_FLOOR_AREA_SQFT),
    # BUY_MAX_PRICE is an exclusive bound, hence the "- 1".
    "maxPrice": str(BUY_MAX_PRICE - 1),
}
def _buy_search_params(extra_params: dict | None = None) -> dict:
    """Return a fresh copy of the base buy-search params with overrides applied.

    Entries in *extra_params* (if any) take precedence over the base values.
    The shared base dict is never mutated.
    """
    merged = _BASE_BUY_SEARCH_PARAMS.copy()
    merged.update(extra_params or {})
    return merged
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
@ -92,8 +112,18 @@ def _paginate(
break
for prop in raw_props:
transformed = transform_property(prop, outcode, pc_index)
if transformed:
try:
transformed = transform_property(prop, outcode, pc_index)
except Exception as exc:
log.warning(
"Rightmove %s/%s property %s failed to transform: %s",
outcode,
channel_cfg["channel"],
prop.get("id", "?"),
exc,
)
continue
if transformed and matches_strict_buy_listing_filter(transformed):
properties.append(transformed)
if max_properties is not None and len(properties) >= max_properties:
return properties, result_count
@ -105,6 +135,15 @@ def _paginate(
if index >= result_count:
break
if index >= _MAX_INDEX:
log.warning(
"%s/%s: %d filtered results exceed Rightmove's %d-result page cap",
outcode,
channel_cfg["channel"],
result_count,
_MAX_INDEX,
)
break
time.sleep(DELAY_BETWEEN_PAGES)
@ -121,54 +160,20 @@ def search_outcode(
) -> list[dict]:
"""Paginate through search results for one outcode+channel. Returns transformed properties.
When the unfiltered result count exceeds 1008 (Rightmove's hard pagination cap),
re-queries per property type to recover listings beyond the cap.
Search requests set the supported Rightmove filters directly: flats,
2-5 bedrooms, 2-3 bathrooms, 969-1830 sq ft, and asking price below £1m.
"""
properties, result_count = _paginate(
client, outcode_id, outcode, channel_cfg, pc_index, max_properties=max_properties
properties, _ = _paginate(
client,
outcode_id,
outcode,
channel_cfg,
pc_index,
extra_params=_buy_search_params(),
max_properties=max_properties,
)
if max_properties is not None and len(properties) >= max_properties:
return properties[:max_properties]
if result_count <= _MAX_INDEX:
return properties
# Hit the 1008 cap — re-search per property type to get full coverage
ch = channel_cfg["channel"]
log.info(
"%s/%s: %d results exceed %d cap, splitting by property type",
outcode, ch, result_count, _MAX_INDEX,
)
all_by_id: dict[str, dict] = {p["id"]: p for p in properties}
for pt in _PROPERTY_TYPES:
pt_props, _ = _paginate(
client, outcode_id, outcode, channel_cfg, pc_index,
extra_params={"propertyTypes": pt},
max_properties=max_properties,
)
new = 0
for p in pt_props:
if p["id"] not in all_by_id:
all_by_id[p["id"]] = p
new += 1
if (
max_properties is not None
and len(all_by_id) >= max_properties
):
break
if new:
log.debug("%s/%s type=%s: +%d new properties", outcode, ch, pt, new)
if max_properties is not None and len(all_by_id) >= max_properties:
break
log.info(
"%s/%s: type split recovered %d%d properties",
outcode, ch, len(properties), len(all_by_id),
)
properties = list(all_by_id.values())
if max_properties is not None:
return properties[:max_properties]
return properties

View file

@ -19,6 +19,7 @@ from homecouk import load_cookies as load_homecouk_cookies
from homecouk import make_client as make_homecouk_client
from homecouk import search_outcode as homecouk_search_outcode
from http_client import make_client
from listing_filters import matches_strict_buy_listing_filter
from rightmove import resolve_outcode_id
from rightmove import search_outcode as rightmove_search_outcode
from spatial import PostcodeSpatialIndex
@ -181,11 +182,11 @@ def _source_names(sources: str | Iterable[str] | None) -> list[str]:
requested = [str(source).strip().lower() for source in sources]
requested = [source for source in requested if source]
if "all" in requested:
return list(SOURCE_ORDER)
unknown = sorted(set(requested) - set(SOURCE_ORDER))
unknown = sorted(set(requested) - set(SOURCE_ORDER) - {"all"})
if unknown:
raise ValueError(f"Unknown source(s): {', '.join(unknown)}")
if "all" in requested:
return list(SOURCE_ORDER)
return [source for source in SOURCE_ORDER if source in requested]
@ -196,19 +197,28 @@ def _dedup_key(prop: dict) -> tuple:
def _merge_properties(source_results: dict[str, list[dict]]) -> tuple[list[dict], dict, int]:
merged: dict[str, dict] = {}
seen_keys: set[tuple] = set()
seen_ids: set[str] = set()
counts = {source: 0 for source in SOURCE_ORDER}
deduped = 0
for source in SOURCE_ORDER:
for prop in source_results.get(source, []):
prop_id = prop.get("id")
key = _dedup_key(prop)
if (prop_id is not None and prop_id in merged) or key in seen_keys:
deduped += 1
continue
storage_key = prop_id if prop_id is not None else f"{source}:{len(merged)}"
if prop_id is not None:
prop_id = str(prop_id)
if prop_id in seen_ids:
deduped += 1
continue
seen_ids.add(prop_id)
storage_key = prop_id
else:
key = _dedup_key(prop)
if key in seen_keys:
deduped += 1
continue
seen_keys.add(key)
storage_key = f"{source}:{len(merged)}"
merged[storage_key] = prop
seen_keys.add(key)
counts[source] += 1
return list(merged.values()), counts, deduped
@ -241,13 +251,22 @@ def _store_properties(
if remaining == 0:
return 0
eligible = [prop for prop in props if _property_is_londonish(prop)]
dropped = len(props) - len(eligible)
if dropped:
londonish = [prop for prop in props if _property_is_londonish(prop)]
dropped_outside_area = len(props) - len(londonish)
if dropped_outside_area:
log.debug(
"%s dropped %d properties outside the Greater London-ish postcode filter",
source,
dropped,
dropped_outside_area,
)
eligible = [prop for prop in londonish if matches_strict_buy_listing_filter(prop)]
dropped_non_matching = len(londonish) - len(eligible)
if dropped_non_matching:
log.debug(
"%s dropped %d properties outside the strict buy-listing filters",
source,
dropped_non_matching,
)
selected = eligible if remaining is None else eligible[:remaining]
@ -367,20 +386,16 @@ def _scrape_homecouk(
log.info("home.co.uk cap reached")
return
remaining = _source_remaining(
results, "homecouk", max_properties_per_source
)
if remaining == 0:
log.info("home.co.uk cap reached")
return
for attempt in range(2):
try:
# home.co.uk cannot express the full filter set at source.
# Fetch the outcode page set first; _store_properties applies
# the strict filter and source cap after transformation.
props = homecouk_search_outcode(
client,
outcode,
pc_index,
max_properties=remaining,
max_properties=None,
)
added = _store_properties(
results,
@ -442,19 +457,17 @@ def _scrape_zoopla(
log.info("Zoopla cap reached")
return
remaining = _source_remaining(results, "zoopla", max_properties_per_source)
if remaining == 0:
log.info("Zoopla cap reached")
return
for attempt in range(2):
try:
# Zoopla source-side filters are unverified here. Fetch the
# outcode page set first; _store_properties applies the
# strict filter and source cap after transformation.
props, _ = zoopla_search_outcode(
page,
outcode,
pc_index,
pc_coords,
max_properties=remaining,
max_properties=None,
)
added = _store_properties(
results,
@ -506,9 +519,6 @@ def run_scrape(
output_base = Path(output_dir) if output_dir is not None else DATA_DIR
output_base.mkdir(parents=True, exist_ok=True)
if "zoopla" in selected_sources and pc_coords is None:
pc_coords = build_postcode_coords()
errors: list[str] = []
results = {source: [] for source in SOURCE_ORDER}
started_at = time.time()
@ -539,7 +549,8 @@ def run_scrape(
)
if "zoopla" in selected_sources:
assert pc_coords is not None
if pc_coords is None:
pc_coords = build_postcode_coords()
_scrape_zoopla(
selected_outcodes,
pc_index,
@ -551,19 +562,36 @@ def run_scrape(
merged, source_counts, deduped = _merge_properties(results)
output_path = output_base / "online_listings_buy.parquet"
write_parquet(merged, output_path)
if merged:
write_parquet(merged, output_path)
else:
if output_path.exists():
output_path.unlink()
log.warning("No strict properties to write to %s", output_path)
filtered = [prop for prop in merged if matches_strict_buy_listing_filter(prop)]
filtered_output_path = output_base / "online_listings_buy_filtered.parquet"
if filtered:
write_parquet(filtered, filtered_output_path)
else:
if filtered_output_path.exists():
filtered_output_path.unlink()
log.warning("No strict-filtered properties to write to %s", filtered_output_path)
counts = {
"total": len(merged),
"filtered_total": len(filtered),
"deduped": deduped,
"sources": source_counts,
}
source_summary = " ".join(
f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
)
log.info(
"Sale scrape complete: %d unique (rightmove:%d homecouk:%d zoopla:%d deduped:%d)",
"Sale scrape complete: %d unique, %d strict-filtered (%s deduped:%d)",
len(merged),
source_counts["rightmove"],
source_counts["homecouk"],
source_counts["zoopla"],
len(filtered),
source_summary,
deduped,
)
@ -575,6 +603,7 @@ def run_scrape(
},
"counts": counts,
"path": str(output_path),
"filtered_path": str(filtered_output_path),
"errors": errors,
"elapsed_seconds": round(time.time() - started_at, 3),
}

View file

@ -45,9 +45,10 @@ def write_parquet(properties: list[dict], path: Path) -> None:
remapped = 0
for p in properties:
sub_type = p.get("Property sub-type", "")
if sub_type and sub_type != "Unknown":
current_type = p.get("Property type")
if sub_type and sub_type != "Unknown" and current_type in (None, "", "Other"):
new_type = map_property_type(sub_type)
if new_type != p.get("Property type"):
if new_type != current_type:
p["Property type"] = new_type
remapped += 1
if remapped:

View file

@ -1,4 +1,5 @@
import logging
import math
import re
from constants import MAX_BEDROOMS, PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
@ -29,17 +30,43 @@ def validate_floor_area(sqm: float | None) -> float | None:
return sqm
def parse_int_value(value) -> int | None:
"""Parse an integer-like API value without truncating decimals."""
if value is None or isinstance(value, bool):
return None
if isinstance(value, int):
return value
if isinstance(value, float):
if not math.isfinite(value) or not value.is_integer():
return None
return int(value)
if isinstance(value, str):
cleaned = value.strip().replace(",", "").replace("£", "")
if not re.fullmatch(r"\d+", cleaned):
return None
return int(cleaned)
return None
def parse_display_size(display_size: str | None) -> float | None:
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
if not display_size:
return None
# Try sq. ft. first
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
m = re.search(
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))",
display_size,
re.IGNORECASE,
)
if m:
sqft = float(m.group(1).replace(",", ""))
return validate_floor_area(round(sqft * 0.092903, 1))
# Try sq. m.
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
m = re.search(
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))",
display_size,
re.IGNORECASE,
)
if m:
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
return None
@ -86,7 +113,21 @@ def map_property_type(sub_type: str | None) -> str:
return canonical
# Keyword fallback for compound types not in the map
lower = sub_type.lower()
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
excluded_flat_like = (
"block of apartment",
"house of multiple occupation",
"private halls",
"retirement",
"serviced apartment",
)
if any(term in lower for term in excluded_flat_like):
return "Other"
if (
"flat" in lower
or "apartment" in lower
or "maisonette" in lower
or "studio" in lower
):
return "Flats/Maisonettes"
if "semi" in lower and "detach" in lower:
return "Semi-Detached"
@ -158,10 +199,10 @@ def transform_property(
lat, lng = fix_coords(raw_lat, raw_lng)
price_obj = prop.get("price", {})
amount = price_obj.get("amount")
amount = parse_int_value(price_obj.get("amount"))
if not amount:
return None
price = int(amount)
price = amount
if price <= 0:
return None
@ -172,14 +213,23 @@ def transform_property(
# POA / Auction listings have unreliable prices — treat as no price
pq_lower = price_qualifier.lower()
if "poa" in pq_lower or "auction" in pq_lower:
non_comparable_price_terms = (
"poa",
"auction",
"shared ownership",
"shared equity",
"part buy",
"part rent",
"from",
)
if any(term in pq_lower for term in non_comparable_price_terms):
return None
sub_type = prop.get("propertySubType", "")
raw_beds = prop.get("bedrooms", 0) or 0
raw_baths = prop.get("bathrooms", 0) or 0
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
raw_beds = parse_int_value(prop.get("bedrooms")) or 0
raw_baths = parse_int_value(prop.get("bathrooms")) or 0
bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
log.warning(
"Rightmove %s: implausible beds=%d baths=%d (capped to 0)",
@ -197,8 +247,15 @@ def transform_property(
log.debug("No England postcode for property at %.4f, %.4f — skipping", lat, lng)
return None
property_url = prop.get("propertyUrl") or ""
if not isinstance(property_url, str):
property_url = ""
listing_id = prop.get("id") or property_url
if not listing_id:
return None
return {
"id": prop.get("id"),
"id": listing_id,
"Bedrooms": bedrooms,
"Bathrooms": bathrooms,
"Number of bedrooms & living rooms": bedrooms + bathrooms,
@ -213,7 +270,7 @@ def transform_property(
"price_frequency": "",
"Price qualifier": price_qualifier,
"Total floor area (sqm)": parse_display_size(prop.get("displaySize")),
"Listing URL": RIGHTMOVE_BASE + prop.get("propertyUrl", ""),
"Listing URL": RIGHTMOVE_BASE + property_url if property_url else "",
"Listing features": key_features,
"first_visible_date": prop.get("firstVisibleDate", ""),
}

View file

@ -24,7 +24,7 @@ import time
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from spatial import PostcodeSpatialIndex
from transform import normalize_sub_type, validate_floor_area
from transform import normalize_sub_type, parse_int_value, validate_floor_area
log = logging.getLogger("zoopla")
@ -106,7 +106,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
const bedsMatch = text.match(/(\d+)\s*beds?/i);
const bathsMatch = text.match(/(\d+)\s*baths?/i);
const recMatch = text.match(/(\d+)\s*reception/i);
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
let tenure = '';
if (/leasehold/i.test(text)) tenure = 'Leasehold';
@ -141,7 +142,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
address, tenure, property_type,
});
}
@ -181,7 +183,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
const bedsMatch = text.match(/(\d+)\s*beds?/i);
const bathsMatch = text.match(/(\d+)\s*baths?/i);
const recMatch = text.match(/(\d+)\s*reception/i);
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
let address = '';
for (const line of lines) {
@ -225,7 +228,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
address, tenure, property_type,
});
}
@ -611,7 +615,22 @@ def _map_property_type(raw_type: str | None) -> str:
return canonical
# Keyword fallback
lower = raw_type.lower()
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
excluded_flat_like = (
"block of apartment",
"house of multiple occupation",
"private halls",
"retirement",
"serviced apartment",
)
if any(term in lower for term in excluded_flat_like):
return "Other"
if (
"flat" in lower
or "apartment" in lower
or "maisonette" in lower
or "studio" in lower
or "penthouse" in lower
):
return "Flats/Maisonettes"
if "semi" in lower and "detach" in lower:
return "Semi-Detached"
@ -634,8 +653,8 @@ def transform_property(
Zoopla search cards do not include coordinates, so we resolve lat/lng
from postcodes extracted from the address text."""
price = raw.get("price")
if not price or int(price) <= 0:
price = parse_int_value(raw.get("price"))
if not price or price <= 0:
return None
address = raw.get("address", "")
@ -670,10 +689,10 @@ def transform_property(
if not (49 <= lat <= 56 and -7 <= lng <= 2):
return None
raw_beds = raw.get("beds") or 0
raw_baths = raw.get("baths") or 0
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
raw_beds = parse_int_value(raw.get("beds")) or 0
raw_baths = parse_int_value(raw.get("baths")) or 0
bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
log.warning(
"Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
@ -683,9 +702,13 @@ def transform_property(
# Floor area: convert sq ft to sq m
floor_area_sqm = None
sqft = raw.get("floor_area_sqft")
if sqft:
floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1))
raw_sqm = raw.get("floor_area_sqm")
if raw_sqm:
floor_area_sqm = validate_floor_area(round(float(raw_sqm), 1))
else:
sqft = raw.get("floor_area_sqft")
if sqft:
floor_area_sqm = validate_floor_area(round(float(sqft) * 0.092903, 1))
listing_id = raw.get("id", "")
listing_url = raw.get("url", "")
@ -704,7 +727,7 @@ def transform_property(
"Leasehold/Freehold": raw.get("tenure") or None,
"Property type": _map_property_type(raw.get("property_type")),
"Property sub-type": normalize_sub_type(raw.get("property_type")),
"price": int(price),
"price": price,
"price_frequency": "",
"Price qualifier": "",
"Total floor area (sqm)": floor_area_sqm,
@ -760,7 +783,18 @@ def search_outcode(
properties = []
dropped = 0
for raw in raw_listings:
transformed = transform_property(raw, pc_index, pc_coords, search_outcode=outcode)
try:
transformed = transform_property(
raw, pc_index, pc_coords, search_outcode=outcode
)
except Exception as exc:
log.warning(
"Zoopla %s property %s failed to transform: %s",
outcode,
raw.get("id", "?"),
exc,
)
transformed = None
if transformed:
properties.append(transformed)
else: