397 lines
14 KiB
Python
397 lines
14 KiB
Python
import logging
|
||
import math
|
||
import re
|
||
|
||
from constants import MAX_BEDROOMS, PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
|
||
from spatial import PostcodeSpatialIndex
|
||
|
||
# Module-level logger used by the helpers in this file.
log = logging.getLogger("rightmove")

# Plausible floor-area window, in square metres. Anything outside it is treated
# as a data error: below 5 sqm is typically a garbled extraction (e.g. 0.1 sqm
# for a detached house), and above 2000 sqm (~21,500 sq ft) is larger than even
# the largest UK mansions.
MIN_FLOOR_AREA_SQM = 5.0
MAX_FLOOR_AREA_SQM = 2000.0

# A full UK postcode anywhere in the text; group 1 captures the postcode itself.
FULL_POSTCODE_RE = re.compile(
    r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", flags=re.IGNORECASE
)
# A full postcode at the very end of the text, plus any comma/space before it.
TRAILING_FULL_POSTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b\s*$", flags=re.IGNORECASE
)
# A bare outcode at the very end of the text, plus any comma/space before it.
TRAILING_OUTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\b\s*$", flags=re.IGNORECASE
)
|
||
|
||
|
||
def validate_floor_area(sqm: float | None) -> float | None:
    """Return ``sqm`` when it is a plausible floor area, otherwise ``None``.

    A value is plausible when it lies in the inclusive range
    ``MIN_FLOOR_AREA_SQM`` .. ``MAX_FLOOR_AREA_SQM``. Rejecting everything else
    catches parsing errors where prices or other large numbers are mistakenly
    extracted as floor area from free-text descriptions or DOM text.

    Args:
        sqm: Candidate floor area in square metres, or ``None``.

    Returns:
        ``sqm`` unchanged when it is within bounds; ``None`` when it is
        ``None``, out of bounds, or NaN.
    """
    if sqm is None:
        return None
    # Written as a positive range test so that NaN, for which every comparison
    # is false, is rejected instead of slipping through as a "valid" area.
    if not (MIN_FLOOR_AREA_SQM <= sqm <= MAX_FLOOR_AREA_SQM):
        return None
    return sqm
|
||
|
||
|
||
def parse_int_value(value) -> int | None:
|
||
"""Parse an integer-like API value without truncating decimals."""
|
||
if value is None or isinstance(value, bool):
|
||
return None
|
||
if isinstance(value, int):
|
||
return value
|
||
if isinstance(value, float):
|
||
if not math.isfinite(value) or not value.is_integer():
|
||
return None
|
||
return int(value)
|
||
if isinstance(value, str):
|
||
cleaned = value.strip().replace(",", "").replace("£", "")
|
||
if not re.fullmatch(r"\d+", cleaned):
|
||
return None
|
||
return int(cleaned)
|
||
return None
|
||
|
||
|
||
def parse_display_size(display_size: str | None) -> float | None:
|
||
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
|
||
if not display_size:
|
||
return None
|
||
# Try sq. ft. first
|
||
m = re.search(
|
||
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))",
|
||
display_size,
|
||
re.IGNORECASE,
|
||
)
|
||
if m:
|
||
sqft = float(m.group(1).replace(",", ""))
|
||
return validate_floor_area(round(sqft * 0.092903, 1))
|
||
# Try sq. m.
|
||
m = re.search(
|
||
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))",
|
||
display_size,
|
||
re.IGNORECASE,
|
||
)
|
||
if m:
|
||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||
return None
|
||
|
||
|
||
def normalize_sub_type(sub_type: str | None) -> str:
|
||
"""Normalize property sub-type for consistent storage.
|
||
|
||
Fixes delimiter inconsistencies (underscores/hyphens → spaces) from
|
||
legacy listing data and truncates Zoopla description fragments that were
|
||
accidentally captured as sub-types.
|
||
"""
|
||
if not sub_type:
|
||
return "Unknown"
|
||
cleaned = sub_type.replace("_", " ").strip()
|
||
# Description fragments captured as sub-types are much longer than any
|
||
# real property type name (longest canonical is ~25 chars)
|
||
if len(cleaned) > 40:
|
||
return "Unknown"
|
||
# Collapse multiple spaces
|
||
cleaned = re.sub(r"\s+", " ", cleaned)
|
||
return cleaned.title()
|
||
|
||
|
||
def map_property_type(sub_type: str | None) -> str:
|
||
"""Map propertySubType to canonical type."""
|
||
if not sub_type:
|
||
return "Other"
|
||
canonical = PROPERTY_TYPE_MAP.get(sub_type)
|
||
if canonical:
|
||
return canonical
|
||
# Try title-case variant (e.g., "country house" → "Country House")
|
||
canonical = PROPERTY_TYPE_MAP.get(sub_type.title())
|
||
if canonical:
|
||
return canonical
|
||
# Try lowercase variant (e.g., "Townhouse" → "townhouse")
|
||
canonical = PROPERTY_TYPE_MAP.get(sub_type.lower())
|
||
if canonical:
|
||
return canonical
|
||
# Normalize delimiters (underscores/hyphens → spaces) and try again
|
||
normalized = re.sub(r"[-_]+", " ", sub_type).strip().title()
|
||
canonical = PROPERTY_TYPE_MAP.get(normalized)
|
||
if canonical:
|
||
return canonical
|
||
# Keyword fallback for compound types not in the map
|
||
lower = sub_type.lower()
|
||
excluded_flat_like = (
|
||
"block of apartment",
|
||
"house of multiple occupation",
|
||
"private halls",
|
||
"retirement",
|
||
"serviced apartment",
|
||
)
|
||
if any(term in lower for term in excluded_flat_like):
|
||
return "Other"
|
||
if (
|
||
"flat" in lower
|
||
or "apartment" in lower
|
||
or "maisonette" in lower
|
||
or "studio" in lower
|
||
):
|
||
return "Flats/Maisonettes"
|
||
if "semi" in lower and "detach" in lower:
|
||
return "Semi-Detached"
|
||
if "detach" in lower:
|
||
return "Detached"
|
||
if "terrace" in lower or "mews" in lower:
|
||
return "Terraced"
|
||
if "house" in lower or "cottage" in lower:
|
||
return "Detached"
|
||
log.warning("Unknown propertySubType: %r — mapping to Other", sub_type)
|
||
return "Other"
|
||
|
||
|
||
def extract_tenure(tenure_obj: dict | None) -> str | None:
|
||
"""Extract tenure string from tenure object."""
|
||
if not tenure_obj:
|
||
return None
|
||
tt = tenure_obj.get("tenureType", "")
|
||
if tt == "FREEHOLD":
|
||
return "Freehold"
|
||
if tt == "LEASEHOLD":
|
||
return "Leasehold"
|
||
return None
|
||
|
||
|
||
def fix_coords(lat: float, lng: float) -> tuple[float, float]:
    """Return ``(lat, lng)``, un-swapping the pair when it looks reversed.

    The plausibility box is lat 49–56, lng -7–2 (England). A pair inside the
    box is returned as given. A pair that only fits the box once swapped is
    returned swapped (logged at debug level). A pair that fits neither way is
    returned unchanged, with a warning.
    """

    def in_bounds(latitude, longitude) -> bool:
        """Return True when the pair lies inside the lat/lng box."""
        return 49 <= latitude <= 56 and -7 <= longitude <= 2

    if in_bounds(lat, lng):
        return lat, lng

    if not in_bounds(lng, lat):
        log.warning(
            "Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f",
            lat,
            lng,
        )
        return lat, lng

    log.debug(
        "Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f",
        lat,
        lng,
        lng,
        lat,
    )
    return lng, lat
|
||
|
||
|
||
def normalize_postcode(postcode: str) -> str:
    """Upper-case a UK postcode and put one space before its 3-char incode.

    E.g. 'SW1A1AA' → 'SW1A 1AA', 'n4  2ha' → 'N4 2HA', 'E1 4AB' unchanged.
    Input shorter than five characters once whitespace is removed (such as a
    bare outcode) is returned upper-cased with no space inserted.
    """
    # Drop every whitespace character, then re-insert the one canonical space.
    squeezed = re.sub(r"\s+", "", postcode).upper()
    if len(squeezed) >= 5:
        outward, inward = squeezed[:-3], squeezed[-3:]
        return f"{outward} {inward}"
    return squeezed
|
||
|
||
|
||
def extract_full_postcode(text: str | None) -> str | None:
|
||
if not text:
|
||
return None
|
||
match = FULL_POSTCODE_RE.search(text)
|
||
if not match:
|
||
return None
|
||
return normalize_postcode(match.group(1))
|
||
|
||
|
||
def extract_outcode(postcode: str | None) -> str | None:
|
||
"""Return the outward code (district) of a UK postcode, e.g. 'SW1A 1AA' → 'SW1A'."""
|
||
if not postcode:
|
||
return None
|
||
normalized = normalize_postcode(postcode)
|
||
outcode = normalized.split(" ", 1)[0]
|
||
return outcode or None
|
||
|
||
|
||
def resolve_listing_postcode(
|
||
extracted_postcode: str | None, inferred_postcode: str
|
||
) -> tuple[str, str]:
|
||
"""Pick the authoritative postcode for a listing, returning (postcode, source).
|
||
|
||
The address-extracted postcode is more precise than the coordinate-nearest one,
|
||
but it is only trustworthy when it agrees with the location: a stale, mistyped or
|
||
well-formed-but-fabricated postcode (e.g. 'ZZ9 9ZZ') would otherwise silently
|
||
override the spatially-correct value. Since the spatial index only supports
|
||
nearest-lookup, accept the extracted postcode only when its outcode matches the
|
||
inferred (coordinate-nearest) postcode's outcode; otherwise fall back to the
|
||
inferred one, which is always a real, plausibly-correct postcode.
|
||
"""
|
||
if extracted_postcode and extract_outcode(extracted_postcode) == extract_outcode(
|
||
inferred_postcode
|
||
):
|
||
return extracted_postcode, "address"
|
||
if extracted_postcode:
|
||
log.debug(
|
||
"Rejecting extracted postcode %s (outcode mismatch with inferred %s)",
|
||
extracted_postcode,
|
||
inferred_postcode,
|
||
)
|
||
return inferred_postcode, "coordinates"
|
||
|
||
|
||
def clean_listing_address(address: str | None) -> str:
|
||
"""Remove postcode/outcode suffixes from listing display addresses.
|
||
|
||
Listing sites often include "..., BR1" or "..., SW1A 1AA" in their public
|
||
address. Those tokens add fake address numbers to the fuzzy matcher, so keep
|
||
the raw address separately and use this cleaned value for matching.
|
||
"""
|
||
if not address:
|
||
return ""
|
||
cleaned = str(address).strip()
|
||
cleaned = TRAILING_FULL_POSTCODE_RE.sub("", cleaned)
|
||
cleaned = TRAILING_OUTCODE_RE.sub("", cleaned)
|
||
cleaned = re.sub(r"\s+", " ", cleaned)
|
||
cleaned = re.sub(r"\s*,\s*", ", ", cleaned)
|
||
return cleaned.strip(" ,")
|
||
|
||
|
||
def build_register_address(
    raw_address: str | None, number_or_name: str | None = None
) -> str:
    """Build a Property Register-style address, prepending the house number/name.

    Listing display addresses are usually street-level ("South Street, Bromley")
    because the portals hide the exact unit. When a scraper can recover the
    property's own number or name (e.g. Zoopla detail pages expose
    ``propertyNumberOrName`` = "12" or "Martham Mill"), prepend it so the address
    carries the house identifier that the EPC/Price-Paid register addresses also
    use — turning a fuzzy street match into a near-exact one.

    Args:
        raw_address: Listing display address; cleaned with
            ``clean_listing_address`` before use.
        number_or_name: House number or name to prepend, if known. Non-string
            values are converted with ``str()``.

    Returns:
        ``"<number_or_name>, <cleaned address>"``; just the cleaned address when
        no number/name is available or the address already starts with it; just
        the number/name when the cleaned address is empty.
    """
    cleaned = clean_listing_address(raw_address)
    if not number_or_name:
        return cleaned
    number_or_name = str(number_or_name).strip()
    if not number_or_name:
        return cleaned

    # Avoid duplicating a number/name the display address already starts with.
    # The match must end on a token boundary: a bare prefix test would treat
    # number "1" as already present in "1st Avenue" or "12 High Street" and
    # silently drop the real house number.
    lowered_address = cleaned.lower()
    lowered_prefix = number_or_name.lower()
    if lowered_address.startswith(lowered_prefix):
        remainder = lowered_address[len(lowered_prefix):]
        if not remainder or not remainder[0].isalnum():
            return cleaned

    return f"{number_or_name}, {cleaned}" if cleaned else number_or_name
|
||
|
||
|
||
def transform_property(
    prop: dict,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    detail_postcode: str | None = None,
) -> dict | None:
    """Transform a raw Rightmove property dict into our output schema.

    ``detail_postcode`` is the property's TRUE full postcode recovered from its
    detail page (see ``rightmove.parse_detail_postcode``); the search API itself
    only exposes the outcode-level ``displayAddress``. When supplied and it
    agrees with the coordinate-nearest postcode's outcode, it is preferred over
    the coordinate guess and recorded with source ``"detail_address"``. A
    detail postcode whose outcode disagrees with the location is discarded in
    favour of the spatially-correct coordinate postcode, so a stale or wrong
    detail value can never silently relocate a listing.

    Args:
        prop: Raw property dict from the search API.
        outcode: Not read by this function; kept in the signature for callers.
        pc_index: Spatial index; ``pc_index.nearest(lat, lng)`` supplies the
            coordinate-nearest postcode.
        detail_postcode: Optional text containing the detail page's postcode.

    Returns:
        The output record, or ``None`` when the property has no location, lacks
        a latitude or longitude, has no nearest postcode, or has neither an
        ``id`` nor a string ``propertyUrl``.
    """
    loc = prop.get("location")
    if not loc:
        return None
    raw_lat = loc.get("latitude")
    raw_lng = loc.get("longitude")
    if raw_lat is None or raw_lng is None:
        return None

    lat, lng = fix_coords(raw_lat, raw_lng)

    # ``or`` rather than a .get() default: the default only covers a missing
    # key, so a key present with an explicit null would crash below.
    price_obj = prop.get("price") or {}
    price = parse_int_value(price_obj.get("amount")) or 0

    display_prices = price_obj.get("displayPrices") or []
    price_qualifier = (
        display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
    )

    sub_type = prop.get("propertySubType", "")
    raw_beds = parse_int_value(prop.get("bedrooms")) or 0
    raw_baths = parse_int_value(prop.get("bathrooms")) or 0
    # Counts outside 0..MAX_BEDROOMS are treated as data errors and zeroed.
    bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
    if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
        log.warning(
            "Rightmove %s: implausible beds=%d baths=%d (capped to 0)",
            prop.get("id", "?"), raw_beds, raw_baths,
        )

    # Keep only features that have a non-empty description.
    key_features = [
        description
        for kf in (prop.get("keyFeatures") or [])
        if (description := kf.get("description"))
    ]

    inferred_postcode = pc_index.nearest(lat, lng)
    if not inferred_postcode:
        log.debug("No England postcode for property at %.4f, %.4f — skipping", lat, lng)
        return None
    raw_address = prop.get("displayAddress", "") or ""
    extracted_postcode = extract_full_postcode(raw_address)

    # Prefer the detail page's true full postcode when it agrees with the
    # location; otherwise fall back to the (display-address-or-coordinate) logic.
    detail_full = extract_full_postcode(detail_postcode)
    if detail_full and extract_outcode(detail_full) == extract_outcode(
        inferred_postcode
    ):
        postcode, postcode_source = detail_full, "detail_address"
    else:
        if detail_full:
            log.debug(
                "Rejecting Rightmove detail postcode %s (outcode mismatch with "
                "inferred %s)",
                detail_full,
                inferred_postcode,
            )
        postcode, postcode_source = resolve_listing_postcode(
            extracted_postcode, inferred_postcode
        )

    property_url = prop.get("propertyUrl") or ""
    if not isinstance(property_url, str):
        property_url = ""
    # Fall back to the URL as identifier when the API gives no id.
    listing_id = prop.get("id") or property_url
    if not listing_id:
        return None

    return {
        "id": listing_id,
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        # NOTE(review): computed as bedrooms + bathrooms although the key says
        # "living rooms" — confirm this is the intended proxy.
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Postcode source": postcode_source,
        "Extracted postcode": extracted_postcode,
        "Inferred postcode": inferred_postcode,
        "Listing raw address": raw_address,
        "Address per Property Register": clean_listing_address(raw_address),
        # Rightmove's displayAddress is street-level; no UPRN/house number.
        "UPRN": None,
        "Property number or name": None,
        "Leasehold/Freehold": extract_tenure(prop.get("tenure")),
        "Property type": map_property_type(sub_type),
        "Property sub-type": normalize_sub_type(sub_type),
        "price": price,
        "price_frequency": "",
        "Price qualifier": price_qualifier,
        "Total floor area (sqm)": parse_display_size(prop.get("displaySize")),
        "Listing URL": RIGHTMOVE_BASE + property_url if property_url else "",
        "Listing features": key_features,
        "first_visible_date": prop.get("firstVisibleDate", ""),
    }
|