"""Helpers that normalise raw Rightmove listing data into the output schema."""

import logging
import math
import re

from constants import MAX_BEDROOMS, PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
from spatial import PostcodeSpatialIndex

log = logging.getLogger("rightmove")

# Floor area bounds (sqm). Values outside this range are almost certainly
# data errors: sub-5 sqm catches garbled extractions (e.g., 0.1 sqm for a
# detached house), and >2000 sqm (~21,500 sq ft) exceeds even the largest
# UK mansions.
MIN_FLOOR_AREA_SQM = 5.0
MAX_FLOOR_AREA_SQM = 2000.0

FULL_POSTCODE_RE = re.compile(
    r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b",
    re.IGNORECASE,
)
TRAILING_FULL_POSTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b\s*$",
    re.IGNORECASE,
)
TRAILING_OUTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\b\s*$",
    re.IGNORECASE,
)

# Floor-area patterns. The numeric group must start with a digit so that a
# stray comma (", sq ft") can never be captured as an empty "number".
_SQFT_RE = re.compile(
    r"(\d[\d,]*(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))",
    re.IGNORECASE,
)
_SQM_RE = re.compile(
    r"(\d[\d,]*(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))",
    re.IGNORECASE,
)
_SQM_PER_SQFT = 0.092903


def validate_floor_area(sqm: float | None) -> float | None:
    """Validate a floor area value. Returns None for nonsensical values.

    Rejects values below MIN_FLOOR_AREA_SQM and above MAX_FLOOR_AREA_SQM,
    which catches parsing errors where prices or other large numbers are
    mistakenly extracted as floor area from free-text descriptions or DOM
    text. NaN is rejected as well.
    """
    if sqm is None:
        return None
    # A chained comparison is False for NaN, so NaN is rejected here; two
    # separate ``<`` / ``>`` tests would both be False and let it through.
    if not (MIN_FLOOR_AREA_SQM <= sqm <= MAX_FLOOR_AREA_SQM):
        return None
    return sqm


def parse_int_value(value) -> int | None:
    """Parse an integer-like API value without truncating decimals.

    Returns the integer for ints, integral finite floats, and digit-only
    strings (after stripping whitespace, commas and '£'). Returns None for
    None, bools, non-integral or non-finite floats, and anything else.
    """
    # bool is a subclass of int, so it must be excluded before the int check.
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, float):
        if not math.isfinite(value) or not value.is_integer():
            return None
        return int(value)
    if isinstance(value, str):
        cleaned = value.strip().replace(",", "").replace("£", "")
        if not re.fullmatch(r"\d+", cleaned):
            return None
        return int(cleaned)
    return None


def parse_display_size(display_size: str | None) -> float | None:
    """Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm.

    Square feet are tried first and converted; otherwise a square-metre
    figure is used as-is. The result is rounded to one decimal place and
    passed through ``validate_floor_area``. Returns None when no size is
    found or the value is implausible.
    """
    if not display_size:
        return None

    # Try sq. ft. first
    m = _SQFT_RE.search(display_size)
    if m:
        sqft = float(m.group(1).replace(",", ""))
        return validate_floor_area(round(sqft * _SQM_PER_SQFT, 1))

    # Try sq. m.
    m = _SQM_RE.search(display_size)
    if m:
        return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))

    return None


def normalize_sub_type(sub_type: str | None) -> str:
    """Normalize property sub-type for consistent storage.

    Replaces underscores with spaces, collapses runs of whitespace and
    title-cases the result. Hyphens are left untouched. Values longer than
    40 characters are treated as description fragments accidentally
    captured as sub-types and are replaced with "Unknown", as are empty or
    missing values.
    """
    if not sub_type:
        return "Unknown"
    cleaned = sub_type.replace("_", " ").strip()
    # Description fragments captured as sub-types are much longer than any
    # real property type name (longest canonical is ~25 chars)
    if len(cleaned) > 40:
        return "Unknown"
    # Collapse multiple spaces
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.title()


def map_property_type(sub_type: str | None) -> str:
    """Map propertySubType to canonical type.

    Looks the value up in PROPERTY_TYPE_MAP (exact, title-case, lowercase,
    then delimiter-normalized), and falls back to keyword matching.
    Unrecognised values are logged and mapped to "Other".
    """
    if not sub_type:
        return "Other"

    candidates = (
        sub_type,
        # Title-case variant (e.g., "country house" → "Country House")
        sub_type.title(),
        # Lowercase variant (e.g., "Townhouse" → "townhouse")
        sub_type.lower(),
        # Delimiters normalized (underscores/hyphens → spaces)
        re.sub(r"[-_]+", " ", sub_type).strip().title(),
    )
    for candidate in candidates:
        canonical = PROPERTY_TYPE_MAP.get(candidate)
        if canonical:
            return canonical

    # Keyword fallback for compound types not in the map
    lower = sub_type.lower()
    excluded_flat_like = (
        "block of apartment",
        "house of multiple occupation",
        "private halls",
        "retirement",
        "serviced apartment",
    )
    if any(term in lower for term in excluded_flat_like):
        return "Other"
    if (
        "flat" in lower
        or "apartment" in lower
        or "maisonette" in lower
        or "studio" in lower
    ):
        return "Flats/Maisonettes"
    # "semi" must be tested before the plain "detach" check below.
    if "semi" in lower and "detach" in lower:
        return "Semi-Detached"
    if "detach" in lower:
        return "Detached"
    if "terrace" in lower or "mews" in lower:
        return "Terraced"
    if "house" in lower or "cottage" in lower:
        return "Detached"

    log.warning("Unknown propertySubType: %r — mapping to Other", sub_type)
    return "Other"


def extract_tenure(tenure_obj: dict | None) -> str | None:
    """Extract tenure string from tenure object.

    Returns "Freehold" or "Leasehold" for the matching ``tenureType``
    values, and None for anything else (including a missing object).
    """
    if not tenure_obj:
        return None
    tt = tenure_obj.get("tenureType", "")
    if tt == "FREEHOLD":
        return "Freehold"
    if tt == "LEASEHOLD":
        return "Leasehold"
    return None


def fix_coords(lat: float, lng: float) -> tuple[float, float]:
    """Swap lat/lng if they look reversed. England: lat ~49–56, lng ~-7–2.

    Returns ``(lat, lng)``. Coordinates that fit neither orientation are
    returned unchanged after logging a warning.
    """
    if 49 <= lat <= 56 and -7 <= lng <= 2:
        return lat, lng
    if 49 <= lng <= 56 and -7 <= lat <= 2:
        log.debug(
            "Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f",
            lat,
            lng,
            lng,
            lat,
        )
        return lng, lat
    log.warning(
        "Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f",
        lat,
        lng,
    )
    return lat, lng


def normalize_postcode(postcode: str) -> str:
    """Ensure UK postcode has exactly one space before the 3-char incode.

    E.g., 'SW1A1AA' → 'SW1A 1AA', 'N4  2HA' → 'N4 2HA', 'E1 4AB' unchanged.
    Inputs shorter than five characters (once whitespace is removed) are
    returned upper-cased without a space.
    """
    # Strip all whitespace then re-insert the single canonical space
    compact = re.sub(r"\s+", "", postcode).upper()
    if len(compact) < 5:
        return compact
    return compact[:-3] + " " + compact[-3:]


def extract_full_postcode(text: str | None) -> str | None:
    """Return the first full UK postcode found in *text*, normalized, or None."""
    if not text:
        return None
    match = FULL_POSTCODE_RE.search(text)
    if not match:
        return None
    return normalize_postcode(match.group(1))


def extract_outcode(postcode: str | None) -> str | None:
    """Return the outward code (district) of a UK postcode, e.g. 'SW1A 1AA' → 'SW1A'."""
    if not postcode:
        return None
    normalized = normalize_postcode(postcode)
    outcode = normalized.split(" ", 1)[0]
    return outcode or None


def resolve_listing_postcode(
    extracted_postcode: str | None, inferred_postcode: str
) -> tuple[str, str]:
    """Pick the authoritative postcode for a listing, returning (postcode, source).

    The address-extracted postcode is more precise than the coordinate-nearest
    one, but it is only trustworthy when it agrees with the location: a stale,
    mistyped or well-formed-but-fabricated postcode (e.g. 'ZZ9 9ZZ') would
    otherwise silently override the spatially-correct value. Since the spatial
    index only supports nearest-lookup, accept the extracted postcode only when
    its outcode matches the inferred (coordinate-nearest) postcode's outcode;
    otherwise fall back to the inferred one, which is always a real,
    plausibly-correct postcode.

    The source is "address" when the extracted postcode is used and
    "coordinates" otherwise.
    """
    if extracted_postcode and extract_outcode(extracted_postcode) == extract_outcode(
        inferred_postcode
    ):
        return extracted_postcode, "address"
    if extracted_postcode:
        log.debug(
            "Rejecting extracted postcode %s (outcode mismatch with inferred %s)",
            extracted_postcode,
            inferred_postcode,
        )
    return inferred_postcode, "coordinates"


def clean_listing_address(address: str | None) -> str:
    """Remove postcode/outcode suffixes from listing display addresses.

    Listing sites often include "..., BR1" or "..., SW1A 1AA" in their public
    address. Those tokens add fake address numbers to the fuzzy matcher, so keep
    the raw address separately and use this cleaned value for matching.

    Returns an empty string for a missing address.
    """
    if not address:
        return ""
    cleaned = str(address).strip()
    # Full postcode first, then a bare outcode that may remain at the end.
    cleaned = TRAILING_FULL_POSTCODE_RE.sub("", cleaned)
    cleaned = TRAILING_OUTCODE_RE.sub("", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned)
    cleaned = re.sub(r"\s*,\s*", ", ", cleaned)
    return cleaned.strip(" ,")


def build_register_address(
    raw_address: str | None, number_or_name: str | None = None
) -> str:
    """Build a Property Register-style address, prepending the house number/name.

    Listing display addresses are usually street-level ("South Street, Bromley")
    because the portals hide the exact unit. When a scraper can recover the
    property's own number or name (e.g. Zoopla detail pages expose
    ``propertyNumberOrName`` = "12" or "Martham Mill"), prepend it so the
    address carries the house identifier that the EPC/Price-Paid register
    addresses also use — turning a fuzzy street match into a near-exact one.
    Falls back to the plain cleaned address when no number/name is available.
    """
    cleaned = clean_listing_address(raw_address)
    if not number_or_name:
        return cleaned
    number_or_name = number_or_name.strip()
    if not number_or_name:
        return cleaned

    # Avoid duplicating a number/name the display address already starts with.
    # The match must end on a token boundary: "1" is NOT already present in
    # "12 South Street", so a bare prefix test would wrongly drop it.
    lowered = cleaned.lower()
    ident = number_or_name.lower()
    if lowered.startswith(ident):
        tail = lowered[len(ident):]
        if not tail or not tail[0].isalnum():
            return cleaned

    return f"{number_or_name}, {cleaned}" if cleaned else number_or_name


def transform_property(
    prop: dict,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    detail_postcode: str | None = None,
) -> dict | None:
    """Transform a raw Rightmove property dict into our output schema.

    ``detail_postcode`` is the property's TRUE full postcode recovered from its
    detail page (see ``rightmove.parse_detail_postcode``); the search API itself
    only exposes the outcode-level ``displayAddress``. When supplied and it
    agrees with the coordinate-nearest postcode's outcode, it is preferred over
    the coordinate guess and recorded with source ``"detail_address"``. A detail
    postcode whose outcode disagrees with the location is discarded in favour of
    the spatially-correct coordinate postcode, so a stale or wrong detail value
    can never silently relocate a listing.

    ``outcode`` is accepted for interface compatibility and is not read here.

    Returns None when the listing has no location/coordinates, when
    ``pc_index.nearest`` yields no postcode, or when neither an id nor a
    property URL is available.
    """
    loc = prop.get("location")
    if not loc:
        return None
    raw_lat = loc.get("latitude")
    raw_lng = loc.get("longitude")
    if raw_lat is None or raw_lng is None:
        return None
    lat, lng = fix_coords(raw_lat, raw_lng)

    # ``or`` (rather than a .get default) also covers keys that are present
    # with an explicit null value.
    price_obj = prop.get("price") or {}
    amount = parse_int_value(price_obj.get("amount"))
    price = amount or 0
    display_prices = price_obj.get("displayPrices") or []
    price_qualifier = (
        display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
    )

    sub_type = prop.get("propertySubType", "")

    raw_beds = parse_int_value(prop.get("bedrooms")) or 0
    raw_baths = parse_int_value(prop.get("bathrooms")) or 0
    beds_ok = 0 <= raw_beds <= MAX_BEDROOMS
    baths_ok = 0 <= raw_baths <= MAX_BEDROOMS
    bedrooms = raw_beds if beds_ok else 0
    bathrooms = raw_baths if baths_ok else 0
    # Warn for any out-of-range count (negative as well as too large) so no
    # value is zeroed silently.
    if not (beds_ok and baths_ok):
        log.warning(
            "Rightmove %s: implausible beds=%d baths=%d (capped to 0)",
            prop.get("id", "?"),
            raw_beds,
            raw_baths,
        )

    key_features = [
        kf.get("description", "")
        for kf in prop.get("keyFeatures") or []
        if kf.get("description")
    ]

    inferred_postcode = pc_index.nearest(lat, lng)
    if not inferred_postcode:
        log.debug("No England postcode for property at %.4f, %.4f — skipping", lat, lng)
        return None

    raw_address = prop.get("displayAddress", "") or ""
    extracted_postcode = extract_full_postcode(raw_address)

    # Prefer the detail page's true full postcode when it agrees with the
    # location; otherwise fall back to the (display-address-or-coordinate) logic.
    detail_full = extract_full_postcode(detail_postcode)
    if detail_full and extract_outcode(detail_full) == extract_outcode(
        inferred_postcode
    ):
        postcode, postcode_source = detail_full, "detail_address"
    else:
        if detail_full:
            log.debug(
                "Rejecting Rightmove detail postcode %s (outcode mismatch with "
                "inferred %s)",
                detail_full,
                inferred_postcode,
            )
        postcode, postcode_source = resolve_listing_postcode(
            extracted_postcode, inferred_postcode
        )

    property_url = prop.get("propertyUrl") or ""
    if not isinstance(property_url, str):
        property_url = ""
    # Fall back to the URL as identifier when the payload carries no id.
    listing_id = prop.get("id") or property_url
    if not listing_id:
        return None

    return {
        "id": listing_id,
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        # NOTE(review): computed as bedrooms + bathrooms despite the column
        # name — confirm this is the intended proxy.
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Postcode source": postcode_source,
        "Extracted postcode": extracted_postcode,
        "Inferred postcode": inferred_postcode,
        "Listing raw address": raw_address,
        "Address per Property Register": clean_listing_address(raw_address),
        # Rightmove's displayAddress is street-level; no UPRN/house number.
        "UPRN": None,
        "Property number or name": None,
        "Leasehold/Freehold": extract_tenure(prop.get("tenure")),
        "Property type": map_property_type(sub_type),
        "Property sub-type": normalize_sub_type(sub_type),
        "price": price,
        "price_frequency": "",
        "Price qualifier": price_qualifier,
        "Total floor area (sqm)": parse_display_size(prop.get("displaySize")),
        "Listing URL": RIGHTMOVE_BASE + property_url if property_url else "",
        "Listing features": key_features,
        "first_visible_date": prop.get("firstVisibleDate", ""),
    }