perfect-postcode/finder/transform.py

230 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
import re
from constants import MAX_BEDROOMS, PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
from spatial import PostcodeSpatialIndex
# Module-level logger shared by the functions in this file; it is registered
# under the fixed name "rightmove" rather than the module's own name.
log = logging.getLogger("rightmove")
# Floor area bounds (sqm). Values outside this range are almost certainly
# data errors: sub-5 sqm catches garbled extractions (e.g., 0.1 sqm for a
# detached house), and >2000 sqm (~21,500 sq ft) exceeds even the largest
# UK mansions.
MIN_FLOOR_AREA_SQM = 5.0
MAX_FLOOR_AREA_SQM = 2000.0
def validate_floor_area(sqm: float | None) -> float | None:
"""Validate a floor area value. Returns None for nonsensical values.
Rejects values below MIN_FLOOR_AREA_SQM and above MAX_FLOOR_AREA_SQM,
which catches parsing errors where prices or other large numbers are
mistakenly extracted as floor area from free-text descriptions or DOM text.
"""
if sqm is None:
return None
if sqm < MIN_FLOOR_AREA_SQM or sqm > MAX_FLOOR_AREA_SQM:
return None
return sqm
def parse_display_size(display_size: str | None) -> float | None:
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
if not display_size:
return None
# Try sq. ft. first
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
if m:
sqft = float(m.group(1).replace(",", ""))
return validate_floor_area(round(sqft * 0.092903, 1))
# Try sq. m.
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
if m:
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
return None
def normalize_sub_type(sub_type: str | None) -> str:
"""Normalize property sub-type for consistent storage.
Fixes delimiter inconsistencies (underscores/hyphens → spaces) from
home.co.uk and truncates Zoopla description fragments that were
accidentally captured as sub-types.
"""
if not sub_type:
return "Unknown"
cleaned = sub_type.replace("_", " ").strip()
# Description fragments captured as sub-types are much longer than any
# real property type name (longest canonical is ~25 chars)
if len(cleaned) > 40:
return "Unknown"
# Collapse multiple spaces
cleaned = re.sub(r"\s+", " ", cleaned)
return cleaned.title()
def map_property_type(sub_type: str | None) -> str:
"""Map propertySubType to canonical type."""
if not sub_type:
return "Other"
canonical = PROPERTY_TYPE_MAP.get(sub_type)
if canonical:
return canonical
# Try title-case variant (e.g., "country house" → "Country House")
canonical = PROPERTY_TYPE_MAP.get(sub_type.title())
if canonical:
return canonical
# Try lowercase variant (e.g., "Townhouse" → "townhouse")
canonical = PROPERTY_TYPE_MAP.get(sub_type.lower())
if canonical:
return canonical
# Normalize delimiters (underscores/hyphens → spaces) and try again
normalized = re.sub(r"[-_]+", " ", sub_type).strip().title()
canonical = PROPERTY_TYPE_MAP.get(normalized)
if canonical:
return canonical
# Keyword fallback for compound types not in the map
lower = sub_type.lower()
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
return "Flats/Maisonettes"
if "semi" in lower and "detach" in lower:
return "Semi-Detached"
if "detach" in lower:
return "Detached"
if "terrace" in lower or "mews" in lower:
return "Terraced"
if "house" in lower or "cottage" in lower:
return "Detached"
log.warning("Unknown propertySubType: %r — mapping to Other", sub_type)
return "Other"
def extract_tenure(tenure_obj: dict | None) -> str | None:
"""Extract tenure string from tenure object."""
if not tenure_obj:
return None
tt = tenure_obj.get("tenureType", "")
if tt == "FREEHOLD":
return "Freehold"
if tt == "LEASEHOLD":
return "Leasehold"
return None
def fix_coords(lat: float, lng: float) -> tuple[float, float]:
    """Return ``(lat, lng)``, swapping the two if they look reversed.

    The pair is checked against a bounding box of latitude 49 to 56 and
    longitude -7 to 2 (both inclusive). If it fits, it is returned as given.
    If it only fits with the values exchanged, the swapped pair is returned
    and the swap is logged at debug level. If it fits neither way, a warning
    is logged and the pair is returned unchanged.
    """

    def in_bounds(latitude: float, longitude: float) -> bool:
        return 49 <= latitude <= 56 and -7 <= longitude <= 2

    if in_bounds(lat, lng):
        return lat, lng

    if not in_bounds(lng, lat):
        log.warning(
            "Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f",
            lat,
            lng,
        )
        return lat, lng

    log.debug(
        "Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f",
        lat,
        lng,
        lng,
        lat,
    )
    return lng, lat
def normalize_postcode(postcode: str) -> str:
    """Return the postcode upper-cased with one space before the last 3 chars.

    All whitespace is removed first, then a single space is re-inserted
    before the final three characters (the incode). Values shorter than five
    characters once whitespace is removed are returned compacted, with no
    space inserted.

    E.g., 'SW1A1AA' -> 'SW1A 1AA', 'N4  2HA' -> 'N4 2HA', 'E1 4AB' unchanged.
    """
    squeezed = re.sub(r"\s+", "", postcode).upper()
    if len(squeezed) >= 5:
        outward, inward = squeezed[:-3], squeezed[-3:]
        return f"{outward} {inward}"
    return squeezed
def normalize_price(amount: int, frequency: str) -> int:
"""Normalise price to monthly for rentals (weekly × 52/12, yearly ÷ 12)."""
if frequency == "weekly":
return round(amount * 52 / 12)
if frequency == "yearly":
return round(amount / 12)
return amount
def transform_property(
    prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
) -> dict | None:
    """Transform a raw Rightmove property dict into our output schema.

    Args:
        prop: Raw listing dict. Fields that are present but null are treated
            the same as fields that are absent.
        outcode: Not read by this function; kept in the signature so existing
            callers keep working.
        pc_index: Index whose ``nearest(lat, lng)`` supplies the postcode.

    Returns:
        The output-schema dict, or ``None`` when the listing is skipped. A
        listing is skipped when it has no location or coordinates, no
        positive price, a price qualifier mentioning POA or auction, or no
        postcode from ``pc_index``.
    """
    loc = prop.get("location")
    if not loc:
        return None
    raw_lat = loc.get("latitude")
    raw_lng = loc.get("longitude")
    if raw_lat is None or raw_lng is None:
        return None
    lat, lng = fix_coords(raw_lat, raw_lng)

    # `or <empty>` is used instead of a .get() default throughout: the default
    # only applies when the key is absent, not when it is present with a null
    # value, and a null here would otherwise raise further down.
    price_obj = prop.get("price") or {}
    amount = price_obj.get("amount")
    if not amount:
        return None
    frequency = price_obj.get("frequency", "")
    # Store raw price — normalization to monthly happens once in storage.py
    price = int(amount)
    if price <= 0:
        return None

    display_prices = price_obj.get("displayPrices") or []
    first_display = display_prices[0] if display_prices else None
    price_qualifier = (first_display or {}).get("displayPriceQualifier") or ""
    # POA / Auction listings have unreliable prices — treat as no price
    pq_lower = price_qualifier.lower()
    if "poa" in pq_lower or "auction" in pq_lower:
        return None

    sub_type = prop.get("propertySubType", "")

    # Counts above MAX_BEDROOMS are treated as implausible and stored as 0;
    # the same cap is applied to bathrooms.
    raw_beds = prop.get("bedrooms", 0) or 0
    raw_baths = prop.get("bathrooms", 0) or 0
    bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
    if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
        log.warning(
            "Rightmove %s: implausible beds=%d baths=%d (capped to 0)",
            prop.get("id", "?"), raw_beds, raw_baths,
        )

    # Keep only the features that carry a non-empty description
    key_features = [
        kf.get("description", "")
        for kf in prop.get("keyFeatures") or []
        if kf.get("description")
    ]

    postcode = pc_index.nearest(lat, lng)
    if not postcode:
        log.debug("No England postcode for property at %.4f, %.4f — skipping", lat, lng)
        return None

    return {
        "id": prop.get("id"),
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        # NOTE(review): filled with bedrooms + bathrooms although the key
        # names living rooms — confirm this is the intended proxy.
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Address per Property Register": prop.get("displayAddress", ""),
        "Leasehold/Freehold": extract_tenure(prop.get("tenure")),
        "Property type": map_property_type(sub_type),
        "Property sub-type": normalize_sub_type(sub_type),
        "price": price,
        "price_frequency": frequency,
        "Price qualifier": price_qualifier,
        "Total floor area (sqm)": parse_display_size(prop.get("displaySize")),
        "Listing URL": RIGHTMOVE_BASE + (prop.get("propertyUrl") or ""),
        "Listing features": key_features,
        "first_visible_date": prop.get("firstVisibleDate", ""),
    }