finder improvements
This commit is contained in:
parent
30055ab870
commit
3a3e249bdd
6 changed files with 225 additions and 39 deletions
|
|
@ -9,6 +9,12 @@ DELAY_BETWEEN_OUTCODES = 0.5
|
||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
RETRY_BASE_DELAY = 2.0
|
RETRY_BASE_DELAY = 2.0
|
||||||
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
|
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
|
||||||
|
MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsing errors
|
||||||
|
# Rent sanity bounds (monthly). Rents outside this range are nulled out — they are
|
||||||
|
# almost always total-stay pricing (e.g. "Golf Open 2026" short lets), annual rents
|
||||||
|
# mislabelled as monthly, or data errors.
|
||||||
|
MIN_RENT_MONTHLY = 50 # below £50/month is implausible for any UK property
|
||||||
|
MAX_RENT_MONTHLY = 25_000 # above £25k/month covers ultra-prime London; higher is suspect
|
||||||
SEED = 42
|
SEED = 42
|
||||||
CHECKPOINT_INTERVAL = int(os.environ.get("CHECKPOINT_INTERVAL", "900")) # seconds
|
CHECKPOINT_INTERVAL = int(os.environ.get("CHECKPOINT_INTERVAL", "900")) # seconds
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ from constants import (
|
||||||
HOMECOUK_API_BASE,
|
HOMECOUK_API_BASE,
|
||||||
HOMECOUK_BASE,
|
HOMECOUK_BASE,
|
||||||
HOMECOUK_PER_PAGE,
|
HOMECOUK_PER_PAGE,
|
||||||
|
MAX_BEDROOMS,
|
||||||
PROPERTY_TYPE_MAP,
|
PROPERTY_TYPE_MAP,
|
||||||
RETRY_BASE_DELAY,
|
RETRY_BASE_DELAY,
|
||||||
)
|
)
|
||||||
|
|
@ -25,6 +26,7 @@ from metrics import (
|
||||||
homecouk_requests_total,
|
homecouk_requests_total,
|
||||||
)
|
)
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
|
from transform import validate_floor_area
|
||||||
|
|
||||||
log = logging.getLogger("homecouk")
|
log = logging.getLogger("homecouk")
|
||||||
|
|
||||||
|
|
@ -216,10 +218,57 @@ def parse_floor_area(description: str | None) -> float | None:
|
||||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
|
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
|
||||||
if m:
|
if m:
|
||||||
sqft = float(m.group(1).replace(",", ""))
|
sqft = float(m.group(1).replace(",", ""))
|
||||||
return round(sqft * 0.092903, 1)
|
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
|
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
|
||||||
if m:
|
if m:
|
||||||
return round(float(m.group(1).replace(",", "")), 1)
|
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_tenure(prop: dict) -> str | None:
|
||||||
|
"""Extract tenure from home.co.uk property data.
|
||||||
|
|
||||||
|
Checks multiple sources in priority order:
|
||||||
|
1. Dedicated 'tenure' or 'tenure_type' field in the API response
|
||||||
|
2. Free-text search in the description for 'freehold' / 'leasehold'
|
||||||
|
3. Free-text search in features lists
|
||||||
|
|
||||||
|
home.co.uk aggregates listings from estate agents, so tenure is often
|
||||||
|
embedded in the description text rather than a structured field.
|
||||||
|
"""
|
||||||
|
# 1. Check dedicated tenure fields (in case the API adds them)
|
||||||
|
for key in ("tenure", "tenure_type", "tenureType"):
|
||||||
|
val = prop.get(key)
|
||||||
|
if val and isinstance(val, str):
|
||||||
|
lower = val.lower().strip()
|
||||||
|
if "leasehold" in lower:
|
||||||
|
return "Leasehold"
|
||||||
|
if "freehold" in lower:
|
||||||
|
return "Freehold"
|
||||||
|
|
||||||
|
# 2. Check description text — estate agents often include tenure here
|
||||||
|
description = prop.get("description") or ""
|
||||||
|
if description:
|
||||||
|
lower_desc = description.lower()
|
||||||
|
if re.search(r"\bleasehold\b", lower_desc):
|
||||||
|
return "Leasehold"
|
||||||
|
if re.search(r"\bfreehold\b", lower_desc):
|
||||||
|
# Matches "Freehold" and "Share of Freehold" (both = freehold ownership)
|
||||||
|
return "Freehold"
|
||||||
|
|
||||||
|
# 3. Check features / key_features lists if present
|
||||||
|
for key in ("features", "key_features", "keyFeatures"):
|
||||||
|
features = prop.get(key)
|
||||||
|
if features and isinstance(features, list):
|
||||||
|
for feat in features:
|
||||||
|
if not isinstance(feat, str):
|
||||||
|
continue
|
||||||
|
lower_feat = feat.lower()
|
||||||
|
if "leasehold" in lower_feat:
|
||||||
|
return "Leasehold"
|
||||||
|
if "freehold" in lower_feat:
|
||||||
|
return "Freehold"
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -267,7 +316,7 @@ def transform_property(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
price = prop.get("price") or prop.get("latest_price")
|
price = prop.get("price") or prop.get("latest_price")
|
||||||
if not price:
|
if not price or int(price) <= 0:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Home.co.uk provides postcodes directly, but fall back to spatial index
|
# Home.co.uk provides postcodes directly, but fall back to spatial index
|
||||||
|
|
@ -278,8 +327,16 @@ def transform_property(
|
||||||
log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
|
log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
bedrooms = prop.get("bedrooms", 0) or 0
|
raw_beds = prop.get("bedrooms", 0) or 0
|
||||||
bathrooms = prop.get("bathrooms", 0) or 0
|
raw_baths = prop.get("bathrooms", 0) or 0
|
||||||
|
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||||
|
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||||
|
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||||
|
log.warning(
|
||||||
|
"home.co.uk %s: implausible beds=%d baths=%d (capped to 0)",
|
||||||
|
prop.get("listing_id") or prop.get("property_id") or "?",
|
||||||
|
raw_beds, raw_baths,
|
||||||
|
)
|
||||||
|
|
||||||
listing_type = prop.get("listing_property_type") or prop.get("property_type") or ""
|
listing_type = prop.get("listing_property_type") or prop.get("property_type") or ""
|
||||||
address = prop.get("display_address") or prop.get("address") or ""
|
address = prop.get("display_address") or prop.get("address") or ""
|
||||||
|
|
@ -304,7 +361,7 @@ def transform_property(
|
||||||
"lat": lat,
|
"lat": lat,
|
||||||
"Postcode": postcode,
|
"Postcode": postcode,
|
||||||
"Address per Property Register": address,
|
"Address per Property Register": address,
|
||||||
"Leasehold/Freehold": None, # not available from home.co.uk
|
"Leasehold/Freehold": parse_tenure(prop),
|
||||||
"Property type": map_property_type(listing_type),
|
"Property type": map_property_type(listing_type),
|
||||||
"Property sub-type": listing_type or "Unknown",
|
"Property sub-type": listing_type or "Unknown",
|
||||||
"price": int(price),
|
"price": int(price),
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,7 @@ from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
from constants import (
|
from constants import (
|
||||||
DELAY_BETWEEN_PAGES,
|
DELAY_BETWEEN_PAGES,
|
||||||
|
MAX_BEDROOMS,
|
||||||
OPENRENT_BASE,
|
OPENRENT_BASE,
|
||||||
PROPERTY_TYPE_MAP,
|
PROPERTY_TYPE_MAP,
|
||||||
RETRY_BASE_DELAY,
|
RETRY_BASE_DELAY,
|
||||||
|
|
@ -45,6 +46,7 @@ from metrics import (
|
||||||
openrent_requests_total,
|
openrent_requests_total,
|
||||||
)
|
)
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
|
from transform import validate_floor_area
|
||||||
|
|
||||||
log = logging.getLogger("openrent")
|
log = logging.getLogger("openrent")
|
||||||
|
|
||||||
|
|
@ -607,10 +609,10 @@ def parse_floor_area(description: str | None) -> float | None:
|
||||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
|
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
|
||||||
if m:
|
if m:
|
||||||
sqft = float(m.group(1).replace(",", ""))
|
sqft = float(m.group(1).replace(",", ""))
|
||||||
return round(sqft * 0.092903, 1)
|
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
|
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
|
||||||
if m:
|
if m:
|
||||||
return round(float(m.group(1).replace(",", "")), 1)
|
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -651,7 +653,7 @@ def transform_property(
|
||||||
lat = detail.get("lat") or search_data.get("lat")
|
lat = detail.get("lat") or search_data.get("lat")
|
||||||
lng = detail.get("lng") or search_data.get("lng")
|
lng = detail.get("lng") or search_data.get("lng")
|
||||||
price = detail.get("price") or search_data.get("price")
|
price = detail.get("price") or search_data.get("price")
|
||||||
if not price:
|
if not price or int(price) <= 0:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
frequency = search_data.get("frequency", "monthly")
|
frequency = search_data.get("frequency", "monthly")
|
||||||
|
|
@ -701,8 +703,15 @@ def transform_property(
|
||||||
log.debug("No postcode for property — skipping")
|
log.debug("No postcode for property — skipping")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
bedrooms = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
|
raw_beds = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
|
||||||
bathrooms = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
|
raw_baths = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
|
||||||
|
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||||
|
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||||
|
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||||
|
log.warning(
|
||||||
|
"OpenRent %s: implausible beds=%d baths=%d (capped to 0)",
|
||||||
|
search_data.get("id", "?"), raw_beds, raw_baths,
|
||||||
|
)
|
||||||
|
|
||||||
# Title: prefer detail page (has h1 with full title)
|
# Title: prefer detail page (has h1 with full title)
|
||||||
title = detail.get("title") or search_data.get("title", "")
|
title = detail.get("title") or search_data.get("title", "")
|
||||||
|
|
@ -746,6 +755,9 @@ def transform_property(
|
||||||
"lat": lat,
|
"lat": lat,
|
||||||
"Postcode": postcode,
|
"Postcode": postcode,
|
||||||
"Address per Property Register": address,
|
"Address per Property Register": address,
|
||||||
|
# OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a
|
||||||
|
# property ownership concept that doesn't apply to rental listings. The
|
||||||
|
# landlord's tenure is not shown on OpenRent listing pages.
|
||||||
"Leasehold/Freehold": None,
|
"Leasehold/Freehold": None,
|
||||||
"Property type": map_property_type(property_type),
|
"Property type": map_property_type(property_type),
|
||||||
"Property sub-type": property_type or "Unknown",
|
"Property sub-type": property_type or "Unknown",
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ from pathlib import Path
|
||||||
|
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
|
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
|
||||||
from transform import normalize_price
|
from transform import normalize_price
|
||||||
|
|
||||||
log = logging.getLogger("rightmove")
|
log = logging.getLogger("rightmove")
|
||||||
|
|
@ -18,6 +19,30 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
||||||
log.warning("No properties to write to %s", path)
|
log.warning("No properties to write to %s", path)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Sanitize bedroom/bathroom counts — values above MAX_BEDROOMS are
|
||||||
|
# almost certainly prices or other numeric fields mis-parsed as bedrooms.
|
||||||
|
bad_count = 0
|
||||||
|
for p in properties:
|
||||||
|
for key in ("Bedrooms", "Bathrooms"):
|
||||||
|
val = p.get(key, 0) or 0
|
||||||
|
if val > MAX_BEDROOMS:
|
||||||
|
bad_count += 1
|
||||||
|
p[key] = None
|
||||||
|
# Recompute derived field after sanitization
|
||||||
|
beds = p.get("Bedrooms")
|
||||||
|
baths = p.get("Bathrooms")
|
||||||
|
if beds is None or baths is None:
|
||||||
|
p["Number of bedrooms & living rooms"] = None
|
||||||
|
else:
|
||||||
|
p["Number of bedrooms & living rooms"] = beds + baths
|
||||||
|
|
||||||
|
if bad_count:
|
||||||
|
log.warning(
|
||||||
|
"Sanitized %d properties with bedroom/bathroom counts > %d (set to null)",
|
||||||
|
bad_count,
|
||||||
|
MAX_BEDROOMS,
|
||||||
|
)
|
||||||
|
|
||||||
# Parse first_visible_date to datetime
|
# Parse first_visible_date to datetime
|
||||||
listing_dates = []
|
listing_dates = []
|
||||||
for p in properties:
|
for p in properties:
|
||||||
|
|
@ -36,15 +61,33 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
||||||
listing_dates.append(None)
|
listing_dates.append(None)
|
||||||
|
|
||||||
# Derive asking price / asking rent based on channel
|
# Derive asking price / asking rent based on channel
|
||||||
|
# Zero prices indicate parsing failures or POA/auction listings — treat as null
|
||||||
if channel == "buy":
|
if channel == "buy":
|
||||||
asking_prices = [p["price"] for p in properties]
|
asking_prices = [p["price"] if p["price"] > 0 else None for p in properties]
|
||||||
asking_rents = [None] * len(properties)
|
asking_rents = [None] * len(properties)
|
||||||
listing_statuses = ["For sale"] * len(properties)
|
listing_statuses = ["For sale"] * len(properties)
|
||||||
else:
|
else:
|
||||||
asking_prices = [None] * len(properties)
|
asking_prices = [None] * len(properties)
|
||||||
asking_rents = [
|
# Normalize to monthly, then apply sanity bounds. Rents outside
|
||||||
normalize_price(p["price"], p["price_frequency"]) for p in properties
|
# [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always total-stay
|
||||||
]
|
# pricing (short lets), annual rents mislabelled as monthly, or £0
|
||||||
|
# placeholders — null them out rather than polluting aggregates.
|
||||||
|
rent_outliers = 0
|
||||||
|
asking_rents = []
|
||||||
|
for p in properties:
|
||||||
|
monthly = normalize_price(p["price"], p["price_frequency"])
|
||||||
|
if monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY:
|
||||||
|
rent_outliers += 1
|
||||||
|
asking_rents.append(None)
|
||||||
|
else:
|
||||||
|
asking_rents.append(monthly)
|
||||||
|
if rent_outliers:
|
||||||
|
log.warning(
|
||||||
|
"Nulled %d rent outliers outside [£%d, £%d]/month",
|
||||||
|
rent_outliers,
|
||||||
|
MIN_RENT_MONTHLY,
|
||||||
|
MAX_RENT_MONTHLY,
|
||||||
|
)
|
||||||
listing_statuses = ["For rent"] * len(properties)
|
listing_statuses = ["For rent"] * len(properties)
|
||||||
|
|
||||||
df = pl.DataFrame(
|
df = pl.DataFrame(
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,31 @@
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from constants import PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
|
from constants import MAX_BEDROOMS, PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
|
|
||||||
log = logging.getLogger("rightmove")
|
log = logging.getLogger("rightmove")
|
||||||
|
|
||||||
|
|
||||||
|
# Maximum plausible floor area for a residential property listing (sqm).
|
||||||
|
# ~21,500 sq ft — covers even the largest UK mansions.
|
||||||
|
MAX_FLOOR_AREA_SQM = 2000.0
|
||||||
|
|
||||||
|
|
||||||
|
def validate_floor_area(sqm: float | None) -> float | None:
|
||||||
|
"""Validate a floor area value. Returns None for nonsensical values.
|
||||||
|
|
||||||
|
Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM,
|
||||||
|
which catches parsing errors where prices or other large numbers are
|
||||||
|
mistakenly extracted as floor area from free-text descriptions or DOM text.
|
||||||
|
"""
|
||||||
|
if sqm is None:
|
||||||
|
return None
|
||||||
|
if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM:
|
||||||
|
return None
|
||||||
|
return sqm
|
||||||
|
|
||||||
|
|
||||||
def parse_display_size(display_size: str | None) -> float | None:
|
def parse_display_size(display_size: str | None) -> float | None:
|
||||||
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
|
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
|
||||||
if not display_size:
|
if not display_size:
|
||||||
|
|
@ -15,11 +34,11 @@ def parse_display_size(display_size: str | None) -> float | None:
|
||||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
|
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
|
||||||
if m:
|
if m:
|
||||||
sqft = float(m.group(1).replace(",", ""))
|
sqft = float(m.group(1).replace(",", ""))
|
||||||
return round(sqft * 0.092903, 1)
|
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||||
# Try sq. m.
|
# Try sq. m.
|
||||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
|
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
|
||||||
if m:
|
if m:
|
||||||
return round(float(m.group(1).replace(",", "")), 1)
|
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -92,19 +111,34 @@ def transform_property(
|
||||||
|
|
||||||
price_obj = prop.get("price", {})
|
price_obj = prop.get("price", {})
|
||||||
amount = price_obj.get("amount")
|
amount = price_obj.get("amount")
|
||||||
if amount is None:
|
if not amount:
|
||||||
return None
|
return None
|
||||||
frequency = price_obj.get("frequency", "")
|
frequency = price_obj.get("frequency", "")
|
||||||
price = normalize_price(int(amount), frequency)
|
# Store raw price — normalization to monthly happens once in storage.py
|
||||||
|
price = int(amount)
|
||||||
|
if price <= 0:
|
||||||
|
return None
|
||||||
|
|
||||||
display_prices = price_obj.get("displayPrices", [])
|
display_prices = price_obj.get("displayPrices", [])
|
||||||
price_qualifier = (
|
price_qualifier = (
|
||||||
display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
|
display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# POA / Auction listings have unreliable prices — treat as no price
|
||||||
|
pq_lower = price_qualifier.lower()
|
||||||
|
if "poa" in pq_lower or "auction" in pq_lower:
|
||||||
|
return None
|
||||||
|
|
||||||
sub_type = prop.get("propertySubType", "")
|
sub_type = prop.get("propertySubType", "")
|
||||||
bedrooms = prop.get("bedrooms", 0) or 0
|
raw_beds = prop.get("bedrooms", 0) or 0
|
||||||
bathrooms = prop.get("bathrooms", 0) or 0
|
raw_baths = prop.get("bathrooms", 0) or 0
|
||||||
|
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||||
|
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||||
|
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||||
|
log.warning(
|
||||||
|
"Rightmove %s: implausible beds=%d baths=%d (capped to 0)",
|
||||||
|
prop.get("id", "?"), raw_beds, raw_baths,
|
||||||
|
)
|
||||||
|
|
||||||
key_features = [
|
key_features = [
|
||||||
kf.get("description", "")
|
kf.get("description", "")
|
||||||
|
|
|
||||||
|
|
@ -26,9 +26,10 @@ import logging
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||||
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
|
from transform import validate_floor_area
|
||||||
|
|
||||||
log = logging.getLogger("zoopla")
|
log = logging.getLogger("zoopla")
|
||||||
|
|
||||||
|
|
@ -94,15 +95,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
||||||
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
|
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
|
||||||
|
|
||||||
let tenure = '';
|
let tenure = '';
|
||||||
if (/freehold/i.test(text)) tenure = 'Freehold';
|
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||||
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||||
|
|
||||||
results.push({
|
results.push({
|
||||||
id, url: href.replace(window.location.origin, ''),
|
id, url: href.replace(window.location.origin, ''),
|
||||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||||
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
price_text: priceText.trim(),
|
||||||
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||||
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||||
|
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||||
address, tenure,
|
address, tenure,
|
||||||
});
|
});
|
||||||
|
|
@ -137,7 +139,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
||||||
const text = card.innerText || '';
|
const text = card.innerText || '';
|
||||||
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
||||||
|
|
||||||
const priceMatch = text.match(/\u00a3([\d,]+)/);
|
const priceEl2 = card.querySelector('[data-testid="listing-price"]');
|
||||||
|
const priceText2 = priceEl2 ? priceEl2.innerText : text;
|
||||||
|
const priceMatch = priceText2.match(/\u00a3([\d,]+)/);
|
||||||
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
||||||
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
||||||
const recMatch = text.match(/(\d+)\s*reception/i);
|
const recMatch = text.match(/(\d+)\s*reception/i);
|
||||||
|
|
@ -153,15 +157,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
||||||
}
|
}
|
||||||
|
|
||||||
let tenure = '';
|
let tenure = '';
|
||||||
if (/freehold/i.test(text)) tenure = 'Freehold';
|
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||||
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||||
|
|
||||||
results.push({
|
results.push({
|
||||||
id, url: href.replace(window.location.origin, ''),
|
id, url: href.replace(window.location.origin, ''),
|
||||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||||
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
price_text: priceText2.trim(),
|
||||||
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||||
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||||
|
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||||
address, tenure,
|
address, tenure,
|
||||||
});
|
});
|
||||||
|
|
@ -597,6 +602,21 @@ def _map_property_type(raw_type: str | None) -> str:
|
||||||
return "Other"
|
return "Other"
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_rent_frequency(price_text: str) -> str:
|
||||||
|
"""Detect rent frequency from Zoopla price text.
|
||||||
|
|
||||||
|
Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
|
||||||
|
'£18,000 pa'. Defaults to 'monthly' if no frequency indicator found.
|
||||||
|
"""
|
||||||
|
lower = price_text.lower()
|
||||||
|
if "pw" in lower or "per week" in lower or "/w" in lower:
|
||||||
|
return "weekly"
|
||||||
|
if "pa" in lower or "per annum" in lower or "/y" in lower or "per year" in lower:
|
||||||
|
return "yearly"
|
||||||
|
# pcm, per month, /m, or no indicator — default monthly
|
||||||
|
return "monthly"
|
||||||
|
|
||||||
|
|
||||||
def transform_property(
|
def transform_property(
|
||||||
raw: dict,
|
raw: dict,
|
||||||
channel: str,
|
channel: str,
|
||||||
|
|
@ -608,7 +628,7 @@ def transform_property(
|
||||||
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
||||||
from postcodes extracted from the address text."""
|
from postcodes extracted from the address text."""
|
||||||
price = raw.get("price")
|
price = raw.get("price")
|
||||||
if not price:
|
if not price or int(price) <= 0:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
address = raw.get("address", "")
|
address = raw.get("address", "")
|
||||||
|
|
@ -647,21 +667,35 @@ def transform_property(
|
||||||
if not (49 <= lat <= 56 and -7 <= lng <= 2):
|
if not (49 <= lat <= 56 and -7 <= lng <= 2):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
bedrooms = raw.get("beds") or 0
|
raw_beds = raw.get("beds") or 0
|
||||||
bathrooms = raw.get("baths") or 0
|
raw_baths = raw.get("baths") or 0
|
||||||
|
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||||
|
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||||
|
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||||
|
log.warning(
|
||||||
|
"Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
|
||||||
|
raw.get("id", "?"), raw_beds, raw_baths,
|
||||||
|
)
|
||||||
receptions = raw.get("receptions") or 0
|
receptions = raw.get("receptions") or 0
|
||||||
|
|
||||||
# Floor area: convert sq ft to sq m
|
# Floor area: convert sq ft to sq m
|
||||||
floor_area_sqm = None
|
floor_area_sqm = None
|
||||||
sqft = raw.get("floor_area_sqft")
|
sqft = raw.get("floor_area_sqft")
|
||||||
if sqft:
|
if sqft:
|
||||||
floor_area_sqm = round(sqft * 0.092903, 1)
|
floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1))
|
||||||
|
|
||||||
listing_id = raw.get("id", "")
|
listing_id = raw.get("id", "")
|
||||||
listing_url = raw.get("url", "")
|
listing_url = raw.get("url", "")
|
||||||
if listing_url and not listing_url.startswith("http"):
|
if listing_url and not listing_url.startswith("http"):
|
||||||
listing_url = ZOOPLA_BASE + listing_url
|
listing_url = ZOOPLA_BASE + listing_url
|
||||||
|
|
||||||
|
# Detect rent frequency from price text (e.g. "£1,500 pcm" vs "£350 pw")
|
||||||
|
if channel == "BUY":
|
||||||
|
frequency = ""
|
||||||
|
else:
|
||||||
|
price_text = raw.get("price_text", "")
|
||||||
|
frequency = _detect_rent_frequency(price_text)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"id": f"zp_{listing_id}",
|
"id": f"zp_{listing_id}",
|
||||||
"Bedrooms": bedrooms,
|
"Bedrooms": bedrooms,
|
||||||
|
|
@ -675,7 +709,7 @@ def transform_property(
|
||||||
"Property type": "Other", # Not reliably extractable from Zoopla search cards
|
"Property type": "Other", # Not reliably extractable from Zoopla search cards
|
||||||
"Property sub-type": "",
|
"Property sub-type": "",
|
||||||
"price": int(price),
|
"price": int(price),
|
||||||
"price_frequency": "" if channel == "BUY" else "monthly",
|
"price_frequency": frequency,
|
||||||
"Price qualifier": "",
|
"Price qualifier": "",
|
||||||
"Total floor area (sqm)": floor_area_sqm,
|
"Total floor area (sqm)": floor_area_sqm,
|
||||||
"Listing URL": listing_url,
|
"Listing URL": listing_url,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue