finder improvements
This commit is contained in:
parent
30055ab870
commit
3a3e249bdd
6 changed files with 225 additions and 39 deletions
|
|
@ -9,6 +9,12 @@ DELAY_BETWEEN_OUTCODES = 0.5
|
|||
MAX_RETRIES = 3
|
||||
RETRY_BASE_DELAY = 2.0
|
||||
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
|
||||
MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsing errors
|
||||
# Rent sanity bounds (monthly). Rents outside this range are nulled out — they are
|
||||
# almost always total-stay pricing (e.g. "Golf Open 2026" short lets), annual rents
|
||||
# mislabelled as monthly, or data errors.
|
||||
MIN_RENT_MONTHLY = 50 # below £50/month is implausible for any UK property
|
||||
MAX_RENT_MONTHLY = 25_000 # above £25k/month covers ultra-prime London; higher is suspect
|
||||
SEED = 42
|
||||
CHECKPOINT_INTERVAL = int(os.environ.get("CHECKPOINT_INTERVAL", "900")) # seconds
|
||||
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ from constants import (
|
|||
HOMECOUK_API_BASE,
|
||||
HOMECOUK_BASE,
|
||||
HOMECOUK_PER_PAGE,
|
||||
MAX_BEDROOMS,
|
||||
PROPERTY_TYPE_MAP,
|
||||
RETRY_BASE_DELAY,
|
||||
)
|
||||
|
|
@ -25,6 +26,7 @@ from metrics import (
|
|||
homecouk_requests_total,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import validate_floor_area
|
||||
|
||||
log = logging.getLogger("homecouk")
|
||||
|
||||
|
|
@ -216,10 +218,57 @@ def parse_floor_area(description: str | None) -> float | None:
|
|||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return round(sqft * 0.092903, 1)
|
||||
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
|
||||
if m:
|
||||
return round(float(m.group(1).replace(",", "")), 1)
|
||||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||
return None
|
||||
|
||||
|
||||
def parse_tenure(prop: dict) -> str | None:
|
||||
"""Extract tenure from home.co.uk property data.
|
||||
|
||||
Checks multiple sources in priority order:
|
||||
1. Dedicated 'tenure' or 'tenure_type' field in the API response
|
||||
2. Free-text search in the description for 'freehold' / 'leasehold'
|
||||
3. Free-text search in features lists
|
||||
|
||||
home.co.uk aggregates listings from estate agents, so tenure is often
|
||||
embedded in the description text rather than a structured field.
|
||||
"""
|
||||
# 1. Check dedicated tenure fields (in case the API adds them)
|
||||
for key in ("tenure", "tenure_type", "tenureType"):
|
||||
val = prop.get(key)
|
||||
if val and isinstance(val, str):
|
||||
lower = val.lower().strip()
|
||||
if "leasehold" in lower:
|
||||
return "Leasehold"
|
||||
if "freehold" in lower:
|
||||
return "Freehold"
|
||||
|
||||
# 2. Check description text — estate agents often include tenure here
|
||||
description = prop.get("description") or ""
|
||||
if description:
|
||||
lower_desc = description.lower()
|
||||
if re.search(r"\bleasehold\b", lower_desc):
|
||||
return "Leasehold"
|
||||
if re.search(r"\bfreehold\b", lower_desc):
|
||||
# Matches "Freehold" and "Share of Freehold" (both = freehold ownership)
|
||||
return "Freehold"
|
||||
|
||||
# 3. Check features / key_features lists if present
|
||||
for key in ("features", "key_features", "keyFeatures"):
|
||||
features = prop.get(key)
|
||||
if features and isinstance(features, list):
|
||||
for feat in features:
|
||||
if not isinstance(feat, str):
|
||||
continue
|
||||
lower_feat = feat.lower()
|
||||
if "leasehold" in lower_feat:
|
||||
return "Leasehold"
|
||||
if "freehold" in lower_feat:
|
||||
return "Freehold"
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -267,7 +316,7 @@ def transform_property(
|
|||
return None
|
||||
|
||||
price = prop.get("price") or prop.get("latest_price")
|
||||
if not price:
|
||||
if not price or int(price) <= 0:
|
||||
return None
|
||||
|
||||
# Home.co.uk provides postcodes directly, but fall back to spatial index
|
||||
|
|
@ -278,8 +327,16 @@ def transform_property(
|
|||
log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
|
||||
return None
|
||||
|
||||
bedrooms = prop.get("bedrooms", 0) or 0
|
||||
bathrooms = prop.get("bathrooms", 0) or 0
|
||||
raw_beds = prop.get("bedrooms", 0) or 0
|
||||
raw_baths = prop.get("bathrooms", 0) or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"home.co.uk %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
prop.get("listing_id") or prop.get("property_id") or "?",
|
||||
raw_beds, raw_baths,
|
||||
)
|
||||
|
||||
listing_type = prop.get("listing_property_type") or prop.get("property_type") or ""
|
||||
address = prop.get("display_address") or prop.get("address") or ""
|
||||
|
|
@ -304,7 +361,7 @@ def transform_property(
|
|||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Address per Property Register": address,
|
||||
"Leasehold/Freehold": None, # not available from home.co.uk
|
||||
"Leasehold/Freehold": parse_tenure(prop),
|
||||
"Property type": map_property_type(listing_type),
|
||||
"Property sub-type": listing_type or "Unknown",
|
||||
"price": int(price),
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ from playwright.sync_api import sync_playwright
|
|||
|
||||
from constants import (
|
||||
DELAY_BETWEEN_PAGES,
|
||||
MAX_BEDROOMS,
|
||||
OPENRENT_BASE,
|
||||
PROPERTY_TYPE_MAP,
|
||||
RETRY_BASE_DELAY,
|
||||
|
|
@ -45,6 +46,7 @@ from metrics import (
|
|||
openrent_requests_total,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import validate_floor_area
|
||||
|
||||
log = logging.getLogger("openrent")
|
||||
|
||||
|
|
@ -607,10 +609,10 @@ def parse_floor_area(description: str | None) -> float | None:
|
|||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return round(sqft * 0.092903, 1)
|
||||
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
|
||||
if m:
|
||||
return round(float(m.group(1).replace(",", "")), 1)
|
||||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -651,7 +653,7 @@ def transform_property(
|
|||
lat = detail.get("lat") or search_data.get("lat")
|
||||
lng = detail.get("lng") or search_data.get("lng")
|
||||
price = detail.get("price") or search_data.get("price")
|
||||
if not price:
|
||||
if not price or int(price) <= 0:
|
||||
return None
|
||||
|
||||
frequency = search_data.get("frequency", "monthly")
|
||||
|
|
@ -701,8 +703,15 @@ def transform_property(
|
|||
log.debug("No postcode for property — skipping")
|
||||
return None
|
||||
|
||||
bedrooms = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
|
||||
bathrooms = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
|
||||
raw_beds = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
|
||||
raw_baths = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"OpenRent %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
search_data.get("id", "?"), raw_beds, raw_baths,
|
||||
)
|
||||
|
||||
# Title: prefer detail page (has h1 with full title)
|
||||
title = detail.get("title") or search_data.get("title", "")
|
||||
|
|
@ -746,6 +755,9 @@ def transform_property(
|
|||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Address per Property Register": address,
|
||||
# OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a
|
||||
# property ownership concept that doesn't apply to rental listings. The
|
||||
# landlord's tenure is not shown on OpenRent listing pages.
|
||||
"Leasehold/Freehold": None,
|
||||
"Property type": map_property_type(property_type),
|
||||
"Property sub-type": property_type or "Unknown",
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ from pathlib import Path
|
|||
|
||||
import polars as pl
|
||||
|
||||
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
|
||||
from transform import normalize_price
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
|
@ -18,6 +19,30 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
log.warning("No properties to write to %s", path)
|
||||
return
|
||||
|
||||
# Sanitize bedroom/bathroom counts — values above MAX_BEDROOMS are
|
||||
# almost certainly prices or other numeric fields mis-parsed as bedrooms.
|
||||
bad_count = 0
|
||||
for p in properties:
|
||||
for key in ("Bedrooms", "Bathrooms"):
|
||||
val = p.get(key, 0) or 0
|
||||
if val > MAX_BEDROOMS:
|
||||
bad_count += 1
|
||||
p[key] = None
|
||||
# Recompute derived field after sanitization
|
||||
beds = p.get("Bedrooms")
|
||||
baths = p.get("Bathrooms")
|
||||
if beds is None or baths is None:
|
||||
p["Number of bedrooms & living rooms"] = None
|
||||
else:
|
||||
p["Number of bedrooms & living rooms"] = beds + baths
|
||||
|
||||
if bad_count:
|
||||
log.warning(
|
||||
"Sanitized %d properties with bedroom/bathroom counts > %d (set to null)",
|
||||
bad_count,
|
||||
MAX_BEDROOMS,
|
||||
)
|
||||
|
||||
# Parse first_visible_date to datetime
|
||||
listing_dates = []
|
||||
for p in properties:
|
||||
|
|
@ -36,15 +61,33 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
listing_dates.append(None)
|
||||
|
||||
# Derive asking price / asking rent based on channel
|
||||
# Zero prices indicate parsing failures or POA/auction listings — treat as null
|
||||
if channel == "buy":
|
||||
asking_prices = [p["price"] for p in properties]
|
||||
asking_prices = [p["price"] if p["price"] > 0 else None for p in properties]
|
||||
asking_rents = [None] * len(properties)
|
||||
listing_statuses = ["For sale"] * len(properties)
|
||||
else:
|
||||
asking_prices = [None] * len(properties)
|
||||
asking_rents = [
|
||||
normalize_price(p["price"], p["price_frequency"]) for p in properties
|
||||
]
|
||||
# Normalize to monthly, then apply sanity bounds. Rents outside
|
||||
# [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always total-stay
|
||||
# pricing (short lets), annual rents mislabelled as monthly, or £0
|
||||
# placeholders — null them out rather than polluting aggregates.
|
||||
rent_outliers = 0
|
||||
asking_rents = []
|
||||
for p in properties:
|
||||
monthly = normalize_price(p["price"], p["price_frequency"])
|
||||
if monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY:
|
||||
rent_outliers += 1
|
||||
asking_rents.append(None)
|
||||
else:
|
||||
asking_rents.append(monthly)
|
||||
if rent_outliers:
|
||||
log.warning(
|
||||
"Nulled %d rent outliers outside [£%d, £%d]/month",
|
||||
rent_outliers,
|
||||
MIN_RENT_MONTHLY,
|
||||
MAX_RENT_MONTHLY,
|
||||
)
|
||||
listing_statuses = ["For rent"] * len(properties)
|
||||
|
||||
df = pl.DataFrame(
|
||||
|
|
|
|||
|
|
@ -1,12 +1,31 @@
|
|||
import logging
|
||||
import re
|
||||
|
||||
from constants import PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
|
||||
from constants import MAX_BEDROOMS, PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
|
||||
from spatial import PostcodeSpatialIndex
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
# Maximum plausible floor area for a residential property listing (sqm).
|
||||
# ~21,500 sq ft — covers even the largest UK mansions.
|
||||
MAX_FLOOR_AREA_SQM = 2000.0
|
||||
|
||||
|
||||
def validate_floor_area(sqm: float | None) -> float | None:
|
||||
"""Validate a floor area value. Returns None for nonsensical values.
|
||||
|
||||
Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM,
|
||||
which catches parsing errors where prices or other large numbers are
|
||||
mistakenly extracted as floor area from free-text descriptions or DOM text.
|
||||
"""
|
||||
if sqm is None:
|
||||
return None
|
||||
if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM:
|
||||
return None
|
||||
return sqm
|
||||
|
||||
|
||||
def parse_display_size(display_size: str | None) -> float | None:
|
||||
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
|
||||
if not display_size:
|
||||
|
|
@ -15,11 +34,11 @@ def parse_display_size(display_size: str | None) -> float | None:
|
|||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return round(sqft * 0.092903, 1)
|
||||
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||
# Try sq. m.
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
|
||||
if m:
|
||||
return round(float(m.group(1).replace(",", "")), 1)
|
||||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -92,19 +111,34 @@ def transform_property(
|
|||
|
||||
price_obj = prop.get("price", {})
|
||||
amount = price_obj.get("amount")
|
||||
if amount is None:
|
||||
if not amount:
|
||||
return None
|
||||
frequency = price_obj.get("frequency", "")
|
||||
price = normalize_price(int(amount), frequency)
|
||||
# Store raw price — normalization to monthly happens once in storage.py
|
||||
price = int(amount)
|
||||
if price <= 0:
|
||||
return None
|
||||
|
||||
display_prices = price_obj.get("displayPrices", [])
|
||||
price_qualifier = (
|
||||
display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
|
||||
)
|
||||
|
||||
# POA / Auction listings have unreliable prices — treat as no price
|
||||
pq_lower = price_qualifier.lower()
|
||||
if "poa" in pq_lower or "auction" in pq_lower:
|
||||
return None
|
||||
|
||||
sub_type = prop.get("propertySubType", "")
|
||||
bedrooms = prop.get("bedrooms", 0) or 0
|
||||
bathrooms = prop.get("bathrooms", 0) or 0
|
||||
raw_beds = prop.get("bedrooms", 0) or 0
|
||||
raw_baths = prop.get("bathrooms", 0) or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"Rightmove %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
prop.get("id", "?"), raw_beds, raw_baths,
|
||||
)
|
||||
|
||||
key_features = [
|
||||
kf.get("description", "")
|
||||
|
|
|
|||
|
|
@ -26,9 +26,10 @@ import logging
|
|||
import re
|
||||
import time
|
||||
|
||||
from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import validate_floor_area
|
||||
|
||||
log = logging.getLogger("zoopla")
|
||||
|
||||
|
|
@ -94,15 +95,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
|
||||
|
||||
let tenure = '';
|
||||
if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
||||
price_text: priceText.trim(),
|
||||
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure,
|
||||
});
|
||||
|
|
@ -137,7 +139,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
const text = card.innerText || '';
|
||||
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
||||
|
||||
const priceMatch = text.match(/\u00a3([\d,]+)/);
|
||||
const priceEl2 = card.querySelector('[data-testid="listing-price"]');
|
||||
const priceText2 = priceEl2 ? priceEl2.innerText : text;
|
||||
const priceMatch = priceText2.match(/\u00a3([\d,]+)/);
|
||||
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
||||
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
||||
const recMatch = text.match(/(\d+)\s*reception/i);
|
||||
|
|
@ -153,15 +157,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
}
|
||||
|
||||
let tenure = '';
|
||||
if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
||||
price_text: priceText2.trim(),
|
||||
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure,
|
||||
});
|
||||
|
|
@ -597,6 +602,21 @@ def _map_property_type(raw_type: str | None) -> str:
|
|||
return "Other"
|
||||
|
||||
|
||||
def _detect_rent_frequency(price_text: str) -> str:
|
||||
"""Detect rent frequency from Zoopla price text.
|
||||
|
||||
Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
|
||||
'£18,000 pa'. Defaults to 'monthly' if no frequency indicator found.
|
||||
"""
|
||||
lower = price_text.lower()
|
||||
if "pw" in lower or "per week" in lower or "/w" in lower:
|
||||
return "weekly"
|
||||
if "pa" in lower or "per annum" in lower or "/y" in lower or "per year" in lower:
|
||||
return "yearly"
|
||||
# pcm, per month, /m, or no indicator — default monthly
|
||||
return "monthly"
|
||||
|
||||
|
||||
def transform_property(
|
||||
raw: dict,
|
||||
channel: str,
|
||||
|
|
@ -608,7 +628,7 @@ def transform_property(
|
|||
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
||||
from postcodes extracted from the address text."""
|
||||
price = raw.get("price")
|
||||
if not price:
|
||||
if not price or int(price) <= 0:
|
||||
return None
|
||||
|
||||
address = raw.get("address", "")
|
||||
|
|
@ -647,21 +667,35 @@ def transform_property(
|
|||
if not (49 <= lat <= 56 and -7 <= lng <= 2):
|
||||
return None
|
||||
|
||||
bedrooms = raw.get("beds") or 0
|
||||
bathrooms = raw.get("baths") or 0
|
||||
raw_beds = raw.get("beds") or 0
|
||||
raw_baths = raw.get("baths") or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
raw.get("id", "?"), raw_beds, raw_baths,
|
||||
)
|
||||
receptions = raw.get("receptions") or 0
|
||||
|
||||
# Floor area: convert sq ft to sq m
|
||||
floor_area_sqm = None
|
||||
sqft = raw.get("floor_area_sqft")
|
||||
if sqft:
|
||||
floor_area_sqm = round(sqft * 0.092903, 1)
|
||||
floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1))
|
||||
|
||||
listing_id = raw.get("id", "")
|
||||
listing_url = raw.get("url", "")
|
||||
if listing_url and not listing_url.startswith("http"):
|
||||
listing_url = ZOOPLA_BASE + listing_url
|
||||
|
||||
# Detect rent frequency from price text (e.g. "£1,500 pcm" vs "£350 pw")
|
||||
if channel == "BUY":
|
||||
frequency = ""
|
||||
else:
|
||||
price_text = raw.get("price_text", "")
|
||||
frequency = _detect_rent_frequency(price_text)
|
||||
|
||||
return {
|
||||
"id": f"zp_{listing_id}",
|
||||
"Bedrooms": bedrooms,
|
||||
|
|
@ -675,7 +709,7 @@ def transform_property(
|
|||
"Property type": "Other", # Not reliably extractable from Zoopla search cards
|
||||
"Property sub-type": "",
|
||||
"price": int(price),
|
||||
"price_frequency": "" if channel == "BUY" else "monthly",
|
||||
"price_frequency": frequency,
|
||||
"Price qualifier": "",
|
||||
"Total floor area (sqm)": floor_area_sqm,
|
||||
"Listing URL": listing_url,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue