all is well
This commit is contained in:
parent
eac1bd0d13
commit
2f149503bb
53 changed files with 1543 additions and 354 deletions
|
|
@ -24,7 +24,7 @@ import time
|
|||
|
||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_sub_type, validate_floor_area
|
||||
from transform import normalize_sub_type, parse_int_value, validate_floor_area
|
||||
|
||||
log = logging.getLogger("zoopla")
|
||||
|
||||
|
|
@ -106,7 +106,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
||||
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
||||
const recMatch = text.match(/(\d+)\s*reception/i);
|
||||
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
|
||||
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
|
||||
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
|
||||
|
||||
let tenure = '';
|
||||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
|
|
@ -141,7 +142,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
|
||||
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure, property_type,
|
||||
});
|
||||
}
|
||||
|
|
@ -181,7 +183,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
||||
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
||||
const recMatch = text.match(/(\d+)\s*reception/i);
|
||||
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
|
||||
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
|
||||
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
|
||||
|
||||
let address = '';
|
||||
for (const line of lines) {
|
||||
|
|
@ -225,7 +228,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
|
||||
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure, property_type,
|
||||
});
|
||||
}
|
||||
|
|
@ -611,7 +615,22 @@ def _map_property_type(raw_type: str | None) -> str:
|
|||
return canonical
|
||||
# Keyword fallback
|
||||
lower = raw_type.lower()
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
|
||||
excluded_flat_like = (
|
||||
"block of apartment",
|
||||
"house of multiple occupation",
|
||||
"private halls",
|
||||
"retirement",
|
||||
"serviced apartment",
|
||||
)
|
||||
if any(term in lower for term in excluded_flat_like):
|
||||
return "Other"
|
||||
if (
|
||||
"flat" in lower
|
||||
or "apartment" in lower
|
||||
or "maisonette" in lower
|
||||
or "studio" in lower
|
||||
or "penthouse" in lower
|
||||
):
|
||||
return "Flats/Maisonettes"
|
||||
if "semi" in lower and "detach" in lower:
|
||||
return "Semi-Detached"
|
||||
|
|
@ -634,8 +653,8 @@ def transform_property(
|
|||
|
||||
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
||||
from postcodes extracted from the address text."""
|
||||
price = raw.get("price")
|
||||
if not price or int(price) <= 0:
|
||||
price = parse_int_value(raw.get("price"))
|
||||
if not price or price <= 0:
|
||||
return None
|
||||
|
||||
address = raw.get("address", "")
|
||||
|
|
@ -670,10 +689,10 @@ def transform_property(
|
|||
if not (49 <= lat <= 56 and -7 <= lng <= 2):
|
||||
return None
|
||||
|
||||
raw_beds = raw.get("beds") or 0
|
||||
raw_baths = raw.get("baths") or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
raw_beds = parse_int_value(raw.get("beds")) or 0
|
||||
raw_baths = parse_int_value(raw.get("baths")) or 0
|
||||
bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
|
|
@ -683,9 +702,13 @@ def transform_property(
|
|||
|
||||
# Floor area: prefer the sq m value when present, otherwise convert sq ft to sq m
|
||||
floor_area_sqm = None
|
||||
sqft = raw.get("floor_area_sqft")
|
||||
if sqft:
|
||||
floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1))
|
||||
raw_sqm = raw.get("floor_area_sqm")
|
||||
if raw_sqm:
|
||||
floor_area_sqm = validate_floor_area(round(float(raw_sqm), 1))
|
||||
else:
|
||||
sqft = raw.get("floor_area_sqft")
|
||||
if sqft:
|
||||
floor_area_sqm = validate_floor_area(round(float(sqft) * 0.092903, 1))
|
||||
|
||||
listing_id = raw.get("id", "")
|
||||
listing_url = raw.get("url", "")
|
||||
|
|
@ -704,7 +727,7 @@ def transform_property(
|
|||
"Leasehold/Freehold": raw.get("tenure") or None,
|
||||
"Property type": _map_property_type(raw.get("property_type")),
|
||||
"Property sub-type": normalize_sub_type(raw.get("property_type")),
|
||||
"price": int(price),
|
||||
"price": price,
|
||||
"price_frequency": "",
|
||||
"Price qualifier": "",
|
||||
"Total floor area (sqm)": floor_area_sqm,
|
||||
|
|
@ -760,7 +783,18 @@ def search_outcode(
|
|||
properties = []
|
||||
dropped = 0
|
||||
for raw in raw_listings:
|
||||
transformed = transform_property(raw, pc_index, pc_coords, search_outcode=outcode)
|
||||
try:
|
||||
transformed = transform_property(
|
||||
raw, pc_index, pc_coords, search_outcode=outcode
|
||||
)
|
||||
except Exception as exc:
|
||||
log.warning(
|
||||
"Zoopla %s property %s failed to transform: %s",
|
||||
outcode,
|
||||
raw.get("id", "?"),
|
||||
exc,
|
||||
)
|
||||
transformed = None
|
||||
if transformed:
|
||||
properties.append(transformed)
|
||||
else:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue