finder improvements
This commit is contained in:
parent
30055ab870
commit
3a3e249bdd
6 changed files with 225 additions and 39 deletions
|
|
@ -26,9 +26,10 @@ import logging
|
|||
import re
|
||||
import time
|
||||
|
||||
from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import validate_floor_area
|
||||
|
||||
log = logging.getLogger("zoopla")
|
||||
|
||||
|
|
@ -94,15 +95,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
|
||||
|
||||
let tenure = '';
|
||||
if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
||||
price_text: priceText.trim(),
|
||||
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure,
|
||||
});
|
||||
|
|
@ -137,7 +139,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
const text = card.innerText || '';
|
||||
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
||||
|
||||
const priceMatch = text.match(/\u00a3([\d,]+)/);
|
||||
const priceEl2 = card.querySelector('[data-testid="listing-price"]');
|
||||
const priceText2 = priceEl2 ? priceEl2.innerText : text;
|
||||
const priceMatch = priceText2.match(/\u00a3([\d,]+)/);
|
||||
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
||||
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
||||
const recMatch = text.match(/(\d+)\s*reception/i);
|
||||
|
|
@ -153,15 +157,16 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
}
|
||||
|
||||
let tenure = '';
|
||||
if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
||||
price_text: priceText2.trim(),
|
||||
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure,
|
||||
});
|
||||
|
|
@ -597,6 +602,21 @@ def _map_property_type(raw_type: str | None) -> str:
|
|||
return "Other"
|
||||
|
||||
|
||||
def _detect_rent_frequency(price_text: str) -> str:
|
||||
"""Detect rent frequency from Zoopla price text.
|
||||
|
||||
Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
|
||||
'£18,000 pa'. Defaults to 'monthly' if no frequency indicator found.
|
||||
"""
|
||||
lower = price_text.lower()
|
||||
if "pw" in lower or "per week" in lower or "/w" in lower:
|
||||
return "weekly"
|
||||
if "pa" in lower or "per annum" in lower or "/y" in lower or "per year" in lower:
|
||||
return "yearly"
|
||||
# pcm, per month, /m, or no indicator — default monthly
|
||||
return "monthly"
|
||||
|
||||
|
||||
def transform_property(
|
||||
raw: dict,
|
||||
channel: str,
|
||||
|
|
@ -608,7 +628,7 @@ def transform_property(
|
|||
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
||||
from postcodes extracted from the address text."""
|
||||
price = raw.get("price")
|
||||
if not price:
|
||||
if not price or int(price) <= 0:
|
||||
return None
|
||||
|
||||
address = raw.get("address", "")
|
||||
|
|
@ -647,21 +667,35 @@ def transform_property(
|
|||
if not (49 <= lat <= 56 and -7 <= lng <= 2):
|
||||
return None
|
||||
|
||||
bedrooms = raw.get("beds") or 0
|
||||
bathrooms = raw.get("baths") or 0
|
||||
raw_beds = raw.get("beds") or 0
|
||||
raw_baths = raw.get("baths") or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
raw.get("id", "?"), raw_beds, raw_baths,
|
||||
)
|
||||
receptions = raw.get("receptions") or 0
|
||||
|
||||
# Floor area: convert sq ft to sq m
|
||||
floor_area_sqm = None
|
||||
sqft = raw.get("floor_area_sqft")
|
||||
if sqft:
|
||||
floor_area_sqm = round(sqft * 0.092903, 1)
|
||||
floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1))
|
||||
|
||||
listing_id = raw.get("id", "")
|
||||
listing_url = raw.get("url", "")
|
||||
if listing_url and not listing_url.startswith("http"):
|
||||
listing_url = ZOOPLA_BASE + listing_url
|
||||
|
||||
# Detect rent frequency from price text (e.g. "£1,500 pcm" vs "£350 pw")
|
||||
if channel == "BUY":
|
||||
frequency = ""
|
||||
else:
|
||||
price_text = raw.get("price_text", "")
|
||||
frequency = _detect_rent_frequency(price_text)
|
||||
|
||||
return {
|
||||
"id": f"zp_{listing_id}",
|
||||
"Bedrooms": bedrooms,
|
||||
|
|
@ -675,7 +709,7 @@ def transform_property(
|
|||
"Property type": "Other", # Not reliably extractable from Zoopla search cards
|
||||
"Property sub-type": "",
|
||||
"price": int(price),
|
||||
"price_frequency": "" if channel == "BUY" else "monthly",
|
||||
"price_frequency": frequency,
|
||||
"Price qualifier": "",
|
||||
"Total floor area (sqm)": floor_area_sqm,
|
||||
"Listing URL": listing_url,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue