Fix scrape

This commit is contained in:
Andras Schmelczer 2026-03-26 07:54:39 +00:00
parent bbc2fcb86c
commit 3adbaf435d
2 changed files with 131 additions and 27 deletions

View file

@ -39,7 +39,7 @@ class TurnstileError(Exception):
# Maximum search result pages to scrape per outcode (25 listings/page)
MAX_PAGES_PER_OUTCODE = 10
MAX_PAGES_PER_OUTCODE = 40
# JavaScript to extract listings from the rendered DOM.
# Uses data-testid attributes as primary selectors (stable across deployments),
@ -98,6 +98,12 @@ _EXTRACT_LISTINGS_JS = r"""() => {
if (/leasehold/i.test(text)) tenure = 'Leasehold';
else if (/freehold/i.test(text)) tenure = 'Freehold';
// Extract property type (e.g., "2 bed flat for sale" "flat")
let property_type = '';
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
if (ptMatch) property_type = ptMatch[1].trim();
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
results.push({
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
@ -106,7 +112,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
address, tenure,
address, tenure, property_type,
});
}
@ -160,6 +166,12 @@ _EXTRACT_LISTINGS_JS = r"""() => {
if (/leasehold/i.test(text)) tenure = 'Leasehold';
else if (/freehold/i.test(text)) tenure = 'Freehold';
// Extract property type
let property_type = '';
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
if (ptMatch2) property_type = ptMatch2[1].trim();
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
results.push({
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
@ -168,7 +180,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
address, tenure,
address, tenure, property_type,
});
}
}
@ -557,6 +569,32 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
# ---------------------------------------------------------------------------
# Cached outcode → (postcode, lat, lng) lookups to avoid repeated O(n) scans
# over 2.26M postcodes. Populated lazily on first lookup per outcode.
_outcode_coords_cache: dict[str, tuple[str, float, float] | None] = {}
def _resolve_outcode_coords(
outcode: str, pc_coords: dict[str, tuple[float, float]]
) -> tuple[str, float, float] | None:
"""Find first postcode + coords for an outcode. Result is cached."""
if outcode in _outcode_coords_cache:
return _outcode_coords_cache[outcode]
prefix = outcode + " "
for pcd, (lat, lng) in pc_coords.items():
if pcd.startswith(prefix) or (
len(outcode) >= 4
and pcd.startswith(outcode)
and len(pcd) > len(outcode)
):
_outcode_coords_cache[outcode] = (pcd, lat, lng)
return (pcd, lat, lng)
_outcode_coords_cache[outcode] = None
return None
def _extract_postcode(text: str) -> str | None:
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'."""
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
@ -585,11 +623,17 @@ def _map_property_type(raw_type: str | None) -> str:
"""Map Zoopla property type text to canonical type."""
if not raw_type:
return "Other"
# Exact match (handles Rightmove-style capitalised values)
canonical = PROPERTY_TYPE_MAP.get(raw_type)
if canonical:
return canonical
# Title-case match (handles regex-extracted lowercase like "town house" → "Town House")
canonical = PROPERTY_TYPE_MAP.get(raw_type.title())
if canonical:
return canonical
# Keyword fallback
lower = raw_type.lower()
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
return "Flats/Maisonettes"
if "detached" in lower and "semi" not in lower:
return "Detached"
@ -622,6 +666,7 @@ def transform_property(
channel: str,
pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]],
search_outcode: str | None = None,
) -> dict | None:
"""Transform a raw Zoopla listing dict into the standard output schema.
@ -643,22 +688,18 @@ def transform_property(
lat, lng = coords
if lat is None:
# Try outcode-level fallback
outcode = _extract_outcode(address)
if outcode:
# ONSPD 7-char format: 4-char outcodes have no space before incode
# (e.g., "BH191AB"), while shorter outcodes do (e.g., "E14 5AB").
# Check both formats to handle all outcode lengths.
prefix = outcode + " "
for pcd, coords in pc_coords.items():
if pcd.startswith(prefix) or (
len(outcode) >= 4
and pcd.startswith(outcode)
and len(pcd) > len(outcode)
):
postcode = pcd
lat, lng = coords
break
# Try outcode-level fallback from address text
addr_outcode = _extract_outcode(address)
if addr_outcode:
result = _resolve_outcode_coords(addr_outcode, pc_coords)
if result:
postcode, lat, lng = result
# Final fallback: use the outcode we know we're searching
if lat is None and search_outcode:
result = _resolve_outcode_coords(search_outcode, pc_coords)
if result:
postcode, lat, lng = result
if lat is None or lng is None or not postcode:
return None
@ -706,8 +747,8 @@ def transform_property(
"Postcode": postcode,
"Address per Property Register": address,
"Leasehold/Freehold": raw.get("tenure") or None,
"Property type": "Other", # Not reliably extractable from Zoopla search cards
"Property sub-type": "",
"Property type": _map_property_type(raw.get("property_type")),
"Property sub-type": raw.get("property_type") or "",
"price": int(price),
"price_frequency": frequency,
"Price qualifier": "",
@ -774,7 +815,7 @@ def search_outcode(
properties = []
dropped = 0
for raw in raw_listings:
transformed = transform_property(raw, channel, pc_index, pc_coords)
transformed = transform_property(raw, channel, pc_index, pc_coords, search_outcode=outcode)
if transformed:
properties.append(transformed)
zoopla_properties_scraped.labels(channel=channel_label).inc()