Fix zoopla
Some checks failed
CI / Rust (lint + test) (push) Failing after 6m54s
CI / Python (lint + test) (push) Failing after 7m8s
CI / Frontend (lint + typecheck) (push) Successful in 8m55s
Build and publish Docker image / build-and-push (push) Failing after 3m8s

This commit is contained in:
Andras Schmelczer 2026-03-24 08:12:23 +00:00
parent 13980a2887
commit 4f61c702b1
2 changed files with 186 additions and 42 deletions

View file

@@ -41,17 +41,23 @@ class TurnstileError(Exception):
MAX_PAGES_PER_OUTCODE = 10
# JavaScript to extract listings from the rendered DOM.
# Finds all detail links, walks up to the card container, and parses
# price, beds, baths, floor area, address, and tenure from the card text.
# Uses data-testid attributes as primary selectors (stable across deployments),
# then falls back to href-based link matching with parent-walking.
_EXTRACT_LISTINGS_JS = r"""() => {
const links = Array.from(document.querySelectorAll(
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
));
const seen = new Set();
const results = [];
for (const link of links) {
// Strategy 1: Use data-testid selectors (post-2025 redesign)
const listingCards = document.querySelectorAll(
'[data-testid="regular-listings"] > div, [data-testid="search-content"] li'
);
for (const card of listingCards) {
const link = card.querySelector(
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
);
if (!link) continue;
const href = link.href;
const match = href.match(/\/details\/(\d+)\//);
if (!match) continue;
@@ -60,53 +66,108 @@ _EXTRACT_LISTINGS_JS = r"""() => {
if (seen.has(id)) continue;
seen.add(id);
// Walk up to the listing card container
let card = link;
for (let j = 0; j < 10; j++) {
card = card.parentElement;
if (!card) break;
const text = card.innerText || '';
if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) {
break;
const text = card.innerText || '';
// Try data-testid price element first, then regex
const priceEl = card.querySelector('[data-testid="listing-price"]');
const priceText = priceEl ? priceEl.innerText : text;
const priceMatch = priceText.match(/\u00a3([\d,]+)/);
// Try address element first, then regex
const addressEl = card.querySelector('address');
let address = addressEl ? addressEl.innerText.trim() : '';
if (!address) {
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
for (const line of lines) {
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
address = line;
break;
}
}
}
if (!card) continue;
const text = card.innerText || '';
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
const priceMatch = text.match(/\u00a3([\d,]+)/);
const bedsMatch = text.match(/(\d+)\s*beds?/i);
const bathsMatch = text.match(/(\d+)\s*baths?/i);
const recMatch = text.match(/(\d+)\s*reception/i);
const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i);
let address = '';
for (const line of lines) {
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
address = line;
break;
}
}
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
let tenure = '';
if (/freehold/i.test(text)) tenure = 'Freehold';
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
results.push({
id: id,
url: href.replace(window.location.origin, ''),
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
receptions: recMatch ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
address: address,
tenure: tenure,
address, tenure,
});
}
// Strategy 2: Fall back to href-based link matching with parent-walking
if (results.length === 0) {
const links = Array.from(document.querySelectorAll(
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
));
for (const link of links) {
const href = link.href;
const match = href.match(/\/details\/(\d+)\//);
if (!match) continue;
const id = match[1];
if (seen.has(id)) continue;
seen.add(id);
let card = link;
for (let j = 0; j < 15; j++) {
card = card.parentElement;
if (!card) break;
const t = card.innerText || '';
if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) {
break;
}
}
if (!card) continue;
const text = card.innerText || '';
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
const priceMatch = text.match(/\u00a3([\d,]+)/);
const bedsMatch = text.match(/(\d+)\s*beds?/i);
const bathsMatch = text.match(/(\d+)\s*baths?/i);
const recMatch = text.match(/(\d+)\s*reception/i);
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
let address = '';
for (const line of lines) {
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
address = line;
break;
}
}
let tenure = '';
if (/freehold/i.test(text)) tenure = 'Freehold';
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
results.push({
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
receptions: recMatch ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
address, tenure,
});
}
}
return results;
}"""
@@ -255,18 +316,44 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
else:
search_input.press("Enter")
# Wait for results to load
time.sleep(6)
# Wait for results to load — try waiting for listings container, fall back to fixed wait
try:
page.wait_for_selector(
'[data-testid="regular-listings"], a[href*="/details/"]',
timeout=10000,
)
except Exception:
time.sleep(4)
_ensure_not_challenged(page)
return True
def _get_result_count(page) -> int:
"""Extract the total results count from the page body text."""
"""Extract the total results count from the page.
Tries __ZAD_TARGETING__ JSON first (most reliable), then body text regex
matching both "N results" and "N properties" patterns."""
try:
# Try the ZAD targeting JSON script tag first
count = page.evaluate("""() => {
const s = document.querySelector('#__ZAD_TARGETING__');
if (s) {
try {
const d = JSON.parse(s.textContent);
if (d.search_results_count != null) return d.search_results_count;
} catch(e) {}
}
return null;
}""")
if count is not None and count > 0:
return count
except Exception:
pass
try:
body = page.inner_text("body")
match = re.search(r"([\d,]+)\s+results?", body)
match = re.search(r"([\d,]+)\s+(?:results?|properties)", body)
if match:
return int(match.group(1).replace(",", ""))
except Exception:
@@ -279,10 +366,42 @@ def _get_result_count(page) -> int:
# ---------------------------------------------------------------------------
_first_extraction_logged = False
def _extract_listings(page) -> list[dict]:
"""Extract listing data from the current search results page DOM."""
global _first_extraction_logged
try:
return page.evaluate(_EXTRACT_LISTINGS_JS)
listings = page.evaluate(_EXTRACT_LISTINGS_JS)
# Log diagnostic info on the very first extraction attempt
if not _first_extraction_logged:
_first_extraction_logged = True
try:
diag = page.evaluate("""() => {
const details = document.querySelectorAll('a[href*="/details/"]');
const testids = document.querySelectorAll('[data-testid]');
const testidNames = [...new Set([...testids].map(e => e.dataset.testid))];
return {
url: location.href,
title: document.title,
detailLinks: details.length,
testids: testidNames.slice(0, 30),
bodySnippet: document.body?.innerText?.slice(0, 500) || '',
};
}""")
log.info(
"Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d "
"testids=%s bodySnippet=%.200s",
diag.get("url"), diag.get("title"), diag.get("detailLinks", 0),
diag.get("testids", []), diag.get("bodySnippet", ""),
)
except Exception:
pass
log.info("Zoopla first extraction: %d listings found", len(listings))
return listings
except Exception as e:
log.warning("Failed to extract listings from DOM: %s", e)
zoopla_errors_total.labels(type="extract_failed").inc()
@@ -502,19 +621,40 @@ def search_outcode(
return []
total_results = _get_result_count(page)
if total_results == 0:
return []
raw_listings = _paginate(page, total_results, channel)
# Always try extraction even if result count is 0 — the count regex may
# not match Zoopla's current text format, but listings may still be in DOM
raw_listings = _paginate(page, max(total_results, 25), channel)
if not raw_listings:
if total_results > 0:
log.debug(
"Zoopla %s %s: page claims %d results but extraction found 0 — "
"DOM selectors may need updating",
outcode, channel, total_results,
)
return []
channel_label = "buy" if channel == "BUY" else "rent"
properties = []
dropped = 0
for raw in raw_listings:
transformed = transform_property(raw, channel, pc_index, pc_coords)
if transformed:
properties.append(transformed)
zoopla_properties_scraped.labels(channel=channel_label).inc()
else:
dropped += 1
if dropped and not properties:
log.debug(
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
"(no price/postcode/coords)",
outcode, channel, len(raw_listings), dropped,
)
elif dropped > len(raw_listings) // 2:
log.debug(
"Zoopla %s %s: %d/%d listings dropped in transform",
outcode, channel, dropped, len(raw_listings),
)
return properties