checkpoint finder

This commit is contained in:
Andras Schmelczer 2026-03-24 22:30:37 +00:00
parent 8616837c01
commit 96dfdd7491
5 changed files with 130 additions and 20 deletions

View file

@ -263,6 +263,46 @@ def _ensure_not_challenged(page) -> None:
# ---------------------------------------------------------------------------
def _navigate_direct(page, url: str) -> bool:
    """Load a Zoopla search-results URL directly, bypassing the homepage flow.

    Intended for loading the second channel (e.g., RENT after BUY) of the same
    outcode by swapping the path component of an already-known search URL.

    Args:
        page: Live browser page; must provide ``goto``, ``wait_for_function``
            and ``query_selector``.
        url: Search-results URL to open.

    Returns:
        True once the page has been loaded and waited on. False if the
        navigation itself raised, or if listing content never hydrated and the
        page contains no listing-detail links — the caller can then retry via
        the full search flow.

    Any exception raised by ``_ensure_not_challenged`` propagates to the caller
    (presumably TurnstileError when Cloudflare blocks the session — confirm
    against that helper).
    """
    try:
        page.goto(url, wait_until="domcontentloaded", timeout=30000)
    except Exception as exc:
        log.debug("Direct navigation failed: %s", exc)
        return False

    _ensure_not_challenged(page)

    # Browser-side predicate: true as soon as at least one listing card carries
    # a pound sign and more than 50 characters of text, i.e. the card content
    # has been filled in rather than being an empty structural container.
    # The JS text is kept verbatim, hence the flush-left lines.
    cards_hydrated_js = """() => {
const cards = document.querySelectorAll(
'[data-testid="regular-listings"] > div'
);
if (cards.length === 0) return false;
for (const card of cards) {
const t = card.innerText || '';
if (t.includes('\\u00a3') && t.length > 50) return true;
}
return false;
}"""

    try:
        page.wait_for_function(cards_hydrated_js, timeout=8000)
    except Exception:
        # The hydration wait failed (typically a timeout). Give up only when
        # the page has no listing-detail links at all.
        if not page.query_selector('a[href*="/details/"]'):
            return False
        # NOTE(review): indentation was lost in the reviewed source; this pause
        # is assumed to belong to the fallback branch (as in the pagination
        # wait) — confirm against the real file.
        time.sleep(1.5)

    return True
def _navigate_search(page, outcode: str, channel: str) -> bool:
"""Navigate to search results for an outcode via the homepage search flow.
@ -270,12 +310,12 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
Raises TurnstileError if Cloudflare blocks us."""
# Navigate to homepage to reset search state
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
time.sleep(2)
time.sleep(0.5)
_ensure_not_challenged(page)
# Dismiss cookie consent (may reappear after navigation)
page.evaluate(_DISMISS_COOKIES_JS)
time.sleep(1)
time.sleep(0.3)
# Select Buy/Rent tab
if channel == "RENT":
@ -284,7 +324,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
)
if rent_tab:
rent_tab.click()
time.sleep(0.5)
time.sleep(0.2)
# Find and fill search input
search_input = page.query_selector(
@ -295,10 +335,10 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
return False
search_input.click()
time.sleep(0.3)
time.sleep(0.1)
search_input.fill("")
search_input.type(outcode, delay=60)
time.sleep(2)
time.sleep(1.2)
# Select first autocomplete suggestion
first_option = page.query_selector('[role="option"]')
@ -307,7 +347,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
return False
first_option.click()
time.sleep(0.5)
time.sleep(0.2)
# Click search button
search_btn = page.query_selector('button:has-text("Search")')
@ -326,6 +366,29 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
time.sleep(4)
_ensure_not_challenged(page)
# Wait for client-side hydration to populate listing content (prices, addresses).
# The structural container appears in server-rendered HTML before React hydrates
# the actual card content — extracting too early yields empty price/address fields.
try:
page.wait_for_function(
"""() => {
const cards = document.querySelectorAll(
'[data-testid="regular-listings"] > div'
);
if (cards.length === 0) return false;
for (const card of cards) {
const t = card.innerText || '';
if (t.includes('\\u00a3') && t.length > 50) return true;
}
return false;
}""",
timeout=8000,
)
except Exception:
# Content never appeared — extraction will likely fail but let it try
log.debug("Listing content hydration wait timed out — prices may not have rendered")
time.sleep(2)
return True
@ -437,8 +500,25 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
try:
page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
time.sleep(4)
_ensure_not_challenged(page)
# Wait for listing content instead of fixed sleep
try:
page.wait_for_function(
"""() => {
const cards = document.querySelectorAll(
'[data-testid="regular-listings"] > div'
);
if (cards.length === 0) return false;
for (const card of cards) {
const t = card.innerText || '';
if (t.includes('\\u00a3') && t.length > 50) return true;
}
return false;
}""",
timeout=8000,
)
except Exception:
time.sleep(1.5)
except TurnstileError:
raise
except Exception as e:
@ -608,17 +688,32 @@ def search_outcode(
channel: str,
pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]],
) -> list[dict]:
base_search_url: str | None = None,
) -> tuple[list[dict], str | None]:
"""Search Zoopla for properties in one outcode.
Takes a live Camoufox Page (from launch_browser). Navigates through the
search flow, extracts listings from rendered DOM, and transforms to the
standard output schema.
If base_search_url is provided (from a previous channel search for the same
outcode), tries direct URL navigation first — skipping the slow homepage
search flow. Falls back to full navigation if direct fails.
Returns (properties, search_url) where search_url can be passed to the next
channel call for this outcode.
Raises TurnstileError if Cloudflare blocks us mid-session.
"""
if not _navigate_search(page, outcode, channel):
return []
navigated = False
if base_search_url:
navigated = _navigate_direct(page, base_search_url)
if navigated:
log.debug("Zoopla %s %s: used direct URL navigation", outcode, channel)
if not navigated:
if not _navigate_search(page, outcode, channel):
return [], None
total_results = _get_result_count(page)
@ -632,7 +727,7 @@ def search_outcode(
"DOM selectors may need updating",
outcode, channel, total_results,
)
return []
return [], None
channel_label = "buy" if channel == "BUY" else "rent"
properties = []
@ -646,10 +741,13 @@ def search_outcode(
dropped += 1
if dropped and not properties:
# Log a sample raw listing to diagnose which fields are missing
sample = raw_listings[0] if raw_listings else {}
log.debug(
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
"(no price/postcode/coords)",
"(no price/postcode/coords). Sample raw: price=%s address=%r",
outcode, channel, len(raw_listings), dropped,
sample.get("price"), sample.get("address", ""),
)
elif dropped > len(raw_listings) // 2:
log.debug(
@ -657,4 +755,4 @@ def search_outcode(
outcode, channel, dropped, len(raw_listings),
)
return properties
return properties, page.url