checkpoint finder
This commit is contained in:
parent
8616837c01
commit
96dfdd7491
5 changed files with 130 additions and 20 deletions
124
finder/zoopla.py
124
finder/zoopla.py
|
|
@ -263,6 +263,46 @@ def _ensure_not_challenged(page) -> None:
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _navigate_direct(page, url: str) -> bool:
|
||||
"""Navigate directly to a Zoopla search URL (skipping the homepage flow).
|
||||
|
||||
Used to load the second channel (e.g., RENT after BUY) for the same outcode
|
||||
by swapping the path component. Falls back gracefully — returns False if
|
||||
the page has no listings, so the caller can retry via the full search flow.
|
||||
"""
|
||||
try:
|
||||
page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||
except Exception as e:
|
||||
log.debug("Direct navigation failed: %s", e)
|
||||
return False
|
||||
_ensure_not_challenged(page)
|
||||
|
||||
# Wait for listing content to hydrate
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"""() => {
|
||||
const cards = document.querySelectorAll(
|
||||
'[data-testid="regular-listings"] > div'
|
||||
);
|
||||
if (cards.length === 0) return false;
|
||||
for (const card of cards) {
|
||||
const t = card.innerText || '';
|
||||
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||
}
|
||||
return false;
|
||||
}""",
|
||||
timeout=8000,
|
||||
)
|
||||
except Exception:
|
||||
# Check if the page has any listings at all
|
||||
has_listings = page.query_selector('a[href*="/details/"]')
|
||||
if not has_listings:
|
||||
return False
|
||||
time.sleep(1.5)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _navigate_search(page, outcode: str, channel: str) -> bool:
|
||||
"""Navigate to search results for an outcode via the homepage search flow.
|
||||
|
||||
|
|
@ -270,12 +310,12 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
|||
Raises TurnstileError if Cloudflare blocks us."""
|
||||
# Navigate to homepage to reset search state
|
||||
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
|
||||
time.sleep(2)
|
||||
time.sleep(0.5)
|
||||
_ensure_not_challenged(page)
|
||||
|
||||
# Dismiss cookie consent (may reappear after navigation)
|
||||
page.evaluate(_DISMISS_COOKIES_JS)
|
||||
time.sleep(1)
|
||||
time.sleep(0.3)
|
||||
|
||||
# Select Buy/Rent tab
|
||||
if channel == "RENT":
|
||||
|
|
@ -284,7 +324,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
|||
)
|
||||
if rent_tab:
|
||||
rent_tab.click()
|
||||
time.sleep(0.5)
|
||||
time.sleep(0.2)
|
||||
|
||||
# Find and fill search input
|
||||
search_input = page.query_selector(
|
||||
|
|
@ -295,10 +335,10 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
|||
return False
|
||||
|
||||
search_input.click()
|
||||
time.sleep(0.3)
|
||||
time.sleep(0.1)
|
||||
search_input.fill("")
|
||||
search_input.type(outcode, delay=60)
|
||||
time.sleep(2)
|
||||
time.sleep(1.2)
|
||||
|
||||
# Select first autocomplete suggestion
|
||||
first_option = page.query_selector('[role="option"]')
|
||||
|
|
@ -307,7 +347,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
|||
return False
|
||||
|
||||
first_option.click()
|
||||
time.sleep(0.5)
|
||||
time.sleep(0.2)
|
||||
|
||||
# Click search button
|
||||
search_btn = page.query_selector('button:has-text("Search")')
|
||||
|
|
@ -326,6 +366,29 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
|||
time.sleep(4)
|
||||
_ensure_not_challenged(page)
|
||||
|
||||
# Wait for client-side hydration to populate listing content (prices, addresses).
|
||||
# The structural container appears in server-rendered HTML before React hydrates
|
||||
# the actual card content — extracting too early yields empty price/address fields.
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"""() => {
|
||||
const cards = document.querySelectorAll(
|
||||
'[data-testid="regular-listings"] > div'
|
||||
);
|
||||
if (cards.length === 0) return false;
|
||||
for (const card of cards) {
|
||||
const t = card.innerText || '';
|
||||
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||
}
|
||||
return false;
|
||||
}""",
|
||||
timeout=8000,
|
||||
)
|
||||
except Exception:
|
||||
# Content never appeared — extraction will likely fail but let it try
|
||||
log.debug("Listing content hydration wait timed out — prices may not have rendered")
|
||||
time.sleep(2)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
|
|
@ -437,8 +500,25 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
|
|||
|
||||
try:
|
||||
page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
|
||||
time.sleep(4)
|
||||
_ensure_not_challenged(page)
|
||||
# Wait for listing content instead of fixed sleep
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"""() => {
|
||||
const cards = document.querySelectorAll(
|
||||
'[data-testid="regular-listings"] > div'
|
||||
);
|
||||
if (cards.length === 0) return false;
|
||||
for (const card of cards) {
|
||||
const t = card.innerText || '';
|
||||
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||
}
|
||||
return false;
|
||||
}""",
|
||||
timeout=8000,
|
||||
)
|
||||
except Exception:
|
||||
time.sleep(1.5)
|
||||
except TurnstileError:
|
||||
raise
|
||||
except Exception as e:
|
||||
|
|
@ -608,17 +688,32 @@ def search_outcode(
|
|||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
pc_coords: dict[str, tuple[float, float]],
|
||||
) -> list[dict]:
|
||||
base_search_url: str | None = None,
|
||||
) -> tuple[list[dict], str | None]:
|
||||
"""Search Zoopla for properties in one outcode.
|
||||
|
||||
Takes a live Camoufox Page (from launch_browser). Navigates through the
|
||||
search flow, extracts listings from rendered DOM, and transforms to the
|
||||
standard output schema.
|
||||
|
||||
If base_search_url is provided (from a previous channel search for the same
|
||||
outcode), tries direct URL navigation first — skipping the slow homepage
|
||||
search flow. Falls back to full navigation if direct fails.
|
||||
|
||||
Returns (properties, search_url) where search_url can be passed to the next
|
||||
channel call for this outcode.
|
||||
|
||||
Raises TurnstileError if Cloudflare blocks us mid-session.
|
||||
"""
|
||||
if not _navigate_search(page, outcode, channel):
|
||||
return []
|
||||
navigated = False
|
||||
if base_search_url:
|
||||
navigated = _navigate_direct(page, base_search_url)
|
||||
if navigated:
|
||||
log.debug("Zoopla %s %s: used direct URL navigation", outcode, channel)
|
||||
|
||||
if not navigated:
|
||||
if not _navigate_search(page, outcode, channel):
|
||||
return [], None
|
||||
|
||||
total_results = _get_result_count(page)
|
||||
|
||||
|
|
@ -632,7 +727,7 @@ def search_outcode(
|
|||
"DOM selectors may need updating",
|
||||
outcode, channel, total_results,
|
||||
)
|
||||
return []
|
||||
return [], None
|
||||
|
||||
channel_label = "buy" if channel == "BUY" else "rent"
|
||||
properties = []
|
||||
|
|
@ -646,10 +741,13 @@ def search_outcode(
|
|||
dropped += 1
|
||||
|
||||
if dropped and not properties:
|
||||
# Log a sample raw listing to diagnose which fields are missing
|
||||
sample = raw_listings[0] if raw_listings else {}
|
||||
log.debug(
|
||||
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
|
||||
"(no price/postcode/coords)",
|
||||
"(no price/postcode/coords). Sample raw: price=%s address=%r",
|
||||
outcode, channel, len(raw_listings), dropped,
|
||||
sample.get("price"), sample.get("address", ""),
|
||||
)
|
||||
elif dropped > len(raw_listings) // 2:
|
||||
log.debug(
|
||||
|
|
@ -657,4 +755,4 @@ def search_outcode(
|
|||
outcode, channel, dropped, len(raw_listings),
|
||||
)
|
||||
|
||||
return properties
|
||||
return properties, page.url
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue