diff --git a/finder/constants.py b/finder/constants.py
index 2985486..09f9fd8 100644
--- a/finder/constants.py
+++ b/finder/constants.py
@@ -4,8 +4,8 @@ from pathlib import Path
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
DATA_DIR = Path("/app/data")
PAGE_SIZE = 24
-DELAY_BETWEEN_PAGES = 0.5
-DELAY_BETWEEN_OUTCODES = 1.0
+DELAY_BETWEEN_PAGES = 0.3
+DELAY_BETWEEN_OUTCODES = 0.5
MAX_RETRIES = 3
RETRY_BASE_DELAY = 2.0
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
diff --git a/finder/openrent.py b/finder/openrent.py
index c96dd44..1737745 100644
--- a/finder/openrent.py
+++ b/finder/openrent.py
@@ -351,7 +351,7 @@ def parse_search_results(html: str) -> list[dict]:
1 Bed Flat, Location, SW1Y
...- 1 Bed
- 1 Bath
- Furnished
...
"""
- soup = BeautifulSoup(html, "html.parser")
+ soup = BeautifulSoup(html, "lxml")
properties = []
# Property cards:
@@ -486,7 +486,7 @@ def parse_property_detail(html: str) -> dict:
- Tables have "Rent PCM", "Deposit", "Bills Included", etc. (NOT bedrooms)
- Description in elements with class containing "description"
"""
- soup = BeautifulSoup(html, "html.parser")
+ soup = BeautifulSoup(html, "lxml")
details: dict = {}
# --- Title from h1 ---
@@ -810,7 +810,7 @@ def search_outcode(
if detail_html:
detail_data = parse_property_detail(detail_html)
# Shorter delay for detail pages (within same outcode)
- time.sleep(DELAY_BETWEEN_PAGES * 0.5)
+ time.sleep(0.15)
transformed = transform_property(
search_data,
diff --git a/finder/pyproject.toml b/finder/pyproject.toml
index 05379b6..64be8ed 100644
--- a/finder/pyproject.toml
+++ b/finder/pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
"fake-useragent>=2.2.0",
"prometheus-client",
"beautifulsoup4",
+ "lxml",
"playwright>=1.58.0",
"playwright-stealth>=2.0.2",
"camoufox>=0.4.11",
diff --git a/finder/scraper.py b/finder/scraper.py
index 88c3dd2..1d2ccf4 100644
--- a/finder/scraper.py
+++ b/finder/scraper.py
@@ -472,12 +472,23 @@ def run_scrape(
try:
for i, outcode in enumerate(shuffled):
+ search_url = None
for ch_cfg in CHANNELS:
ch = ch_cfg["channel"]
+ # Build direct URL for second channel by swapping path
+ direct_url = None
+ if search_url:
+ if ch == "BUY":
+ direct_url = search_url.replace("/to-rent/", "/for-sale/")
+ else:
+ direct_url = search_url.replace("/for-sale/", "/to-rent/")
try:
- props = zoopla_search_outcode(
- page, outcode, ch, pc_index, pc_coords
+ props, result_url = zoopla_search_outcode(
+ page, outcode, ch, pc_index, pc_coords,
+ base_search_url=direct_url,
)
+ if result_url:
+ search_url = result_url
zp_results[ch].extend(props)
if props:
log.info("Zoopla %s: +%d properties", outcode, len(props))
diff --git a/finder/zoopla.py b/finder/zoopla.py
index 052794f..dbb59cc 100644
--- a/finder/zoopla.py
+++ b/finder/zoopla.py
@@ -263,6 +263,46 @@ def _ensure_not_challenged(page) -> None:
# ---------------------------------------------------------------------------
+def _navigate_direct(page, url: str) -> bool:
+    """Navigate directly to a Zoopla search URL (skipping the homepage flow).
+
+    Used to load the second channel (e.g., RENT after BUY) for the same outcode
+    by swapping the /for-sale/ <-> /to-rent/ path component of the first
+    channel's results URL.
+
+    Args:
+        page: Live browser page used for the navigation.
+        url: Search-results URL to open.
+
+    Returns:
+        True when the page loaded and listing content is (or is likely to be)
+        present. False when navigation failed, the final URL no longer carries
+        the requested channel segment, or no listing links were found — the
+        caller is expected to retry via the full search flow.
+
+    Raises:
+        Whatever _ensure_not_challenged raises (TurnstileError on a Cloudflare
+        challenge, per the sibling navigation helpers) — it is deliberately
+        not caught here.
+
+    NOTE(review): the URL handed in is derived from the previous search's
+    final page.url; if that search paginated, it may point at a later results
+    page rather than page 1 — confirm the caller strips pagination.
+    """
+    try:
+        page.goto(url, wait_until="domcontentloaded", timeout=30000)
+    except Exception as e:
+        log.debug("Direct navigation failed: %s", e)
+        return False
+    _ensure_not_challenged(page)
+
+    # If the site redirected us away from the requested channel path, anything
+    # scraped from this page would be attributed to the wrong channel. Treat it
+    # as a failed direct navigation so the caller uses the full search flow.
+    for segment in ("/for-sale/", "/to-rent/"):
+        if segment in url and segment not in page.url:
+            log.debug(
+                "Direct navigation to %s was redirected to %s", url, page.url
+            )
+            return False
+
+    # Wait for listing content to hydrate: at least one card must contain a
+    # pound sign and a non-trivial amount of text.
+    try:
+        page.wait_for_function(
+            """() => {
+                const cards = document.querySelectorAll(
+                    '[data-testid="regular-listings"] > div'
+                );
+                if (cards.length === 0) return false;
+                for (const card of cards) {
+                    const t = card.innerText || '';
+                    if (t.includes('\\u00a3') && t.length > 50) return true;
+                }
+                return false;
+            }""",
+            timeout=8000,
+        )
+    except Exception as e:
+        # Log instead of swallowing silently, matching the hydration wait in
+        # the full search flow.
+        log.debug("Listing content hydration wait failed after direct navigation: %s", e)
+        # Check if the page has any listings at all
+        has_listings = page.query_selector('a[href*="/details/"]')
+        if not has_listings:
+            return False
+        # Listing links exist but never hydrated in time; give them a moment.
+        time.sleep(1.5)
+
+    return True
+
+
def _navigate_search(page, outcode: str, channel: str) -> bool:
"""Navigate to search results for an outcode via the homepage search flow.
@@ -270,12 +310,12 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
Raises TurnstileError if Cloudflare blocks us."""
# Navigate to homepage to reset search state
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
- time.sleep(2)
+ time.sleep(0.5)
_ensure_not_challenged(page)
# Dismiss cookie consent (may reappear after navigation)
page.evaluate(_DISMISS_COOKIES_JS)
- time.sleep(1)
+ time.sleep(0.3)
# Select Buy/Rent tab
if channel == "RENT":
@@ -284,7 +324,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
)
if rent_tab:
rent_tab.click()
- time.sleep(0.5)
+ time.sleep(0.2)
# Find and fill search input
search_input = page.query_selector(
@@ -295,10 +335,10 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
return False
search_input.click()
- time.sleep(0.3)
+ time.sleep(0.1)
search_input.fill("")
search_input.type(outcode, delay=60)
- time.sleep(2)
+ time.sleep(1.2)
# Select first autocomplete suggestion
first_option = page.query_selector('[role="option"]')
@@ -307,7 +347,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
return False
first_option.click()
- time.sleep(0.5)
+ time.sleep(0.2)
# Click search button
search_btn = page.query_selector('button:has-text("Search")')
@@ -326,6 +366,29 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
time.sleep(4)
_ensure_not_challenged(page)
+ # Wait for client-side hydration to populate listing content (prices, addresses).
+ # The structural container appears in server-rendered HTML before React hydrates
+ # the actual card content — extracting too early yields empty price/address fields.
+ try:
+ page.wait_for_function(
+ """() => {
+ const cards = document.querySelectorAll(
+ '[data-testid="regular-listings"] > div'
+ );
+ if (cards.length === 0) return false;
+ for (const card of cards) {
+ const t = card.innerText || '';
+ if (t.includes('\\u00a3') && t.length > 50) return true;
+ }
+ return false;
+ }""",
+ timeout=8000,
+ )
+ except Exception:
+ # Content never appeared — extraction will likely fail but let it try
+ log.debug("Listing content hydration wait timed out — prices may not have rendered")
+ time.sleep(2)
+
return True
@@ -437,8 +500,25 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
try:
page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
- time.sleep(4)
_ensure_not_challenged(page)
+ # Wait for listing content instead of fixed sleep
+ try:
+ page.wait_for_function(
+ """() => {
+ const cards = document.querySelectorAll(
+ '[data-testid="regular-listings"] > div'
+ );
+ if (cards.length === 0) return false;
+ for (const card of cards) {
+ const t = card.innerText || '';
+ if (t.includes('\\u00a3') && t.length > 50) return true;
+ }
+ return false;
+ }""",
+ timeout=8000,
+ )
+ except Exception:
+ time.sleep(1.5)
except TurnstileError:
raise
except Exception as e:
@@ -608,17 +688,32 @@ def search_outcode(
channel: str,
pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]],
-) -> list[dict]:
+ base_search_url: str | None = None,
+) -> tuple[list[dict], str | None]:
"""Search Zoopla for properties in one outcode.
Takes a live Camoufox Page (from launch_browser). Navigates through the
search flow, extracts listings from rendered DOM, and transforms to the
standard output schema.
+ If base_search_url is provided (from a previous channel search for the same
+ outcode), tries direct URL navigation first — skipping the slow homepage
+ search flow. Falls back to full navigation if direct fails.
+
+ Returns (properties, search_url) where search_url can be passed to the next
+ channel call for this outcode.
+
Raises TurnstileError if Cloudflare blocks us mid-session.
"""
- if not _navigate_search(page, outcode, channel):
- return []
+ navigated = False
+ if base_search_url:
+ navigated = _navigate_direct(page, base_search_url)
+ if navigated:
+ log.debug("Zoopla %s %s: used direct URL navigation", outcode, channel)
+
+ if not navigated:
+ if not _navigate_search(page, outcode, channel):
+ return [], None
total_results = _get_result_count(page)
@@ -632,7 +727,7 @@ def search_outcode(
"DOM selectors may need updating",
outcode, channel, total_results,
)
- return []
+ return [], None
channel_label = "buy" if channel == "BUY" else "rent"
properties = []
@@ -646,10 +741,13 @@ def search_outcode(
dropped += 1
if dropped and not properties:
+ # Log a sample raw listing to diagnose which fields are missing
+ sample = raw_listings[0] if raw_listings else {}
log.debug(
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
- "(no price/postcode/coords)",
+ "(no price/postcode/coords). Sample raw: price=%s address=%r",
outcode, channel, len(raw_listings), dropped,
+ sample.get("price"), sample.get("address", ""),
)
elif dropped > len(raw_listings) // 2:
log.debug(
@@ -657,4 +755,4 @@ def search_outcode(
outcode, channel, dropped, len(raw_listings),
)
- return properties
+ return properties, page.url