diff --git a/finder/constants.py b/finder/constants.py index 2985486..09f9fd8 100644 --- a/finder/constants.py +++ b/finder/constants.py @@ -4,8 +4,8 @@ from pathlib import Path ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet") DATA_DIR = Path("/app/data") PAGE_SIZE = 24 -DELAY_BETWEEN_PAGES = 0.5 -DELAY_BETWEEN_OUTCODES = 1.0 +DELAY_BETWEEN_PAGES = 0.3 +DELAY_BETWEEN_OUTCODES = 0.5 MAX_RETRIES = 3 RETRY_BASE_DELAY = 2.0 GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index diff --git a/finder/openrent.py b/finder/openrent.py index c96dd44..1737745 100644 --- a/finder/openrent.py +++ b/finder/openrent.py @@ -351,7 +351,7 @@ def parse_search_results(html: str) -> list[dict]:
1 Bed Flat, Location, SW1Y
""" - soup = BeautifulSoup(html, "html.parser") + soup = BeautifulSoup(html, "lxml") properties = [] # Property cards: @@ -486,7 +486,7 @@ def parse_property_detail(html: str) -> dict: - Tables have "Rent PCM", "Deposit", "Bills Included", etc. (NOT bedrooms) - Description in elements with class containing "description" """ - soup = BeautifulSoup(html, "html.parser") + soup = BeautifulSoup(html, "lxml") details: dict = {} # --- Title from h1 --- @@ -810,7 +810,7 @@ def search_outcode( if detail_html: detail_data = parse_property_detail(detail_html) # Shorter delay for detail pages (within same outcode) - time.sleep(DELAY_BETWEEN_PAGES * 0.5) + time.sleep(0.15) transformed = transform_property( search_data, diff --git a/finder/pyproject.toml b/finder/pyproject.toml index 05379b6..64be8ed 100644 --- a/finder/pyproject.toml +++ b/finder/pyproject.toml @@ -10,6 +10,7 @@ dependencies = [ "fake-useragent>=2.2.0", "prometheus-client", "beautifulsoup4", + "lxml", "playwright>=1.58.0", "playwright-stealth>=2.0.2", "camoufox>=0.4.11", diff --git a/finder/scraper.py b/finder/scraper.py index 88c3dd2..1d2ccf4 100644 --- a/finder/scraper.py +++ b/finder/scraper.py @@ -472,12 +472,23 @@ def run_scrape( try: for i, outcode in enumerate(shuffled): + search_url = None for ch_cfg in CHANNELS: ch = ch_cfg["channel"] + # Build direct URL for second channel by swapping path + direct_url = None + if search_url: + if ch == "BUY": + direct_url = search_url.replace("/to-rent/", "/for-sale/") + else: + direct_url = search_url.replace("/for-sale/", "/to-rent/") try: - props = zoopla_search_outcode( - page, outcode, ch, pc_index, pc_coords + props, result_url = zoopla_search_outcode( + page, outcode, ch, pc_index, pc_coords, + base_search_url=direct_url, ) + if result_url: + search_url = result_url zp_results[ch].extend(props) if props: log.info("Zoopla %s: +%d properties", outcode, len(props)) diff --git a/finder/zoopla.py b/finder/zoopla.py index 052794f..dbb59cc 100644 --- a/finder/zoopla.py +++ b/finder/zoopla.py @@ -263,6 +263,46 @@ def _ensure_not_challenged(page) -> None: # --------------------------------------------------------------------------- +def _navigate_direct(page, url: str) -> bool: + """Navigate directly to a Zoopla search URL (skipping the homepage flow). + + Used to load the second channel (e.g., RENT after BUY) for the same outcode + by swapping the path component. Falls back gracefully — returns False if + the page has no listings, so the caller can retry via the full search flow. + """ + try: + page.goto(url, wait_until="domcontentloaded", timeout=30000) + except Exception as e: + log.debug("Direct navigation failed: %s", e) + return False + _ensure_not_challenged(page) + + # Wait for listing content to hydrate + try: + page.wait_for_function( + """() => { + const cards = document.querySelectorAll( + '[data-testid="regular-listings"] > div' + ); + if (cards.length === 0) return false; + for (const card of cards) { + const t = card.innerText || ''; + if (t.includes('\\u00a3') && t.length > 50) return true; + } + return false; + }""", + timeout=8000, + ) + except Exception: + # Check if the page has any listings at all + has_listings = page.query_selector('a[href*="/details/"]') + if not has_listings: + return False + time.sleep(1.5) + + return True + + def _navigate_search(page, outcode: str, channel: str) -> bool: """Navigate to search results for an outcode via the homepage search flow. @@ -270,12 +310,12 @@ def _navigate_search(page, outcode: str, channel: str) -> bool: Raises TurnstileError if Cloudflare blocks us.""" # Navigate to homepage to reset search state page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000) - time.sleep(2) + time.sleep(0.5) _ensure_not_challenged(page) # Dismiss cookie consent (may reappear after navigation) page.evaluate(_DISMISS_COOKIES_JS) - time.sleep(1) + time.sleep(0.3) # Select Buy/Rent tab if channel == "RENT": @@ -284,7 +324,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool: ) if rent_tab: rent_tab.click() - time.sleep(0.5) + time.sleep(0.2) # Find and fill search input search_input = page.query_selector( @@ -295,10 +335,10 @@ def _navigate_search(page, outcode: str, channel: str) -> bool: return False search_input.click() - time.sleep(0.3) + time.sleep(0.1) search_input.fill("") search_input.type(outcode, delay=60) - time.sleep(2) + time.sleep(1.2) # Select first autocomplete suggestion first_option = page.query_selector('[role="option"]') @@ -307,7 +347,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool: return False first_option.click() - time.sleep(0.5) + time.sleep(0.2) # Click search button search_btn = page.query_selector('button:has-text("Search")') @@ -326,6 +366,29 @@ def _navigate_search(page, outcode: str, channel: str) -> bool: time.sleep(4) _ensure_not_challenged(page) + # Wait for client-side hydration to populate listing content (prices, addresses). + # The structural container appears in server-rendered HTML before React hydrates + # the actual card content — extracting too early yields empty price/address fields. + try: + page.wait_for_function( + """() => { + const cards = document.querySelectorAll( + '[data-testid="regular-listings"] > div' + ); + if (cards.length === 0) return false; + for (const card of cards) { + const t = card.innerText || ''; + if (t.includes('\\u00a3') && t.length > 50) return true; + } + return false; + }""", + timeout=8000, + ) + except Exception: + # Content never appeared — extraction will likely fail but let it try + log.debug("Listing content hydration wait timed out — prices may not have rendered") + time.sleep(2) + return True @@ -437,8 +500,25 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]: try: page.goto(next_url, wait_until="domcontentloaded", timeout=30000) - time.sleep(4) _ensure_not_challenged(page) + # Wait for listing content instead of fixed sleep + try: + page.wait_for_function( + """() => { + const cards = document.querySelectorAll( + '[data-testid="regular-listings"] > div' + ); + if (cards.length === 0) return false; + for (const card of cards) { + const t = card.innerText || ''; + if (t.includes('\\u00a3') && t.length > 50) return true; + } + return false; + }""", + timeout=8000, + ) + except Exception: + time.sleep(1.5) except TurnstileError: raise except Exception as e: @@ -608,17 +688,32 @@ def search_outcode( channel: str, pc_index: PostcodeSpatialIndex, pc_coords: dict[str, tuple[float, float]], -) -> list[dict]: + base_search_url: str | None = None, +) -> tuple[list[dict], str | None]: """Search Zoopla for properties in one outcode. Takes a live Camoufox Page (from launch_browser). Navigates through the search flow, extracts listings from rendered DOM, and transforms to the standard output schema. + If base_search_url is provided (from a previous channel search for the same + outcode), tries direct URL navigation first — skipping the slow homepage + search flow. Falls back to full navigation if direct fails. + + Returns (properties, search_url) where search_url can be passed to the next + channel call for this outcode. + Raises TurnstileError if Cloudflare blocks us mid-session. """ - if not _navigate_search(page, outcode, channel): - return [] + navigated = False + if base_search_url: + navigated = _navigate_direct(page, base_search_url) + if navigated: + log.debug("Zoopla %s %s: used direct URL navigation", outcode, channel) + + if not navigated: + if not _navigate_search(page, outcode, channel): + return [], None total_results = _get_result_count(page) @@ -632,7 +727,7 @@ def search_outcode( "DOM selectors may need updating", outcode, channel, total_results, ) - return [] + return [], None channel_label = "buy" if channel == "BUY" else "rent" properties = [] @@ -646,10 +741,13 @@ def search_outcode( dropped += 1 if dropped and not properties: + # Log a sample raw listing to diagnose which fields are missing + sample = raw_listings[0] if raw_listings else {} log.debug( "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform " - "(no price/postcode/coords)", + "(no price/postcode/coords). Sample raw: price=%s address=%r", outcode, channel, len(raw_listings), dropped, + sample.get("price"), sample.get("address", ""), ) elif dropped > len(raw_listings) // 2: log.debug( @@ -657,4 +755,4 @@ def search_outcode( outcode, channel, dropped, len(raw_listings), ) - return properties + return properties, page.url