From 6b12e21d50a78ccf57ff4108c7379dd9a2deb3a5 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Wed, 18 Mar 2026 22:46:08 +0000 Subject: [PATCH] More fixes --- Dockerfile | 4 +- finder/Dockerfile | 7 +- finder/constants.py | 8 + finder/main.py | 24 +- finder/metrics.py | 27 + finder/scraper.py | 108 +++- finder/storage.py | 6 +- finder/zoopla.py | 520 +++++++++++++++++ frontend/src/App.tsx | 1 + .../src/components/account/AccountPage.tsx | 65 +-- frontend/src/components/invite/InvitePage.tsx | 10 +- .../src/components/map/FeatureBrowser.tsx | 23 +- frontend/src/components/map/Filters.tsx | 175 ++++-- frontend/src/components/map/HoverCard.tsx | 2 +- .../components/map/JourneyInstructions.tsx | 22 +- frontend/src/components/map/MapPage.tsx | 27 +- frontend/src/components/map/POIPane.tsx | 2 +- .../src/components/map/TravelTimeCard.tsx | 31 +- frontend/src/components/ui/AuthModal.tsx | 2 +- frontend/src/components/ui/Header.tsx | 4 +- frontend/src/components/ui/InfoPopup.tsx | 2 +- .../src/components/ui/TravelTimeInfoPopup.tsx | 27 + frontend/src/components/ui/UserMenu.tsx | 13 +- frontend/src/hooks/useAiFilters.ts | 18 +- frontend/src/hooks/useDeckLayers.ts | 2 +- frontend/src/hooks/useTravelTime.ts | 9 + frontend/src/hooks/useTutorial.ts | 4 +- frontend/src/lib/api.ts | 6 + frontend/src/lib/clipboard.ts | 13 +- frontend/src/lib/consts.ts | 6 +- frontend/src/lib/external-search.ts | 108 +++- frontend/src/lib/format.ts | 20 + pipeline/transform/merge.py | 8 +- pipeline/utils/download.py | 2 +- pipeline/utils/fuzzy_join.py | 2 +- screenshot/src/screenshot.ts | 9 +- scripts/zoopla_experiment.py | 526 ++++++++---------- server-rs/src/data/poi.rs | 4 +- server-rs/src/data/postcodes.rs | 1 + server-rs/src/features.rs | 42 +- server-rs/src/main.rs | 2 + server-rs/src/og_middleware.rs | 10 + server-rs/src/parsing/filters.rs | 39 +- server-rs/src/pocketbase.rs | 62 ++- server-rs/src/routes/ai_filters.rs | 162 +++++- server-rs/src/routes/checkout.rs | 14 +- server-rs/src/routes/invites.rs | 38 +- server-rs/src/routes/newsletter.rs | 14 +- server-rs/src/routes/postcodes.rs | 2 +- server-rs/src/routes/pricing.rs | 12 +- server-rs/src/routes/reload.rs | 1 + server-rs/src/routes/shorten.rs | 20 +- server-rs/src/routes/stripe_webhook.rs | 26 +- server-rs/src/state.rs | 3 + 54 files changed, 1665 insertions(+), 630 deletions(-) create mode 100644 finder/zoopla.py create mode 100644 frontend/src/components/ui/TravelTimeInfoPopup.tsx diff --git a/Dockerfile b/Dockerfile index 4f6c9a1..481d385 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Stage 1: Build frontend -FROM node:20-slim AS frontend +FROM node:22-slim AS frontend WORKDIR /app/frontend COPY frontend/package.json frontend/package-lock.json ./ RUN npm ci @@ -7,7 +7,7 @@ COPY frontend/ ./ RUN npm run build:no-prerender # Stage 2: Build Rust server -FROM rust:1.83-bookworm AS server +FROM rust:1.84-bookworm AS server WORKDIR /app COPY server-rs/ server-rs/ WORKDIR /app/server-rs diff --git a/finder/Dockerfile b/finder/Dockerfile index c975550..00c0344 100644 --- a/finder/Dockerfile +++ b/finder/Dockerfile @@ -5,9 +5,14 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv WORKDIR /app COPY pyproject.toml ./ RUN uv pip install --system -r pyproject.toml -RUN playwright install --with-deps chromium +RUN playwright install-deps firefox +RUN camoufox fetch \ + && python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)" COPY *.py ./ COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet +HEALTHCHECK --interval=30s --timeout=5s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:1234/health')" + CMD ["python3", "main.py"] diff --git a/finder/constants.py b/finder/constants.py index 1f863e5..3c75f9d 100644 --- a/finder/constants.py +++ b/finder/constants.py @@ -31,6 +31,11 @@ SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in ( "true", "yes", ) +SCRAPE_ZOOPLA = os.environ.get("SCRAPE_ZOOPLA", "true").lower() in ( + "1", + "true", + "yes", +) # URL to trigger server data reload after scrape (e.g. http://server:8001/api/reload) RELOAD_URL = os.environ.get("RELOAD_URL", "") @@ -47,6 +52,9 @@ HOMECOUK_PER_PAGE = 30 # max supported by the API # OpenRent OPENRENT_BASE = "https://www.openrent.co.uk" +# Zoopla +ZOOPLA_BASE = "https://www.zoopla.co.uk" + PROPERTY_TYPE_MAP = { "Detached": "Detached", "Semi-Detached": "Semi-Detached", diff --git a/finder/main.py b/finder/main.py index 3174d00..b68f824 100644 --- a/finder/main.py +++ b/finder/main.py @@ -14,6 +14,7 @@ from constants import ( SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE, + SCRAPE_ZOOPLA, ) from homecouk import load_cookies as load_homecouk_cookies from openrent import load_cookies as load_openrent_cookies @@ -48,6 +49,16 @@ log.setLevel(logging.DEBUG) logging.getLogger("httpx").setLevel(logging.WARNING) logging.getLogger("httpcore").setLevel(logging.WARNING) + +# Suppress noisy /metrics and /health request logs from werkzeug +class _NoiseFilter(logging.Filter): + def filter(self, record): + msg = record.getMessage() + return "GET /metrics" not in msg and "GET /health" not in msg + + +logging.getLogger("werkzeug").addFilter(_NoiseFilter()) + # --------------------------------------------------------------------------- # Startup: load data # --------------------------------------------------------------------------- @@ -55,13 +66,14 @@ logging.getLogger("httpcore").setLevel(logging.WARNING) log.info("Loading arcgis data...") OUTCODES = load_outcodes() PC_INDEX = build_postcode_index() -PC_COORDS = build_postcode_coords() if SCRAPE_OPENRENT else None +PC_COORDS = build_postcode_coords() if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) else None log.info( - "Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)", + "Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s, zoopla=%s)", len(OUTCODES), SCRAPE_RIGHTMOVE, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, + SCRAPE_ZOOPLA, ) # --------------------------------------------------------------------------- @@ -121,6 +133,11 @@ if SCHEDULE_HOUR >= 0: app = Flask(__name__) +@app.route("/health") +def health(): + return "ok", 200 + + @app.route("/run", methods=["POST"]) def trigger_run(): if _start_scrape(): @@ -147,6 +164,7 @@ def get_status(): "rightmove": status.rm_properties, "homecouk": status.hk_properties, "openrent": status.or_properties, + "zoopla": status.zp_properties, }, "errors": status.errors[-20:], # last 20 errors "elapsed_seconds": round(elapsed, 1), @@ -167,8 +185,10 @@ def get_debug(): "scrape_rightmove": SCRAPE_RIGHTMOVE, "scrape_homecouk": SCRAPE_HOMECOUK, "scrape_openrent": SCRAPE_OPENRENT, + "scrape_zoopla": SCRAPE_ZOOPLA, "homecouk_cookies_available": hk_cookies is not None, "openrent_cookies_available": or_cookies is not None, + "zoopla_note": "browser-based (Camoufox), no cookies needed", } ) diff --git a/finder/metrics.py b/finder/metrics.py index 134cc7f..df8ae26 100644 --- a/finder/metrics.py +++ b/finder/metrics.py @@ -109,6 +109,28 @@ openrent_properties_scraped = Counter( ["channel"], ) +# --------------------------------------------------------------------------- +# Counters — Zoopla +# --------------------------------------------------------------------------- + +zoopla_pages_scraped = Counter( + "zoopla_pages_scraped", + "Search result pages scraped from Zoopla", + ["channel"], +) + +zoopla_errors_total = Counter( + "zoopla_errors_total", + "Zoopla scraping errors", + ["type"], +) + +zoopla_properties_scraped = Counter( + "zoopla_properties_scraped", + "Properties scraped from Zoopla (before dedup)", + ["channel"], +) + # --------------------------------------------------------------------------- # Counters — FlareSolverr / cookie management # --------------------------------------------------------------------------- @@ -138,3 +160,8 @@ openrent_enabled = Gauge( "openrent_enabled", "Whether OpenRent scraping is currently active (1=yes, 0=no)", ) + +zoopla_enabled = Gauge( + "zoopla_enabled", + "Whether Zoopla scraping is currently active (1=yes, 0=no)", +) diff --git a/finder/scraper.py b/finder/scraper.py index 8728008..b5e9d3e 100644 --- a/finder/scraper.py +++ b/finder/scraper.py @@ -17,6 +17,7 @@ from constants import ( SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE, + SCRAPE_ZOOPLA, SEED, ) from homecouk import CookiesExpiredError @@ -35,12 +36,16 @@ from metrics import ( scrape_outcodes_total, scrape_properties_total, scrape_state, + zoopla_enabled, ) from openrent import WafChallengeError from openrent import load_cookies as load_openrent_cookies from openrent import make_client as make_openrent_client from openrent import search_outcode as openrent_search_outcode from rightmove import resolve_outcode_id, search_outcode +from zoopla import TurnstileError +from zoopla import launch_browser as launch_zoopla_browser +from zoopla import search_outcode as zoopla_search_outcode from spatial import PostcodeSpatialIndex from storage import write_parquet @@ -60,6 +65,7 @@ class ScrapeStatus: rm_properties: int = 0 hk_properties: int = 0 or_properties: int = 0 + zp_properties: int = 0 errors: list[str] = field(default_factory=list) started_at: float = 0.0 finished_at: float = 0.0 @@ -93,6 +99,9 @@ def _sync_gauges() -> None: scrape_properties_total.labels(channel=ch, source="openrent").set( status.or_properties ) + scrape_properties_total.labels(channel=ch, source="zoopla").set( + status.zp_properties + ) if status.started_at: end = status.finished_at if status.finished_at else time.time() scrape_elapsed_seconds.set(end - status.started_at) @@ -191,7 +200,7 @@ def run_scrape( random.seed(SEED) random.shuffle(shuffled) - if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT: + if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT and not SCRAPE_ZOOPLA: log.warning("All scrapers disabled — nothing to do") with status_lock: status.state = "done" @@ -239,8 +248,27 @@ def run_scrape( ) openrent_enabled.set(0) - # Build postcode coords if OpenRent is active and caller didn't provide them - if or_client and pc_coords is None: + # Zoopla: uses Camoufox browser (no cookies/client pattern) + zp_browser = None + zp_page = None + zp_failed = False + if not SCRAPE_ZOOPLA: + log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)") + zoopla_enabled.set(0) + else: + try: + zp_browser, zp_page = launch_zoopla_browser() + log.info("Zoopla scraping ENABLED (Camoufox browser launched)") + zoopla_enabled.set(1) + except TurnstileError: + log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla") + zoopla_enabled.set(0) + except Exception as e: + log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e) + zoopla_enabled.set(0) + + # Build postcode coords if OpenRent/Zoopla is active and caller didn't provide them + if (or_client or zp_page) and pc_coords is None: pc_coords = build_postcode_coords() try: @@ -256,6 +284,8 @@ def run_scrape( hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates or_count = 0 # OpenRent properties this channel or_dedup_count = 0 # OpenRent skipped as cross-source duplicates + zp_count = 0 # Zoopla properties this channel + zp_dedup_count = 0 # Zoopla skipped as cross-source duplicates with status_lock: status.channel = channel_name @@ -264,6 +294,7 @@ def run_scrape( status.rm_properties = 0 status.hk_properties = 0 status.or_properties = 0 + status.zp_properties = 0 channel_start = time.time() prev_prop_milestone = 0 # last 10k milestone we logged @@ -412,6 +443,63 @@ def run_scrape( with status_lock: status.errors.append(msg) + # --- Zoopla --- + if zp_page and not zp_failed: + made_requests = True + try: + zp_props = zoopla_search_outcode( + zp_page, + outcode, + channel_name, + pc_index, + pc_coords, + ) + for p in zp_props: + pid = p["id"] + key = _dedup_key(p) + if pid in all_properties or key in seen_dedup_keys: + zp_dedup_count += 1 + cross_source_dedup_total.labels( + channel="buy" if channel_name == "BUY" else "rent", + ).inc() + continue + all_properties[pid] = p + seen_dedup_keys.add(key) + zp_count += 1 + if zp_props: + log.info( + "Zoopla %s: +%d properties", outcode, len(zp_props) + ) + except TurnstileError: + log.warning( + "Zoopla Cloudflare challenge failed — attempting browser relaunch" + ) + try: + zp_browser.close() + except Exception: + pass + try: + zp_browser, zp_page = launch_zoopla_browser() + log.info("Zoopla browser relaunched, continuing") + except Exception: + log.warning( + "Browser relaunch failed, disabling Zoopla for rest of scrape" + ) + zp_page = None + zp_browser = None + zp_failed = True + zoopla_enabled.set(0) + with status_lock: + status.errors.append( + "Zoopla Cloudflare challenge failed and browser relaunch failed" + ) + except Exception as e: + msg = f"Error scraping Zoopla {outcode}/{channel_name}: {e}" + log.error(msg) + scrape_errors_total.labels(source="zoopla").inc() + with status_lock: + status.errors.append(msg) + with status_lock: if channel_name == "BUY": status.properties_buy = len(all_properties) @@ -420,6 +508,7 @@ def run_scrape( status.rm_properties = rm_count status.hk_properties = hk_count status.or_properties = or_count + status.zp_properties = zp_count _sync_gauges() # Log progress every 100 outcodes @@ -444,12 +533,13 @@ def run_scrape( if current_milestone > prev_prop_milestone: prev_prop_milestone = current_milestone log.info( - "%s %dk properties (rm: %d, hk: %d, or: %d) at outcode %d/%d [%s]", + "%s %dk properties (rm: %d, hk: %d, or: %d, zp: %d) at outcode %d/%d [%s]", channel_name, current_milestone * 10, rm_count, hk_count, or_count, + zp_count, done, len(shuffled), _fmt_elapsed(elapsed), @@ -472,13 +562,14 @@ def run_scrape( _sync_gauges() log.info( - "=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===", + "=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, zp: %d, cross-dedup: %d) ===", channel_name, len(deduped), rm_count, hk_count, or_count, - hk_dedup_count + or_dedup_count, + zp_count, + hk_dedup_count + or_dedup_count + zp_dedup_count, ) with status_lock: @@ -525,3 +616,8 @@ def run_scrape( hk_client.close() if or_client: or_client.close() + if zp_browser: + try: + zp_browser.close() + except Exception: + pass diff --git a/finder/storage.py b/finder/storage.py index 1004bee..9854188 100644 --- a/finder/storage.py +++ b/finder/storage.py @@ -25,7 +25,11 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None: if fvd: try: dt = datetime.fromisoformat(fvd.replace("Z", "+00:00")) - listing_dates.append(dt.replace(tzinfo=None)) + # Convert to UTC naive datetime for consistent storage + if dt.tzinfo is not None: + from datetime import timezone + dt = dt.astimezone(timezone.utc).replace(tzinfo=None) + listing_dates.append(dt) except (ValueError, TypeError): listing_dates.append(None) else: diff --git a/finder/zoopla.py b/finder/zoopla.py new file mode 100644 index 0000000..4cddc17 --- /dev/null +++ b/finder/zoopla.py @@ -0,0 +1,520 @@ +"""Zoopla (zoopla.co.uk) scraper — buy and rental properties. + +Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which +blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth +patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably. + +Zoopla uses Next.js App Router with React Server Components (RSC). Search +result data is server-rendered in an RSC stream, not available via +__NEXT_DATA__ or a JSON API. URL-based location slugs return 0 results — +the working flow requires typing into the autocomplete input, selecting a +suggestion, and clicking Search. + +Architecture: + Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps + a single Camoufox browser alive for the entire scrape. For each outcode, it: + 1. Clears and types the outcode into the search input + 2. Selects the first autocomplete suggestion + 3. Clicks Search + 4. Extracts listing data from the rendered DOM + 5. Handles pagination via ?pn=N parameter + + The browser session replaces the cookie/client pattern used by other scrapers. +""" + +import logging +import re +import time + +from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE +from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped +from spatial import PostcodeSpatialIndex + +log = logging.getLogger("zoopla") + + +class TurnstileError(Exception): + """Raised when Cloudflare Turnstile challenge cannot be passed.""" + + +# Maximum search result pages to scrape per outcode (25 listings/page) +MAX_PAGES_PER_OUTCODE = 10 + +# JavaScript to extract listings from the rendered DOM. +# Finds all detail links, walks up to the card container, and parses +# price, beds, baths, floor area, address, and tenure from the card text. +_EXTRACT_LISTINGS_JS = r"""() => { + const links = Array.from(document.querySelectorAll( + 'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]' + )); + + const seen = new Set(); + const results = []; + + for (const link of links) { + const href = link.href; + const match = href.match(/\/details\/(\d+)\//); + if (!match) continue; + + const id = match[1]; + if (seen.has(id)) continue; + seen.add(id); + + // Walk up to the listing card container + let card = link; + for (let j = 0; j < 10; j++) { + card = card.parentElement; + if (!card) break; + const text = card.innerText || ''; + if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) { + break; + } + } + if (!card) continue; + + const text = card.innerText || ''; + const lines = text.split('\n').map(l => l.trim()).filter(Boolean); + + const priceMatch = text.match(/\u00a3([\d,]+)/); + const bedsMatch = text.match(/(\d+)\s*beds?/i); + const bathsMatch = text.match(/(\d+)\s*baths?/i); + const recMatch = text.match(/(\d+)\s*reception/i); + const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i); + + let address = ''; + for (const line of lines) { + if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) || + (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) { + address = line; + break; + } + } + + let tenure = ''; + if (/freehold/i.test(text)) tenure = 'Freehold'; + else if (/leasehold/i.test(text)) tenure = 'Leasehold'; + + results.push({ + id: id, + url: href.replace(window.location.origin, ''), + price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, + beds: bedsMatch ? parseInt(bedsMatch[1]) : null, + baths: bathsMatch ? parseInt(bathsMatch[1]) : null, + receptions: recMatch ? parseInt(recMatch[1]) : null, + floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, + address: address, + tenure: tenure, + }); + } + + return results; +}""" + +# JavaScript to dismiss the Usercentrics cookie consent overlay (shadow DOM). +_DISMISS_COOKIES_JS = """() => { + const aside = document.querySelector('#usercentrics-cmp-ui'); + if (aside && aside.shadowRoot) { + const btns = aside.shadowRoot.querySelectorAll('button'); + for (const btn of btns) { + if (btn.innerText.includes('Accept')) { btn.click(); return true; } + } + } + if (aside) { aside.remove(); return true; } + return false; +}""" + + +# --------------------------------------------------------------------------- +# Browser lifecycle +# --------------------------------------------------------------------------- + + +def launch_browser(): + """Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile, + and dismiss cookie consent. Returns (browser, page) tuple. + + Raises TurnstileError if Cloudflare cannot be passed within 60 seconds. + Caller must close browser when done.""" + from camoufox.pkgman import camoufox_path + + # Verify camoufox is pre-installed — never download at runtime + camoufox_path(download_if_missing=False) + + from camoufox.sync_api import Camoufox + + log.info("Launching Camoufox browser for Zoopla...") + browser = Camoufox(headless=True).__enter__() + page = browser.new_page() + + log.info("Navigating to Zoopla homepage...") + page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000) + + # Wait for Cloudflare Turnstile to resolve. + # Try clicking the Turnstile checkbox if present (helps in some cases). + for i in range(20): + if "Just a moment" not in page.title(): + break + # Attempt to click the Turnstile checkbox in the challenge iframe + for frame in page.frames: + if "challenges.cloudflare.com" in frame.url: + try: + iframe_el = page.query_selector('iframe[src*="challenges.cloudflare"]') + if iframe_el: + box = iframe_el.bounding_box() + if box: + page.mouse.click(box["x"] + 30, box["y"] + box["height"] / 2) + except Exception: + pass + break + time.sleep(3) + else: + page.close() + browser.close() + raise TurnstileError("Cloudflare Turnstile did not resolve after 60s") + + log.info("Cloudflare passed — title: %s", page.title()) + time.sleep(2) + + # Dismiss cookie consent + page.evaluate(_DISMISS_COOKIES_JS) + time.sleep(1) + + return browser, page + + +def _ensure_not_challenged(page) -> None: + """Check if current page is a Cloudflare challenge and wait/raise.""" + if "Just a moment" not in page.title(): + return + + log.warning("Cloudflare challenge detected mid-session, waiting...") + for i in range(20): + time.sleep(3) + if "Just a moment" not in page.title(): + log.info("Cloudflare challenge resolved") + return + + raise TurnstileError("Cloudflare re-challenge did not resolve") + + +# --------------------------------------------------------------------------- +# Search navigation +# --------------------------------------------------------------------------- + + +def _navigate_search(page, outcode: str, channel: str) -> bool: + """Navigate to search results for an outcode via the homepage search flow. + + Returns True if results were found, False if no results or navigation failed. + Raises TurnstileError if Cloudflare blocks us.""" + # Navigate to homepage to reset search state + page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000) + time.sleep(2) + _ensure_not_challenged(page) + + # Dismiss cookie consent (may reappear after navigation) + page.evaluate(_DISMISS_COOKIES_JS) + time.sleep(1) + + # Select Buy/Rent tab + if channel == "RENT": + rent_tab = page.query_selector( + 'button:has-text("Rent"), [role="tab"]:has-text("Rent")' + ) + if rent_tab: + rent_tab.click() + time.sleep(0.5) + + # Find and fill search input + search_input = page.query_selector( + 'input[name="autosuggest-input"]' + ) or page.query_selector('input[type="text"]') + if not search_input: + log.warning("Could not find search input on homepage") + return False + + search_input.click() + time.sleep(0.3) + search_input.fill("") + search_input.type(outcode, delay=60) + time.sleep(2) + + # Select first autocomplete suggestion + first_option = page.query_selector('[role="option"]') + if not first_option: + log.debug("No autocomplete suggestions for outcode %s", outcode) + return False + + first_option.click() + time.sleep(0.5) + + # Click search button + search_btn = page.query_selector('button:has-text("Search")') + if search_btn: + search_btn.click() + else: + search_input.press("Enter") + + # Wait for results to load + time.sleep(6) + _ensure_not_challenged(page) + + return True + + +def _get_result_count(page) -> int: + """Extract the total results count from the page body text.""" + try: + body = page.inner_text("body") + match = re.search(r"([\d,]+)\s+results?", body) + if match: + return int(match.group(1).replace(",", "")) + except Exception: + pass + return 0 + + +# --------------------------------------------------------------------------- +# Extraction and pagination +# --------------------------------------------------------------------------- + + +def _extract_listings(page) -> list[dict]: + """Extract listing data from the current search results page DOM.""" + try: + return page.evaluate(_EXTRACT_LISTINGS_JS) + except Exception as e: + log.warning("Failed to extract listings from DOM: %s", e) + zoopla_errors_total.labels(type="extract_failed").inc() + return [] + + +def _paginate(page, total_results: int, channel: str) -> list[dict]: + """Extract listings from all pages of search results. + + Page 1 is already loaded. For subsequent pages, clicks the Next button + or navigates via URL parameter ?pn=N.""" + all_listings = _extract_listings(page) + channel_label = "buy" if channel == "BUY" else "rent" + zoopla_pages_scraped.labels(channel=channel_label).inc() + + if not all_listings or total_results <= len(all_listings): + return all_listings + + seen_ids = {l["id"] for l in all_listings} + current_url = page.url + page_num = 2 + + while len(all_listings) < total_results and page_num <= MAX_PAGES_PER_OUTCODE: + time.sleep(DELAY_BETWEEN_PAGES) + + # Try navigating via URL parameter + if "?" in current_url: + next_url = re.sub(r"[?&]pn=\d+", "", current_url) + separator = "&" if "?" in next_url else "?" + next_url = f"{next_url}{separator}pn={page_num}" + else: + next_url = f"{current_url}?pn={page_num}" + + try: + page.goto(next_url, wait_until="domcontentloaded", timeout=30000) + time.sleep(4) + _ensure_not_challenged(page) + except TurnstileError: + raise + except Exception as e: + log.debug("Pagination navigation failed at page %d: %s", page_num, e) + break + + page_listings = _extract_listings(page) + if not page_listings: + break + + # Deduplicate within this outcode + new_count = 0 + for listing in page_listings: + if listing["id"] not in seen_ids: + seen_ids.add(listing["id"]) + all_listings.append(listing) + new_count += 1 + + zoopla_pages_scraped.labels(channel=channel_label).inc() + + if new_count == 0: + break # No new listings on this page + + page_num += 1 + + return all_listings + + +# --------------------------------------------------------------------------- +# Property transformation +# --------------------------------------------------------------------------- + + +def _extract_postcode(text: str) -> str | None: + """Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'.""" + match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE) + if match: + return match.group(1).upper().strip() + return None + + +def _extract_outcode(text: str) -> str | None: + """Extract a UK outcode from address text like 'Whitechapel Road, London E1'.""" + # Look for outcode at end of string or after last comma + match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*$", text.strip(), re.IGNORECASE) + if match: + return match.group(1).upper() + # Try after comma + parts = text.split(",") + if len(parts) > 1: + last = parts[-1].strip() + match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last, re.IGNORECASE) + if match: + return match.group(1).upper() + return None + + +def _map_property_type(raw_type: str | None) -> str: + """Map Zoopla property type text to canonical type.""" + if not raw_type: + return "Other" + canonical = PROPERTY_TYPE_MAP.get(raw_type) + if canonical: + return canonical + lower = raw_type.lower() + if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower: + return "Flats/Maisonettes" + if "detached" in lower and "semi" not in lower: + return "Detached" + if "semi" in lower: + return "Semi-Detached" + if "terrace" in lower or "mews" in lower: + return "Terraced" + if "house" in lower: + return "Detached" + return "Other" + + +def transform_property( + raw: dict, + channel: str, + pc_index: PostcodeSpatialIndex, + pc_coords: dict[str, tuple[float, float]], +) -> dict | None: + """Transform a raw Zoopla listing dict into the standard output schema. + + Zoopla search cards do not include coordinates, so we resolve lat/lng + from postcodes extracted from the address text.""" + price = raw.get("price") + if not price: + return None + + address = raw.get("address", "") + + # Resolve postcode and coordinates from address + postcode = _extract_postcode(address) + lat = lng = None + + if postcode: + coords = pc_coords.get(postcode) + if coords: + lat, lng = coords + + if lat is None: + # Try outcode-level fallback + outcode = _extract_outcode(address) + if outcode: + prefix = outcode + " " + for pcd, coords in pc_coords.items(): + if pcd.startswith(prefix): + postcode = pcd + lat, lng = coords + break + + if lat is None or lng is None or not postcode: + return None + + # Validate coordinates are in England + if not (49 <= lat <= 56 and -7 <= lng <= 2): + return None + + bedrooms = raw.get("beds") or 0 + bathrooms = raw.get("baths") or 0 + receptions = raw.get("receptions") or 0 + + # Floor area: convert sq ft to sq m + floor_area_sqm = None + sqft = raw.get("floor_area_sqft") + if sqft: + floor_area_sqm = round(sqft * 0.092903, 1) + + listing_id = raw.get("id", "") + listing_url = raw.get("url", "") + if listing_url and not listing_url.startswith("http"): + listing_url = ZOOPLA_BASE + listing_url + + return { + "id": f"zp_{listing_id}", + "Bedrooms": bedrooms, + "Bathrooms": bathrooms, + "Number of bedrooms & living rooms": bedrooms + receptions, + "lon": lng, + "lat": lat, + "Postcode": postcode, + "Address per Property Register": address, + "Leasehold/Freehold": raw.get("tenure") or None, + "Property type": "Other", # Not reliably extractable from Zoopla search cards + "Property sub-type": "", + "price": int(price), + "price_frequency": "" if channel == "BUY" else "monthly", + "Price qualifier": "", + "Total floor area (sqm)": floor_area_sqm, + "Listing URL": listing_url, + "Listing features": [], + "first_visible_date": "", + } + + +# --------------------------------------------------------------------------- +# Top-level search function (called by scraper.py) +# --------------------------------------------------------------------------- + + +def search_outcode( + page, + outcode: str, + channel: str, + pc_index: PostcodeSpatialIndex, + pc_coords: dict[str, tuple[float, float]], +) -> list[dict]: + """Search Zoopla for properties in one outcode. + + Takes a live Camoufox Page (from launch_browser). Navigates through the + search flow, extracts listings from rendered DOM, and transforms to the + standard output schema. + + Raises TurnstileError if Cloudflare blocks us mid-session. + """ + if not _navigate_search(page, outcode, channel): + return [] + + total_results = _get_result_count(page) + if total_results == 0: + return [] + + raw_listings = _paginate(page, total_results, channel) + if not raw_listings: + return [] + + channel_label = "buy" if channel == "BUY" else "rent" + properties = [] + for raw in raw_listings: + transformed = transform_property(raw, channel, pc_index, pc_coords) + if transformed: + properties.append(transformed) + zoopla_properties_scraped.labels(channel=channel_label).inc() + + return properties diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index 057ebbd..5e2a1ce 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -395,6 +395,7 @@ export default function App() { onUnsaveProperty={user ? savedProperties.deleteProperty : undefined} isPropertySaved={user ? savedProperties.isPropertySaved : undefined} getSavedPropertyId={user ? savedProperties.getSavedPropertyId : undefined} + deferTutorial={showLicenseSuccess} /> )} {showAuthModal && ( diff --git a/frontend/src/components/account/AccountPage.tsx b/frontend/src/components/account/AccountPage.tsx index 0e5d34d..6cf8735 100644 --- a/frontend/src/components/account/AccountPage.tsx +++ b/frontend/src/components/account/AccountPage.tsx @@ -2,7 +2,7 @@ import { useState, useCallback, useEffect, useRef } from 'react'; import type { AuthUser } from '../../hooks/useAuth'; import type { SavedSearch } from '../../hooks/useSavedSearches'; import type { SavedProperty, SavedPropertyData } from '../../hooks/useSavedProperties'; -import { apiUrl, authHeaders, assertOk, shortenUrl } from '../../lib/api'; +import { apiUrl, authHeaders, assertOk, shortenUrl, prewarmScreenshot } from '../../lib/api'; import { copyToClipboard } from '../../lib/clipboard'; import { formatRelativeTime, formatNumber } from '../../lib/format'; import { summarizeParams } from '../../lib/url-state'; @@ -172,6 +172,7 @@ function SavedSearchesTab({ const handleShare = useCallback( async (params: string, id: string) => { + prewarmScreenshot(params); setSharingId(id); try { const shortUrl = await shortenUrl(params); @@ -213,7 +214,7 @@ function SavedSearchesTab({ {searches.map((search) => (
{search.screenshotUrl ? ( )} -
+

{search.name}

@@ -238,14 +239,14 @@ function SavedSearchesTab({ {summarizeParams(search.params)}

-
+
onUpdateNotes(search.id, notes)} />
-
+
- +
+
+ + +
+ {prop.data.listingUrl && ( + + View listing → + + )}
- {prop.data.listingUrl && ( - - View listing → - - )}
); })} diff --git a/frontend/src/components/invite/InvitePage.tsx b/frontend/src/components/invite/InvitePage.tsx index 60446a4..8fb68e1 100644 --- a/frontend/src/components/invite/InvitePage.tsx +++ b/frontend/src/components/invite/InvitePage.tsx @@ -94,10 +94,14 @@ export default function InvitePage({ const isDark = theme === 'dark'; - // Signal screenshot readiness once loading completes + // Signal screenshot readiness once loading completes and a frame has painted useEffect(() => { if (screenshotMode && !loading) { - window.__screenshot_ready = true; + requestAnimationFrame(() => { + requestAnimationFrame(() => { + window.__screenshot_ready = true; + }); + }); } }, [screenshotMode, loading]); @@ -313,7 +317,7 @@ export default function InvitePage({
+ setTravelInfoMode(mode)} + title="Feature info" + > + + + {!addFilterCollapsed && ( +
+ +
+ )}
{showPhilosophy && ( diff --git a/frontend/src/components/map/HoverCard.tsx b/frontend/src/components/map/HoverCard.tsx index 669a73a..3d08e9a 100644 --- a/frontend/src/components/map/HoverCard.tsx +++ b/frontend/src/components/map/HoverCard.tsx @@ -111,7 +111,7 @@ export default memo(function HoverCard({ )} {/* Hint */} -
+
Click for details
diff --git a/frontend/src/components/map/JourneyInstructions.tsx b/frontend/src/components/map/JourneyInstructions.tsx index 3a8f27c..a079aa2 100644 --- a/frontend/src/components/map/JourneyInstructions.tsx +++ b/frontend/src/components/map/JourneyInstructions.tsx @@ -241,7 +241,7 @@ export default function JourneyInstructions({ To {j.label || j.slug} - {displayLegs && displayLegs.length > 0 && ( + {!j.loading && totalMin > 0 && ( {totalMin} min @@ -269,6 +269,26 @@ export default function JourneyInstructions({
+ ) : j.minutes != null ? ( +
+
+ + + Walk · {j.minutes} min + +
+ + View on Google Maps + + + + +
) : ( No journey data available diff --git a/frontend/src/components/map/MapPage.tsx b/frontend/src/components/map/MapPage.tsx index 0034aec..62b93b7 100644 --- a/frontend/src/components/map/MapPage.tsx +++ b/frontend/src/components/map/MapPage.tsx @@ -72,6 +72,7 @@ interface MapPageProps { onUnsaveProperty?: (id: string) => void; isPropertySaved?: (address?: string, postcode?: string) => boolean; getSavedPropertyId?: (address?: string, postcode?: string) => string | undefined; + deferTutorial?: boolean; } export default function MapPage({ @@ -99,6 +100,7 @@ export default function MapPage({ onUnsaveProperty, isPropertySaved, getSavedPropertyId, + deferTutorial = false, }: MapPageProps) { const [selectedPOICategories, setSelectedPOICategories] = useState>(initialPOICategories); @@ -153,6 +155,14 @@ export default function MapPage({ const handleAiFilterSubmit = useCallback( async (query: string) => { + // Derive current listing type from Listing status filter + const listingVal = filters['Listing status'] as string[] | undefined; + const listingType = listingVal?.includes('For sale') + ? 'buy' + : listingVal?.includes('For rent') + ? 'rent' + : 'historical'; + // Build context from current filters for conversational refinement const context = { filters, @@ -165,7 +175,11 @@ export default function MapPage({ }; const hasContext = Object.keys(context.filters).length > 0 || context.travelTime.length > 0; - const result = await aiFilters.fetchAiFilters(query, hasContext ? context : undefined); + const result = await aiFilters.fetchAiFilters( + query, + hasContext ? context : undefined, + listingType + ); if (!result) return; handleSetFilters(result.filters); // Always sync travel time entries — clear stale ones when AI returns none @@ -354,7 +368,7 @@ export default function MapPage({ selection.areaStats?.central_postcode, ]); - const tutorial = useTutorial(initialLoading, isMobile); + const tutorial = useTutorial(initialLoading, isMobile, deferTutorial); const [exporting, setExporting] = useState(false); const handleExport = useCallback(() => { @@ -418,7 +432,14 @@ export default function MapPage({ ? mapData.postcodeData.length > 0 : mapData.data.length > 0; if (hasData) { - window.__screenshot_ready = true; + // Wait for deck.gl to actually paint: in interleaved MapboxOverlay mode, + // hexagons render during MapLibre's rAF cycle. Double-rAF ensures at + // least one full paint has completed before we signal readiness. + requestAnimationFrame(() => { + requestAnimationFrame(() => { + window.__screenshot_ready = true; + }); + }); } } }, [ diff --git a/frontend/src/components/map/POIPane.tsx b/frontend/src/components/map/POIPane.tsx index af9b588..0aace7e 100644 --- a/frontend/src/components/map/POIPane.tsx +++ b/frontend/src/components/map/POIPane.tsx @@ -84,7 +84,7 @@ export default function POIPane({ const selectedCount = selectedCategories.size; return ( -
+
diff --git a/frontend/src/components/map/TravelTimeCard.tsx b/frontend/src/components/map/TravelTimeCard.tsx index bde43cb..02b68ba 100644 --- a/frontend/src/components/map/TravelTimeCard.tsx +++ b/frontend/src/components/map/TravelTimeCard.tsx @@ -4,24 +4,13 @@ import { IconButton } from '../ui/IconButton'; import { PillToggle } from '../ui/PillToggle'; import { DestinationDropdown } from '../ui/DestinationDropdown'; import InfoPopup from '../ui/InfoPopup'; +import { TravelTimeInfoPopup } from '../ui/TravelTimeInfoPopup'; import { CloseIcon } from '../ui/icons/CloseIcon'; import { EyeIcon } from '../ui/icons/EyeIcon'; import { InfoIcon } from '../ui/icons/InfoIcon'; -import { CarIcon } from '../ui/icons/CarIcon'; -import { BicycleIcon } from '../ui/icons/BicycleIcon'; -import { WalkingIcon } from '../ui/icons/WalkingIcon'; -import { TransitIcon } from '../ui/icons/TransitIcon'; import { formatFilterValue } from '../../lib/format'; import { useTravelDestinations } from '../../hooks/useTravelDestinations'; -import { MODE_LABELS, type TransportMode } from '../../hooks/useTravelTime'; -import type { ComponentType } from 'react'; - -const MODE_ICONS: Record> = { - car: CarIcon, - bicycle: BicycleIcon, - walking: WalkingIcon, - transit: TransitIcon, -}; +import { MODE_LABELS, MODE_ICONS, type TransportMode } from '../../hooks/useTravelTime'; interface TravelTimeCardProps { mode: TransportMode; @@ -118,21 +107,7 @@ export function TravelTimeCard({
)} - {showInfo && ( - setShowInfo(false)}> -

- Shows how long it takes to reach the selected destination from each area - {mode === 'transit' - ? ' by public transport (bus, rail, tube). Times are computed across a typical weekday morning window.' - : mode === 'car' - ? ' by car, based on typical road speeds and the road network.' - : mode === 'bicycle' - ? ' by bicycle, using cycle-friendly routes.' - : ' on foot, using pedestrian paths and pavements.'}{' '} - Use the slider to filter areas within your preferred commute time. -

-
- )} + {showInfo && setShowInfo(false)} />} {showBestInfo && ( setShowBestInfo(false)}> diff --git a/frontend/src/components/ui/AuthModal.tsx b/frontend/src/components/ui/AuthModal.tsx index cc45edb..f3a4b4c 100644 --- a/frontend/src/components/ui/AuthModal.tsx +++ b/frontend/src/components/ui/AuthModal.tsx @@ -87,7 +87,7 @@ export default function AuthModal({ if (e.target === e.currentTarget) onClose(); }} > -
+
{/* Header */}
diff --git a/frontend/src/components/ui/Header.tsx b/frontend/src/components/ui/Header.tsx index 60aae7d..6ff1baf 100644 --- a/frontend/src/components/ui/Header.tsx +++ b/frontend/src/components/ui/Header.tsx @@ -1,6 +1,6 @@ import { useState, useCallback, useEffect } from 'react'; import type { AuthUser } from '../../hooks/useAuth'; -import { shortenUrl } from '../../lib/api'; +import { shortenUrl, prewarmScreenshot } from '../../lib/api'; import { copyToClipboard } from '../../lib/clipboard'; import { DownloadIcon } from './icons/DownloadIcon'; import { BookmarkIcon } from './icons/BookmarkIcon'; @@ -96,6 +96,7 @@ export default function Header({ doCopy(window.location.href); return; } + prewarmScreenshot(params); setSharing(true); try { const shortUrl = await shortenUrl(params); @@ -243,6 +244,7 @@ export default function Header({ theme={theme} onToggleTheme={onToggleTheme} onLogout={onLogout} + onNavigate={onPageChange} /> ) : ( <> diff --git a/frontend/src/components/ui/InfoPopup.tsx b/frontend/src/components/ui/InfoPopup.tsx index dc56a48..5938b88 100644 --- a/frontend/src/components/ui/InfoPopup.tsx +++ b/frontend/src/components/ui/InfoPopup.tsx @@ -23,7 +23,7 @@ export default function InfoPopup({ title, children, onClose, sourceLink }: Info

{title}

diff --git a/frontend/src/components/ui/TravelTimeInfoPopup.tsx b/frontend/src/components/ui/TravelTimeInfoPopup.tsx new file mode 100644 index 0000000..9a2be71 --- /dev/null +++ b/frontend/src/components/ui/TravelTimeInfoPopup.tsx @@ -0,0 +1,27 @@ +import InfoPopup from './InfoPopup'; +import { MODE_LABELS, type TransportMode } from '../../hooks/useTravelTime'; + +const MODE_INFO: Record = { + transit: + ' by public transport (bus, rail, tube). Times are computed across a typical weekday morning window.', + car: ' by car, based on typical road speeds and the road network.', + bicycle: ' by bicycle, using cycle-friendly routes.', + walking: ' on foot, using pedestrian paths and pavements.', +}; + +export function TravelTimeInfoPopup({ + mode, + onClose, +}: { + mode: TransportMode; + onClose: () => void; +}) { + return ( + +

+ Shows how long it takes to reach the selected destination from each area + {MODE_INFO[mode]} Use the slider to filter areas within your preferred commute time. +

+
+ ); +} diff --git a/frontend/src/components/ui/UserMenu.tsx b/frontend/src/components/ui/UserMenu.tsx index 0749e0f..e6c4cce 100644 --- a/frontend/src/components/ui/UserMenu.tsx +++ b/frontend/src/components/ui/UserMenu.tsx @@ -1,5 +1,7 @@ import { useState, useRef, useEffect } from 'react'; import type { AuthUser } from '../../hooks/useAuth'; +import type { Page } from './Header'; +import { PAGE_PATHS } from './Header'; import { SunIcon } from './icons/SunIcon'; import { MoonIcon } from './icons/MoonIcon'; @@ -8,11 +10,13 @@ export default function UserMenu({ theme, onToggleTheme, onLogout, + onNavigate, }: { user: AuthUser; theme: 'light' | 'dark'; onToggleTheme: () => void; onLogout: () => void; + onNavigate: (page: Page) => void; }) { const [open, setOpen] = useState(false); const menuRef = useRef(null); @@ -72,8 +76,13 @@ export default function UserMenu({ Theme: {theme === 'light' ? 'Light' : 'Dark'} setOpen(false)} + href={PAGE_PATHS.account} + onClick={(e) => { + if (e.metaKey || e.ctrlKey || e.shiftKey || e.button !== 0) return; + e.preventDefault(); + setOpen(false); + onNavigate('account'); + }} className="block w-full text-left px-3 py-2 text-sm text-warm-700 dark:text-warm-300 hover:bg-warm-50 dark:hover:bg-warm-700 rounded" > Account diff --git a/frontend/src/hooks/useAiFilters.ts b/frontend/src/hooks/useAiFilters.ts index f204624..93110cd 100644 --- a/frontend/src/hooks/useAiFilters.ts +++ b/frontend/src/hooks/useAiFilters.ts @@ -17,6 +17,8 @@ export interface AiFiltersResult { notes: string; /** Human-readable summary of what was set */ summary: string; + /** The listing mode used (historical/buy/rent) */ + listingType: string; } export type AiFilterErrorType = 'auth' | 'limit' | 'error'; @@ -28,7 +30,11 @@ export interface AiFiltersContext { } interface UseAiFiltersResult { - fetchAiFilters: (query: string, context?: AiFiltersContext) => Promise; + fetchAiFilters: ( + query: string, + context?: AiFiltersContext, + listingType?: string + ) => Promise; loading: boolean; error: string | null; errorType: AiFilterErrorType | null; @@ -41,6 +47,8 @@ function buildSummary(filters: FeatureFilters, travelTimeFilters: AiTravelTimeFi const parts: string[] = []; for (const [name, value] of Object.entries(filters)) { + // Skip Listing status — shown via the mode selector UI + if (name === 'Listing status') continue; if (Array.isArray(value) && value.length === 2 && typeof value[0] === 'number') { parts.push(name); } else if (Array.isArray(value)) { @@ -67,7 +75,11 @@ export function useAiFilters(): UseAiFiltersResult { const abortRef = useRef(null); const fetchAiFilters = useCallback( - async (query: string, context?: AiFiltersContext): Promise => { + async ( + query: string, + context?: AiFiltersContext, + listingType?: string + ): Promise => { abortRef.current?.abort(); const controller = new AbortController(); abortRef.current = controller; @@ -81,6 +93,7 @@ export function useAiFilters(): UseAiFiltersResult { try { const url = apiUrl('ai-filters'); const bodyObj: Record = { query }; + if (listingType) bodyObj.listing_type = listingType; if (context) { bodyObj.context = { filters: context.filters, @@ -130,6 +143,7 @@ export function useAiFilters(): UseAiFiltersResult { travelTimeFilters, notes: json.notes || '', summary: summaryText, + listingType: json.listing_type || 'historical', }; setNotes(result.notes || null); setSummary(summaryText); diff --git a/frontend/src/hooks/useDeckLayers.ts b/frontend/src/hooks/useDeckLayers.ts index b6596ef..920af1e 100644 --- a/frontend/src/hooks/useDeckLayers.ts +++ b/frontend/src/hooks/useDeckLayers.ts @@ -95,7 +95,7 @@ export function useDeckLayers({ useEffect(() => { if (!hasSelection) return; setMarchTime(0); - const id = setInterval(() => setMarchTime((t) => t + 0.3), 50); + const id = setInterval(() => setMarchTime((t) => (t + 0.3) % 10000), 50); return () => clearInterval(id); }, [hasSelection]); diff --git a/frontend/src/hooks/useTravelTime.ts b/frontend/src/hooks/useTravelTime.ts index c475e3d..704caa6 100644 --- a/frontend/src/hooks/useTravelTime.ts +++ b/frontend/src/hooks/useTravelTime.ts @@ -1,4 +1,6 @@ import { useState, useCallback, useMemo } from 'react'; +import type { ComponentType } from 'react'; +import { CarIcon, BicycleIcon, WalkingIcon, TransitIcon } from '../components/ui/icons'; export type TransportMode = 'car' | 'bicycle' | 'walking' | 'transit'; @@ -18,6 +20,13 @@ export const MODE_DESCRIPTIONS: Record = { transit: 'Journey time by train, tube, and bus', }; +export const MODE_ICONS: Record> = { + car: CarIcon, + bicycle: BicycleIcon, + walking: WalkingIcon, + transit: TransitIcon, +}; + export interface TravelTimeEntry { mode: TransportMode; slug: string; diff --git a/frontend/src/hooks/useTutorial.ts b/frontend/src/hooks/useTutorial.ts index 6234c00..5f13d37 100644 --- a/frontend/src/hooks/useTutorial.ts +++ b/frontend/src/hooks/useTutorial.ts @@ -59,13 +59,13 @@ const STEPS: Step[] = [ }, ]; -export function useTutorial(initialLoading: boolean, isMobile: boolean) { +export function useTutorial(initialLoading: boolean, isMobile: boolean, blocked = false) { const [run, setRun] = useState(() => { if (isMobile) return false; return !localStorage.getItem(STORAGE_KEY); }); - const shouldRun = run && !initialLoading && !isMobile; + const shouldRun = run && !initialLoading && !isMobile && !blocked; const handleCallback = useCallback((data: CallBackProps) => { const { status, action, type } = data; diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index 3b28daa..98fec79 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -59,6 +59,12 @@ export async function fetchWithRetry( } } +/** Fire-and-forget request to pre-warm the screenshot cache for OG images. */ +export function prewarmScreenshot(params: string): void { + fetch(apiUrl('screenshot', new URLSearchParams(`og=1&${params}`)), authHeaders()) + .catch(() => {}); // best-effort, don't care if it fails +} + export async function shortenUrl(params: string): Promise { const res = await fetch(apiUrl('shorten'), { method: 'POST', diff --git a/frontend/src/lib/clipboard.ts b/frontend/src/lib/clipboard.ts index c0a8b5d..925f243 100644 --- a/frontend/src/lib/clipboard.ts +++ b/frontend/src/lib/clipboard.ts @@ -1,7 +1,18 @@ /** Copy text to clipboard with execCommand fallback for older browsers. */ export function copyToClipboard(text: string, onSuccess: () => void): void { if (navigator.clipboard?.writeText) { - navigator.clipboard.writeText(text).then(onSuccess); + navigator.clipboard.writeText(text).then(onSuccess).catch(() => { + // Fallback if clipboard permission denied + const ta = document.createElement('textarea'); + ta.value = text; + ta.style.position = 'fixed'; + ta.style.opacity = '0'; + document.body.appendChild(ta); + ta.select(); + document.execCommand('copy'); + document.body.removeChild(ta); + onSuccess(); + }); } else { const ta = document.createElement('textarea'); ta.value = text; diff --git a/frontend/src/lib/consts.ts b/frontend/src/lib/consts.ts index 059cead..1cc1779 100644 --- a/frontend/src/lib/consts.ts +++ b/frontend/src/lib/consts.ts @@ -35,7 +35,7 @@ export const ZOOM_TO_RESOLUTION_THRESHOLDS = [ { maxZoom: 13, resolution: 9 }, ] as const; -export const POSTCODE_ZOOM_THRESHOLD = 16; +export const POSTCODE_ZOOM_THRESHOLD = 15; export const FEATURE_GRADIENT: { t: number; color: [number, number, number] }[] = [ { t: 0, color: [46, 204, 113] }, @@ -183,8 +183,8 @@ export const STACKED_ENUM_GROUPS: Record< }, { label: 'Leasehold/Freehold', - feature: 'Leashold/Freehold', - components: ['Leashold/Freehold'], + feature: 'Leasehold/Freehold', + components: ['Leasehold/Freehold'], valueOrder: ['Freehold', 'Leasehold'], valueColors: ['#3b82f6', '#f59e0b'], }, diff --git a/frontend/src/lib/external-search.ts b/frontend/src/lib/external-search.ts index 33c8ef5..5c89ef5 100644 --- a/frontend/src/lib/external-search.ts +++ b/frontend/src/lib/external-search.ts @@ -49,24 +49,57 @@ const RIGHTMOVE_PRICES = [ 3000000, 4000000, 5000000, 7500000, 10000000, 15000000, 20000000, ]; -function nearestRadius(target: number, allowed: number[]): number { - return allowed.reduce((best, r) => (Math.abs(r - target) < Math.abs(best - target) ? r : best)); -} +// Rightmove allowed monthly rent values (pcm) +const RIGHTMOVE_RENTS = [ + 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2500, 3000, + 3500, 4000, 5000, 7500, 10000, 15000, 25000, +]; -/** Snap minPrice down and maxPrice up so Rightmove doesn't ignore them */ -function snapRightmovePrice(value: number, direction: 'floor' | 'ceil'): number { +// OnTheMarket allowed buy prices +const OTM_PRICES = [ + 50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 125000, 130000, 140000, 150000, + 160000, 170000, 175000, 180000, 190000, 200000, 210000, 220000, 230000, 240000, 250000, 275000, + 300000, 325000, 350000, 375000, 400000, 425000, 450000, 475000, 500000, 550000, 600000, 650000, + 700000, 750000, 800000, 900000, 1000000, 1250000, 1500000, 2000000, 2500000, 3000000, 5000000, + 7500000, 10000000, 15000000, +]; + +// OnTheMarket allowed monthly rent values (pcm) +const OTM_RENTS = [ + 100, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, + 1100, 1200, 1250, 1300, 1400, 1500, 1750, 2000, 2500, 3000, 3500, 4000, 5000, 7500, 10000, + 25000, +]; + +// Zoopla allowed buy prices +const ZOOPLA_PRICES = [ + 10000, 25000, 50000, 75000, 100000, 125000, 150000, 175000, 200000, 225000, 250000, 275000, + 300000, 325000, 350000, 375000, 400000, 425000, 450000, 475000, 500000, 550000, 600000, 650000, + 700000, 800000, 900000, 1000000, 1250000, 1500000, 1750000, 2000000, 2500000, 3000000, 4000000, + 5000000, 7500000, 10000000, 15000000, +]; + +// Zoopla allowed monthly rent values (pcm) +const ZOOPLA_RENTS = [ + 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2500, 3000, 3500, + 4000, 5000, 7500, 10000, 25000, +]; + +function snapToAllowed(value: number, allowed: number[], direction: 'floor' | 'ceil'): number { if (direction === 'floor') { - // Largest supported value <= target - for (let i = RIGHTMOVE_PRICES.length - 1; i >= 0; i--) { - if (RIGHTMOVE_PRICES[i] <= value) return RIGHTMOVE_PRICES[i]; + for (let i = allowed.length - 1; i >= 0; i--) { + if (allowed[i] <= value) return allowed[i]; } - return RIGHTMOVE_PRICES[0]; + return allowed[0]; } - // Smallest supported value >= target - for (const p of RIGHTMOVE_PRICES) { + for (const p of allowed) { if (p >= value) return p; } - return RIGHTMOVE_PRICES[RIGHTMOVE_PRICES.length - 1]; + return allowed[allowed.length - 1]; +} + +function nearestRadius(target: number, allowed: number[]): number { + return allowed.reduce((best, r) => (Math.abs(r - target) < Math.abs(best - target) ? r : best)); } interface SearchUrlOptions { @@ -90,7 +123,17 @@ export function buildPropertySearchUrls({ const radiusMiles = isPostcode ? 0.25 : (H3_RADIUS_MILES[resolution] ?? 1); - const priceFilter = filters['Last known price']; + const listingStatus = filters['Listing status']; + const isRent = + Array.isArray(listingStatus) && + typeof listingStatus[0] === 'string' && + (listingStatus as string[]).includes('For rent'); + + // Check price filters in priority order: asking price (current listings) > estimated > last known + // For rent mode, check asking rent first + const priceFilter = isRent + ? filters['Asking rent (monthly)'] + : (filters['Asking price'] ?? filters['Estimated current price'] ?? filters['Last known price']); const minPrice = Array.isArray(priceFilter) && typeof priceFilter[0] === 'number' ? priceFilter[0] : undefined; const maxPrice = @@ -131,15 +174,16 @@ export function buildPropertySearchUrls({ // Rightmove — requires locationIdentifier from typeahead API let rightmove: string | null = null; if (rightmoveLocationId) { + const rmPrices = isRent ? RIGHTMOVE_RENTS : RIGHTMOVE_PRICES; const rmParams = new URLSearchParams(); rmParams.set('searchLocation', postcode); rmParams.set('useLocationIdentifier', 'true'); rmParams.set('locationIdentifier', rightmoveLocationId); rmParams.set('radius', String(nearestRadius(radiusMiles, RIGHTMOVE_RADII))); if (minPrice !== undefined) - rmParams.set('minPrice', String(snapRightmovePrice(minPrice, 'floor'))); + rmParams.set('minPrice', String(snapToAllowed(minPrice, rmPrices, 'floor'))); if (maxPrice !== undefined) - rmParams.set('maxPrice', String(snapRightmovePrice(maxPrice, 'ceil'))); + rmParams.set('maxPrice', String(snapToAllowed(maxPrice, rmPrices, 'ceil'))); if (minBedrooms !== undefined) rmParams.set('minBedrooms', String(Math.floor(minBedrooms))); if (maxBedrooms !== undefined) rmParams.set('maxBedrooms', String(Math.ceil(maxBedrooms))); if (minBathrooms !== undefined) rmParams.set('minBathrooms', String(Math.floor(minBathrooms))); @@ -155,20 +199,24 @@ export function buildPropertySearchUrls({ ]; if (rmTypes.length > 0) rmParams.set('propertyTypes', rmTypes.join(',')); } - if (selectedTenures.length > 0) { + if (!isRent && selectedTenures.length > 0) { const rmTenures = selectedTenures.map((t) => (t === 'Freehold' ? 'FREEHOLD' : 'LEASEHOLD')); rmParams.set('tenureTypes', rmTenures.join(',')); } - rmParams.set('_includeSSTC', 'on'); - rightmove = `https://www.rightmove.co.uk/property-for-sale/find.html?${rmParams.toString()}`; + if (!isRent) rmParams.set('_includeSSTC', 'on'); + const rmPath = isRent ? 'property-to-rent' : 'property-for-sale'; + rightmove = `https://www.rightmove.co.uk/${rmPath}/find.html?${rmParams.toString()}`; } // OnTheMarket — postcode slug in URL path (e.g. "SW1A 1AA" → "sw1a-1aa") const otmSlug = postcode.toLowerCase().replace(/\s+/g, '-'); + const otmPrices = isRent ? OTM_RENTS : OTM_PRICES; const otmParams = new URLSearchParams(); otmParams.set('radius', String(nearestRadius(radiusMiles, OTM_RADII))); - if (minPrice !== undefined) otmParams.set('min-price', String(Math.round(minPrice))); - if (maxPrice !== undefined) otmParams.set('max-price', String(Math.round(maxPrice))); + if (minPrice !== undefined) + otmParams.set('min-price', String(snapToAllowed(minPrice, otmPrices, 'floor'))); + if (maxPrice !== undefined) + otmParams.set('max-price', String(snapToAllowed(maxPrice, otmPrices, 'ceil'))); if (selectedTypes.length > 0) { const otmTypes = [ ...new Set(selectedTypes.map((t) => PROPERTY_TYPE_MAP[t]?.onthemarket).filter(Boolean)), @@ -178,15 +226,20 @@ export function buildPropertySearchUrls({ } } otmParams.set('view', 'map-list'); - const onthemarket = `https://www.onthemarket.com/for-sale/property/${otmSlug}/?${otmParams.toString()}`; + const otmPath = isRent ? 'to-rent' : 'for-sale'; + const onthemarket = `https://www.onthemarket.com/${otmPath}/property/${otmSlug}/?${otmParams.toString()}`; // Zoopla + const zPrices = isRent ? ZOOPLA_RENTS : ZOOPLA_PRICES; const zParams = new URLSearchParams(); zParams.set('q', postcode); - zParams.set('search_source', 'for-sale'); + const zSearchSource = isRent ? 'to-rent' : 'for-sale'; + zParams.set('search_source', zSearchSource); zParams.set('radius', String(nearestRadius(radiusMiles, ZOOPLA_RADII))); - if (minPrice !== undefined) zParams.set('price_min', String(Math.round(minPrice))); - if (maxPrice !== undefined) zParams.set('price_max', String(Math.round(maxPrice))); + if (minPrice !== undefined) + zParams.set('price_min', String(snapToAllowed(minPrice, zPrices, 'floor'))); + if (maxPrice !== undefined) + zParams.set('price_max', String(snapToAllowed(maxPrice, zPrices, 'ceil'))); if (selectedTypes.length > 0) { const zTypes = [ ...new Set(selectedTypes.map((t) => PROPERTY_TYPE_MAP[t]?.zoopla).filter(Boolean)), @@ -195,14 +248,9 @@ export function buildPropertySearchUrls({ zParams.append('property_sub_type', zt!); } } - const zoopla = `https://www.zoopla.co.uk/for-sale/property/?${zParams.toString()}`; + const zoopla = `https://www.zoopla.co.uk/${zSearchSource}/property/?${zParams.toString()}`; // OpenRent — rent mode only - const listingStatus = filters['Listing status']; - const isRent = - Array.isArray(listingStatus) && - typeof listingStatus[0] === 'string' && - (listingStatus as string[]).includes('For rent'); let openrent: string | null = null; if (isRent) { const postcodeNoSpaces = postcode.replace(/\s+/g, ''); diff --git a/frontend/src/lib/format.ts b/frontend/src/lib/format.ts index 1562f1c..ceda414 100644 --- a/frontend/src/lib/format.ts +++ b/frontend/src/lib/format.ts @@ -23,6 +23,26 @@ export function formatFilterValue(value: number, raw?: boolean): string { return value.toFixed(2); } +/** Parse a user-typed value like "250k", "1.2M", "£300000", "50 sqm" back to a number. */ +export function parseInputValue( + text: string, + opts?: { prefix?: string; suffix?: string; step?: number } +): number | null { + let s = text.trim(); + if (opts?.prefix) s = s.replace(new RegExp(`^\\${opts.prefix}`), ''); + if (opts?.suffix) s = s.replace(new RegExp(`${opts.suffix.trim()}$`), ''); + s = s.trim().replace(/,/g, ''); + const m = s.match(/^(-?\d+\.?\d*)\s*([kKmM]?)$/); + if (!m) return null; + let val = parseFloat(m[1]); + if (isNaN(val)) return null; + const unit = m[2].toLowerCase(); + if (unit === 'k') val *= 1_000; + else if (unit === 'm') val *= 1_000_000; + if (opts?.step) val = Math.round(val / opts.step) * opts.step; + return val; +} + export function formatDuration(d: string): string { if (d === 'F') return 'Freehold'; if (d === 'L') return 'Leasehold'; diff --git a/pipeline/transform/merge.py b/pipeline/transform/merge.py index 40f7201..b691760 100644 --- a/pipeline/transform/merge.py +++ b/pipeline/transform/merge.py @@ -186,11 +186,11 @@ def _build( lsoa_pop = pl.scan_parquet(lsoa_population_path) wide = wide.join(lsoa_pop, on="lsoa21", how="left") wide = wide.with_columns( - (pl.col("serious_crime_avg_yr") / pl.col("population") * 1000) - .round(1) + pl.when(pl.col("population") > 0) + .then((pl.col("serious_crime_avg_yr") / pl.col("population") * 1000).round(1)) .alias("serious_crime_per_1k"), - (pl.col("minor_crime_avg_yr") / pl.col("population") * 1000) - .round(1) + pl.when(pl.col("population") > 0) + .then((pl.col("minor_crime_avg_yr") / pl.col("population") * 1000).round(1)) .alias("minor_crime_per_1k"), ).drop("population") diff --git a/pipeline/utils/download.py b/pipeline/utils/download.py index 558184f..d889c89 100644 --- a/pipeline/utils/download.py +++ b/pipeline/utils/download.py @@ -37,4 +37,4 @@ def extract_zip(zip_path: Path, extract_dir: Path) -> None: """Extract a ZIP archive into the given directory.""" extract_dir.mkdir(parents=True, exist_ok=True) with zipfile.ZipFile(zip_path, "r") as zf: - zf.extractall(extract_dir) + zf.extractall(extract_dir, filter="data") diff --git a/pipeline/utils/fuzzy_join.py b/pipeline/utils/fuzzy_join.py index 9ca8e4a..5fa6cc8 100644 --- a/pipeline/utils/fuzzy_join.py +++ b/pipeline/utils/fuzzy_join.py @@ -84,7 +84,7 @@ def fuzzy_join_on_postcode( right_match["_right_postcode"], right_match["_right_address"], ): - if postcode is not None: + if address is not None and postcode is not None: right_by_postcode.setdefault(postcode, []).append((idx, address)) # Group left side by postcode diff --git a/screenshot/src/screenshot.ts b/screenshot/src/screenshot.ts index c271dc0..097f078 100644 --- a/screenshot/src/screenshot.ts +++ b/screenshot/src/screenshot.ts @@ -5,7 +5,7 @@ import { NetworkCache } from './network-cache.js'; const VIEWPORT = { width: 1200, height: 630 }; const NAVIGATION_TIMEOUT = 15_000; const READY_TIMEOUT = 15_000; -const RENDER_BUFFER_MS = 200; +const RENDER_BUFFER_MS = 500; const POOL_SIZE = 3; let browser: Browser | null = null; @@ -262,9 +262,10 @@ export async function takeScreenshot(url: string, authHeader?: string): Promise< const t2 = performance.now(); console.log(` Ready: ${(t2 - t1).toFixed(0)}ms`); - // Brief buffer for SwiftShader to finish rendering the WebGL frame. - // Reduced from 500ms → 200ms since tiles now load from the in-memory - // cache and don't need network round-trips. + // Buffer for SwiftShader to finish rendering the WebGL frame after + // __screenshot_ready fires. The frontend uses double-rAF before signaling, + // so one paint cycle has already completed — this is extra safety for + // compositor staging and any residual tile/layer rendering. await page.waitForTimeout(RENDER_BUFFER_MS); // JPEG at quality 85: ~3-5x faster encoding than PNG with negligible diff --git a/scripts/zoopla_experiment.py b/scripts/zoopla_experiment.py index 43db52d..fdcaf9f 100755 --- a/scripts/zoopla_experiment.py +++ b/scripts/zoopla_experiment.py @@ -1,15 +1,21 @@ #!/usr/bin/env -S uv run --project ../finder -"""Zoopla scraping experiment — Playwright with stealth + network interception. +"""Zoopla scraping experiment — working prototype using Camoufox. -Zoopla uses Next.js App Router with React Server Components. The listing data -is NOT in __NEXT_DATA__ or the initial HTML — it's fetched client-side after -hydration. This means we need a real browser that: - 1. Passes Cloudflare's bot detection - 2. Executes JavaScript to trigger the client-side data fetch - 3. Intercepts the network response OR scrapes the rendered DOM +Key findings: + - Zoopla uses Cloudflare Turnstile (managed interactive challenge) + - Playwright headless Chromium + stealth patches CANNOT beat it + - Camoufox (anti-fingerprinting Firefox fork) PASSES Cloudflare + - Zoopla uses Next.js App Router with React Server Components (RSC) + - Listing data is NOT in __NEXT_DATA__ — it's server-rendered in RSC stream + - URL-based location slugs (e.g. /properties/london/) return 0 results + - Must use the search autocomplete (GraphQL: getGeoSuggestion) to resolve + a location, then submit the form to get results + - GraphQL endpoint: api-graphql-lambda.prod.zoopla.co.uk/graphql + - Listings loaded via getTopLeadListingIds + getRareFindLeadListingIds ops Usage: - uv run --project finder scripts/zoopla_experiment.py [OUTCODE] + uv run --project finder scripts/zoopla_experiment.py [LOCATION] + uv run --project finder scripts/zoopla_experiment.py "Tower Hamlets" """ import json @@ -25,294 +31,250 @@ logging.basicConfig( ) log = logging.getLogger("zoopla-exp") -ZOOPLA_BASE = "https://www.zoopla.co.uk" -CHANNELS = { - "BUY": "for-sale", - "RENT": "to-rent", -} +def scrape_zoopla(location: str = "London", channel: str = "BUY"): + from camoufox.sync_api import Camoufox + tab_label = "Buy" if channel == "BUY" else "Rent" + log.info("Scraping Zoopla: location=%s channel=%s", location, channel) -def run_playwright_stealth(outcode: str, channel: str = "BUY"): - """Use Playwright with stealth patches to scrape Zoopla. + with Camoufox(headless=True) as browser: + page = browser.new_page() - Strategy: - 1. Launch stealth browser to bypass Cloudflare - 2. Navigate to search page - 3. Wait for listings to render (client-side hydration) - 4. Try two extraction methods: - a. Intercept network requests for API data (cleanest) - b. Parse the rendered DOM (fallback) - """ - from playwright.sync_api import sync_playwright - from playwright_stealth import Stealth + # Intercept GraphQL responses + graphql_responses = [] - url_segment = CHANNELS[channel] - search_url = f"{ZOOPLA_BASE}/{url_segment}/properties/{outcode.lower()}/" - log.info("Target: %s", search_url) - - intercepted_data = [] - - def handle_response(response): - """Capture any API responses that look like listing data.""" - url = response.url - # Look for API/data endpoints - if any(kw in url for kw in ["/api/", "graphql", "search", "listing", "property"]): - try: - if "application/json" in (response.headers.get("content-type", "")): + def on_resp(response): + url = response.url + ct = response.headers.get("content-type", "") + if "json" in ct and "graphql" in url: + try: body = response.json() - intercepted_data.append({"url": url, "data": body}) - log.info(" [intercepted] %s (%s)", url[:100], type(body).__name__) + req = response.request.post_data or "" + graphql_responses.append({"body": body, "req": req}) + except Exception: + pass + + page.on("response", on_resp) + + # Step 1: Load homepage and pass Cloudflare + log.info("Loading Zoopla homepage...") + page.goto("https://www.zoopla.co.uk/", wait_until="domcontentloaded", timeout=60000) + + for i in range(20): + if "Just a moment" not in page.title(): + break + time.sleep(3) + else: + log.error("Cloudflare did not resolve after 60s") + return [] + + log.info("Homepage loaded: %s", page.title()) + time.sleep(3) + + # Step 2: Dismiss cookie consent (shadow DOM) + page.evaluate("""() => { + const aside = document.querySelector('#usercentrics-cmp-ui'); + if (aside && aside.shadowRoot) { + const btns = aside.shadowRoot.querySelectorAll('button'); + for (const btn of btns) { + if (btn.innerText.includes('Accept')) { btn.click(); return; } + } + } + aside?.remove(); + }""") + time.sleep(2) + + # Step 3: Select Buy/Rent tab if needed + if channel == "RENT": + rent_tab = page.query_selector('button:has-text("Rent")') or page.query_selector(f'[role="tab"]:has-text("{tab_label}")') + if rent_tab: + rent_tab.click() + time.sleep(1) + + # Step 4: Type location into search and select autocomplete suggestion + log.info("Searching for '%s'...", location) + search_input = ( + page.query_selector('input[name="autosuggest-input"]') + or page.query_selector('input[type="text"]') + ) + if not search_input: + log.error("Could not find search input") + return [] + + search_input.click() + time.sleep(0.5) + search_input.fill("") # Clear any existing text + search_input.type(location, delay=80) + time.sleep(3) + + # Select first autocomplete suggestion + first_option = page.query_selector('[role="option"]') + if first_option: + suggestion_text = first_option.inner_text() + log.info("Selecting suggestion: %s", suggestion_text) + first_option.click() + time.sleep(1) + else: + log.warning("No autocomplete suggestions appeared") + + # Step 5: Submit search + search_btn = page.query_selector('button:has-text("Search")') + if search_btn: + search_btn.click() + else: + search_input.press("Enter") + + log.info("Waiting for results...") + time.sleep(10) + + final_url = page.url + final_title = page.title() + log.info("URL: %s", final_url) + log.info("Title: %s", final_title) + + # Step 6: Extract listings from rendered DOM + listings = page.evaluate(r"""() => { + const links = Array.from(document.querySelectorAll( + 'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]' + )); + + const seen = new Set(); + const results = []; + + for (const link of links) { + const href = link.href; + const match = href.match(/\/details\/(\d+)\//); + if (!match) continue; + + const id = match[1]; + if (seen.has(id)) continue; + seen.add(id); + + // Walk up to find the listing card container + let card = link; + for (let j = 0; j < 10; j++) { + card = card.parentElement; + if (!card) break; + const text = card.innerText || ''; + // A listing card should have a price and at least beds or area + if (text.includes('£') && (text.includes('bed') || text.includes('sq ft'))) { + break; + } + } + if (!card) continue; + + const text = card.innerText || ''; + const lines = text.split('\n').map(l => l.trim()).filter(Boolean); + + const priceMatch = text.match(/£([\d,]+)/); + const bedsMatch = text.match(/(\d+)\s*beds?/i); + const bathsMatch = text.match(/(\d+)\s*baths?/i); + const recMatch = text.match(/(\d+)\s*reception/i); + const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i); + + // Try to find address — usually a line with a postcode or comma-separated location + let address = ''; + for (const line of lines) { + if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) || + (line.includes(',') && !line.includes('£') && !line.match(/^\d+ beds?/i))) { + address = line; + break; + } + } + + // Tenure + let tenure = ''; + if (/freehold/i.test(text)) tenure = 'Freehold'; + else if (/leasehold/i.test(text)) tenure = 'Leasehold'; + + results.push({ + id: id, + url: href.replace(window.location.origin, ''), + price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, + beds: bedsMatch ? parseInt(bedsMatch[1]) : null, + baths: bathsMatch ? parseInt(bathsMatch[1]) : null, + receptions: recMatch ? parseInt(recMatch[1]) : null, + floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, + address: address, + tenure: tenure, + text_preview: lines.slice(0, 10).join(' | '), + }); + } + + return results; + }""") + + log.info("Extracted %d unique listings from page 1", len(listings)) + + # Step 7: Check for results count and pagination + body_text = page.inner_text("body") + count_match = re.search(r"([\d,]+)\s+results?", body_text) + total_results = int(count_match.group(1).replace(",", "")) if count_match else len(listings) + log.info("Total results: %d", total_results) + + # Step 8: Log GraphQL operations we saw + log.info("GraphQL operations intercepted:") + for gql in graphql_responses: + try: + req = json.loads(gql["req"]) + op = req.get("operationName", "?") + log.info(" - %s", op) except Exception: pass - with sync_playwright() as p: - # Launch with stealth-friendly args - browser = p.chromium.launch( - headless=True, - args=[ - "--disable-blink-features=AutomationControlled", - "--no-sandbox", - "--disable-dev-shm-usage", - "--disable-web-security", - "--lang=en-GB", - ], - ) - context = browser.new_context( - locale="en-GB", - timezone_id="Europe/London", - viewport={"width": 1920, "height": 1080}, - user_agent=( - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" - ), - ) - page = context.new_page() - - # Apply stealth patches (Linux platform, Chrome UA) - stealth = Stealth( - navigator_platform_override="Linux x86_64", - navigator_languages_override=("en-GB", "en"), - ) - stealth.apply_stealth_sync(page) - - # Listen for responses to intercept API data - page.on("response", handle_response) - - # Navigate - log.info("Navigating to %s ...", search_url) - try: - page.goto(search_url, wait_until="domcontentloaded", timeout=60000) - except Exception as e: - log.error("Navigation failed: %s", e) - browser.close() - return - - # Wait for Cloudflare to resolve - log.info("Waiting for Cloudflare challenge to resolve ...") - for attempt in range(20): - content = page.content() - title = page.title() - if "Just a moment" in content and "challenge" in content.lower(): - log.info(" Cloudflare challenge still active (%d/20) title=%s", attempt + 1, title) - time.sleep(3) - else: - log.info(" Challenge resolved! title=%s", title) - break - else: - log.error("Cloudflare challenge did not resolve") - # Dump page content for debugging - print("\n=== Cloudflare challenge page ===") - print(page.content()[:3000]) - browser.close() - return - - # Wait for actual content to render - log.info("Waiting for listing content to render ...") - try: - # Try waiting for property cards to appear - page.wait_for_selector( - '[data-testid="search-result"], [data-testid="regular-listings"], ' - '.listing-results, .css-kdnlof, [class*="ListingCard"], ' - '[class*="listing"], [class*="PropertyCard"]', - timeout=15000, - ) - log.info("Listing elements found in DOM!") - except Exception: - log.warning("No listing elements found by selector. Trying to wait for prices...") - try: - page.wait_for_function( - "document.querySelectorAll('a[href*=\"/for-sale/details/\"]').length > 0", - timeout=15000, - ) - log.info("Listing links found in DOM!") - except Exception: - log.warning("No listing links either. Page may still be loading or we're blocked.") - - # Give hydration a moment - time.sleep(3) - - # --- Extraction Method A: Check intercepted network data --- - if intercepted_data: - print(f"\n=== Intercepted {len(intercepted_data)} API responses ===") - for item in intercepted_data: - print(f"\nURL: {item['url'][:150]}") - data = item["data"] - if isinstance(data, dict): - print(f"Keys: {list(data.keys())[:15]}") - # Look for listings inside - for k, v in data.items(): - if isinstance(v, list) and len(v) > 2 and isinstance(v[0], dict): - print(f" {k}: list of {len(v)} items, [0] keys={list(v[0].keys())[:10]}") - elif isinstance(data, list) and data: - print(f"Array of {len(data)} items") - if isinstance(data[0], dict): - print(f" [0] keys: {list(data[0].keys())[:15]}") - print(json.dumps(data, indent=2, default=str, ensure_ascii=False)[:2000]) - - # --- Extraction Method B: Parse rendered DOM --- - log.info("Extracting from rendered DOM ...") - - # Get full page content after hydration - content = page.content() - - # Find listing URLs - listing_urls = re.findall(r'href="(/for-sale/details/\d+/[^"]*)"', content) - log.info("Found %d listing detail links", len(listing_urls)) - - # Find prices - prices = re.findall(r'£([\d,]+)', content) - log.info("Found %d price strings", len(prices)) - if prices: - log.info("Prices: %s", prices[:10]) - - # Try to extract structured listing data from the page - listings = page.evaluate("""() => { - // Try to find listing cards via various selectors - const selectors = [ - '[data-testid="search-result"]', - '[data-testid="regular-listings"] > div', - 'a[href*="/for-sale/details/"]', - '[class*="ListingCard"]', - '[class*="listing-result"]', - ]; - - for (const sel of selectors) { - const elements = document.querySelectorAll(sel); - if (elements.length > 2) { - return { - selector: sel, - count: elements.length, - // Get text and href from first 3 - samples: Array.from(elements).slice(0, 3).map(el => ({ - text: el.innerText?.substring(0, 300), - href: el.href || el.querySelector('a')?.href || '', - html: el.outerHTML?.substring(0, 500), - })) - }; - } - } - - // Fallback: find all links to listing detail pages - const links = Array.from(document.querySelectorAll('a[href*="/details/"]')); - if (links.length > 0) { - return { - selector: 'a[href*="/details/"]', - count: links.length, - samples: links.slice(0, 5).map(el => ({ - text: el.innerText?.substring(0, 300), - href: el.href, - parentText: el.closest('div, li, article')?.innerText?.substring(0, 500) || '', - })) - }; - } - - // Last resort: get page structure - return { - selector: 'none', - count: 0, - bodyText: document.body?.innerText?.substring(0, 2000), - title: document.title, - }; - }""") - - print(f"\n=== DOM Extraction Results ===") - print(json.dumps(listings, indent=2, ensure_ascii=False)[:5000]) - - # Also extract cookies for potential reuse - cookies = context.cookies() - zoopla_cookies = {c["name"]: c["value"] for c in cookies if ".zoopla.co.uk" in c.get("domain", "")} + # Step 9: Extract cookies for potential curl_cffi reuse + cookies = page.context.cookies() + session_cookies = { + c["name"]: c["value"] + for c in cookies + if "zoopla" in c.get("domain", "") or "cf" in c.get("name", "").lower() + } ua = page.evaluate("navigator.userAgent") - print(f"\n=== Session Info ===") - print(f"Cookies ({len(zoopla_cookies)}): {list(zoopla_cookies.keys())}") - print(f"User-Agent: {ua}") - - if zoopla_cookies: - # Save cookies for reuse - print(f"\n=== Reusable cookie env vars ===") - for name, value in zoopla_cookies.items(): - print(f" {name}={value[:50]}...") - - # --- Try a detail page if we found any listing URLs --- - if listing_urls: - detail_path = listing_urls[0] - detail_url = f"{ZOOPLA_BASE}{detail_path}" - log.info("--- Fetching detail page: %s ---", detail_url) - time.sleep(2) - - page.goto(detail_url, wait_until="domcontentloaded", timeout=30000) - time.sleep(5) # Let it hydrate - - detail = page.evaluate("""() => { - const result = {}; - - // Price - const priceEl = document.querySelector('[data-testid="price"]') - || document.querySelector('[class*="price"]'); - result.price = priceEl?.innerText || ''; - - // Address - const addrEl = document.querySelector('[data-testid="address-label"]') - || document.querySelector('h1') || document.querySelector('address'); - result.address = addrEl?.innerText || ''; - - // Key features - const features = Array.from(document.querySelectorAll('[data-testid="listing_feature"] li, [class*="feature"] li')); - result.features = features.map(f => f.innerText).slice(0, 15); - - // Bedrooms/bathrooms from icons or text - const specs = document.querySelectorAll('[data-testid="beds-label"], [data-testid="baths-label"], [class*="bed"], [class*="bath"]'); - result.specs = Array.from(specs).map(s => s.innerText).slice(0, 5); - - // Description - const desc = document.querySelector('[data-testid="listing_description"], [class*="description"]'); - result.description = desc?.innerText?.substring(0, 500) || ''; - - // Agent - const agent = document.querySelector('[data-testid="agent-details"], [class*="agent"]'); - result.agent = agent?.innerText?.substring(0, 200) || ''; - - // Full page text summary - result.pageTitle = document.title; - result.bodyPreview = document.body?.innerText?.substring(0, 1000); - - return result; - }""") - - print(f"\n=== Detail Page Data ===") - print(json.dumps(detail, indent=2, ensure_ascii=False)[:3000]) - - browser.close() + return { + "url": final_url, + "title": final_title, + "total_results": total_results, + "listings": listings, + "cookies": session_cookies, + "user_agent": ua, + } def main(): - outcode = sys.argv[1] if len(sys.argv) > 1 else "E1" - channel = "BUY" - log.info("=== Zoopla Scraping Experiment (Playwright Stealth) ===") - log.info("Outcode: %s, Channel: %s", outcode, channel) - run_playwright_stealth(outcode, channel) - log.info("=== Done ===") + location = sys.argv[1] if len(sys.argv) > 1 else "London" + + result = scrape_zoopla(location, channel="BUY") + if not result: + log.error("Scraping failed") + sys.exit(1) + + listings = result["listings"] + print(f"\n{'='*60}") + print(f" Zoopla: {result['title']}") + print(f" URL: {result['url']}") + print(f" Total: {result['total_results']} results, {len(listings)} extracted") + print(f"{'='*60}\n") + + for i, listing in enumerate(listings): + print(f"--- Listing {i+1}: {listing['url']} ---") + display = {k: v for k, v in listing.items() if k != "text_preview" and v} + print(json.dumps(display, indent=2, ensure_ascii=False)) + print() + + # Summary stats + prices = [l["price"] for l in listings if l["price"]] + beds = [l["beds"] for l in listings if l["beds"]] + if prices: + print(f"Price range: £{min(prices):,} - £{max(prices):,}") + print(f"Median: £{sorted(prices)[len(prices)//2]:,}") + if beds: + print(f"Bedrooms: {min(beds)}-{max(beds)}") + + # Cookie info for reuse + print(f"\nSession cookies ({len(result['cookies'])} cookies)") + print(f"User-Agent: {result['user_agent']}") if __name__ == "__main__": diff --git a/server-rs/src/data/poi.rs b/server-rs/src/data/poi.rs index c909682..8488cfb 100644 --- a/server-rs/src/data/poi.rs +++ b/server-rs/src/data/poi.rs @@ -23,7 +23,7 @@ pub struct POIData { /// Byte offset into `id_buffer` where each row's ID starts. id_offsets: Vec, /// Length in bytes of each row's ID. - id_lengths: Vec, + id_lengths: Vec, pub group: InternedColumn, pub category: InternedColumn, pub name: Vec, @@ -101,7 +101,7 @@ impl POIData { let mut id_lengths = Vec::with_capacity(row_count); for s in &id_raw { let offset = id_buffer.len() as u32; - let length = s.len().min(u8::MAX as usize) as u8; + let length = s.len().min(u16::MAX as usize) as u16; id_offsets.push(offset); id_lengths.push(length); id_buffer.push_str(&s[..length as usize]); diff --git a/server-rs/src/data/postcodes.rs b/server-rs/src/data/postcodes.rs index 3a0def3..ba0a14e 100644 --- a/server-rs/src/data/postcodes.rs +++ b/server-rs/src/data/postcodes.rs @@ -128,6 +128,7 @@ impl PostcodeData { // Compute centroid across all vertices from all rings let total_vertices: usize = rings.iter().map(|ring| ring.len()).sum(); let centroid = if total_vertices == 0 { + tracing::warn!(postcode = %postcode, "Postcode polygon has zero vertices, defaulting centroid to (0,0)"); (0.0, 0.0) } else { let mut sum_lat: f32 = 0.0; diff --git a/server-rs/src/features.rs b/server-rs/src/features.rs index cc906fd..f51c456 100644 --- a/server-rs/src/features.rs +++ b/server-rs/src/features.rs @@ -68,9 +68,9 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ features: &[ FeatureConfig { name: "Last known price", - bounds: Bounds::Fixed { - min: 0.0, - max: 2_000_000.0, + bounds: Bounds::Percentile { + low: 0.0, + high: 98.0, }, step: 10000.0, description: "Most recent sale price from the Land Registry", @@ -79,15 +79,15 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ prefix: "£", suffix: "", raw: false, - absolute: true, + absolute: false, modes: &["historical"], linked: "", }, FeatureConfig { name: "Estimated current price", - bounds: Bounds::Fixed { - min: 0.0, - max: 2_000_000.0, + bounds: Bounds::Percentile { + low: 0.0, + high: 98.0, }, step: 10000.0, description: "Inflation-adjusted estimate of the current property value", @@ -96,7 +96,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ prefix: "£", suffix: "", raw: false, - absolute: true, + absolute: false, modes: &["historical"], linked: "Asking price", }, @@ -252,9 +252,9 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ }, FeatureConfig { name: "Asking price", - bounds: Bounds::Fixed { - min: 0.0, - max: 2_000_000.0, + bounds: Bounds::Percentile { + low: 0.0, + high: 98.0, }, step: 10000.0, description: "Listed asking price for properties currently for sale", @@ -263,15 +263,15 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ prefix: "£", suffix: "", raw: false, - absolute: true, + absolute: false, modes: &["buy"], linked: "Estimated current price", }, FeatureConfig { name: "Asking rent (monthly)", - bounds: Bounds::Fixed { - min: 0.0, - max: 10_000.0, + bounds: Bounds::Percentile { + low: 0.0, + high: 98.0, }, step: 50.0, description: "Listed monthly rent for properties currently for rent", @@ -280,7 +280,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ prefix: "£", suffix: "/mo", raw: false, - absolute: true, + absolute: false, modes: &["rent"], linked: "Estimated monthly rent", }, @@ -870,7 +870,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ min: 0.0, max: 100.0, }, - step: 1.0, + step: 0.1, description: "Percentage of population identifying as South Asian", detail: "From the 2021 Census. Percentage of the local authority population identifying as Indian, Pakistani, Bangladeshi, or any other Asian background.", source: "ethnicity", @@ -887,7 +887,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ min: 0.0, max: 100.0, }, - step: 1.0, + step: 0.1, description: "Percentage of population identifying as East Asian", detail: "From the 2021 Census. Percentage of the local authority population identifying as Chinese.", source: "ethnicity", @@ -904,7 +904,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ min: 0.0, max: 100.0, }, - step: 1.0, + step: 0.1, description: "Percentage of population identifying as Black", detail: "From the 2021 Census. Percentage of the local authority population identifying as Black, Black British, Caribbean, or African.", source: "ethnicity", @@ -921,7 +921,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ min: 0.0, max: 100.0, }, - step: 1.0, + step: 0.1, description: "Percentage of population identifying as Mixed or Multiple ethnic groups", detail: "From the 2021 Census. Percentage of the local authority population identifying as Mixed or Multiple ethnic groups (White and Black Caribbean, White and Black African, White and Asian, or any other Mixed or Multiple background).", source: "ethnicity", @@ -938,7 +938,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ min: 0.0, max: 100.0, }, - step: 1.0, + step: 0.1, description: "Percentage of population identifying as Other ethnic group", detail: "From the 2021 Census. Percentage of the local authority population identifying as Other ethnic group (Arab or any other ethnic group not covered by the main categories).", source: "ethnicity", diff --git a/server-rs/src/main.rs b/server-rs/src/main.rs index a8a3373..f9ed091 100644 --- a/server-rs/src/main.rs +++ b/server-rs/src/main.rs @@ -365,6 +365,7 @@ async fn main() -> anyhow::Result<()> { info!("Precomputed AI filters system prompt"); let token_cache = Arc::new(auth::TokenCache::new()); + let superuser_token_cache = Arc::new(pocketbase::SuperuserTokenCache::new()); let app_state = AppState { data: property_data, @@ -392,6 +393,7 @@ async fn main() -> anyhow::Result<()> { gemini_model: cli.gemini_model, travel_time_store, token_cache, + superuser_token_cache, ai_filters_system_prompt, google_maps_api_key: cli.google_maps_api_key, stripe_secret_key: cli.stripe_secret_key, diff --git a/server-rs/src/og_middleware.rs b/server-rs/src/og_middleware.rs index 81e4227..89d6ba8 100644 --- a/server-rs/src/og_middleware.rs +++ b/server-rs/src/og_middleware.rs @@ -65,6 +65,14 @@ pub async fn og_middleware(request: Request, next: Next) -> Response { format!("{}/api/screenshot?og=1&{}", state.public_url, query_string) }; + let og_url = if query_string.is_empty() { + format!("{}{}", state.public_url, path) + } else { + format!("{}{}?{}", state.public_url, path, query_string) + }; + + let og_logo = format!("{}/favicon.svg", state.public_url); + let (og_title, og_description) = if is_invite { ( "You\u{2019}re invited to Perfect Postcode", @@ -81,6 +89,8 @@ pub async fn og_middleware(request: Request, next: Next) -> Response { r#" + + diff --git a/server-rs/src/parsing/filters.rs b/server-rs/src/parsing/filters.rs index 7991693..04fde0e 100644 --- a/server-rs/src/parsing/filters.rs +++ b/server-rs/src/parsing/filters.rs @@ -54,16 +54,21 @@ pub fn parse_filters( // Check if this is an enum feature if let Some(values) = enum_values.get(&feat_idx) { // Enum filter: convert string values to u16 indices - let allowed: FxHashSet = rest - .split('|') - .filter_map(|value| { - let value = value.trim(); - values - .iter() - .position(|existing| existing == value) - .map(|position| position as u16) - }) - .collect(); + let mut allowed: FxHashSet = FxHashSet::default(); + for value in rest.split('|') { + let value = value.trim(); + match values.iter().position(|existing| existing == value) { + Some(position) => { + allowed.insert(position as u16); + } + None => { + return Err(format!( + "Unknown value '{}' for enum feature '{}'. Valid values: {:?}", + value, name, values + )); + } + } + } enums.push(ParsedEnumFilter { feat_idx, allowed }); } else { // Numeric filter: parse min:max and encode to u16 @@ -369,20 +374,16 @@ mod tests { } #[test] - fn parse_enum_with_unknown_value() { + fn parse_enum_with_unknown_value_errors() { let tq = test_quant(4, 2); - let (_numeric, enums) = parse_filters( + let result = parse_filters( Some("Type:Detached|Unknown|Flats/Maisonettes"), &extended_feature_map(), &extended_enum_values(), &tq.as_ref(), - ) - .unwrap(); - - assert_eq!(enums.len(), 1); - assert!(enums[0].allowed.contains(&0)); // Detached - assert!(enums[0].allowed.contains(&3)); // Flats/Maisonettes - assert_eq!(enums[0].allowed.len(), 2); + ); + assert!(result.is_err()); + assert!(result.unwrap_err().contains("Unknown value 'Unknown'")); } #[test] diff --git a/server-rs/src/pocketbase.rs b/server-rs/src/pocketbase.rs index 528018e..f99630a 100644 --- a/server-rs/src/pocketbase.rs +++ b/server-rs/src/pocketbase.rs @@ -1,13 +1,62 @@ use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use metrics::gauge; +use parking_lot::RwLock; use reqwest::Client; use serde::{Deserialize, Serialize}; use tracing::{info, warn}; use crate::state::AppState; +/// Cache TTL for the superuser token. PocketBase superuser JWTs are valid for +/// ~14 days by default, so 10 minutes is very conservative while eliminating +/// nearly all redundant auth requests (metrics poller, newsletter, invites, etc.). +const SUPERUSER_TOKEN_TTL_SECS: u64 = 600; + +pub struct SuperuserTokenCache { + token: RwLock>, +} + +impl SuperuserTokenCache { + pub fn new() -> Self { + Self { + token: RwLock::new(None), + } + } +} + +/// Get a cached superuser token, or authenticate fresh if expired/missing. +pub async fn get_superuser_token(state: &AppState) -> anyhow::Result { + // Check cache first (read lock — cheap, non-blocking for other readers) + { + let cached = state.superuser_token_cache.token.read(); + if let Some((token, created)) = cached.as_ref() { + if created.elapsed().as_secs() < SUPERUSER_TOKEN_TTL_SECS { + return Ok(token.clone()); + } + } + } + + // Cache miss or expired — fetch a fresh token + let pb_url = state.pocketbase_url.trim_end_matches('/'); + let token = auth_superuser( + &state.http_client, + pb_url, + &state.pocketbase_admin_email, + &state.pocketbase_admin_password, + ) + .await?; + + // Store in cache + { + let mut cached = state.superuser_token_cache.token.write(); + *cached = Some((token.clone(), Instant::now())); + } + + Ok(token) +} + #[derive(Deserialize)] struct AuthResponse { token: String, @@ -775,21 +824,14 @@ pub fn start_metrics_poller(shared: Arc) { } async fn poll_pocketbase_counts(state: &AppState) { - let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = match auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await - { + let token = match get_superuser_token(state).await { Ok(tk) => tk, Err(err) => { warn!("PocketBase metrics poll auth failed: {err}"); return; } }; + let pb_url = state.pocketbase_url.trim_end_matches('/'); // Simple collection counts for (collection, metric_name) in [ diff --git a/server-rs/src/routes/ai_filters.rs b/server-rs/src/routes/ai_filters.rs index d451676..e1e40f6 100644 --- a/server-rs/src/routes/ai_filters.rs +++ b/server-rs/src/routes/ai_filters.rs @@ -12,7 +12,7 @@ use tracing::{info, warn}; use crate::auth::OptionalUser; use crate::consts::{AI_FILTERS_MAX_TOKENS, AI_FILTERS_TEMPERATURE, AI_FILTERS_WEEKLY_TOKEN_LIMIT}; use crate::data::slugify; -use crate::pocketbase::auth_superuser; +use crate::pocketbase::get_superuser_token; use crate::routes::{FeatureInfo, FeaturesResponse}; use crate::state::{AppState, SharedState}; use crate::utils::gemini_chat; @@ -37,6 +37,8 @@ pub struct AiFiltersRequest { query: String, /// Current filters for conversational refinement (e.g. "make it cheaper") context: Option, + /// Current listing mode (historical/buy/rent). Defaults to "historical". + listing_type: Option, } #[derive(Serialize)] @@ -58,6 +60,8 @@ pub struct AiFiltersResponse { /// What the LLM couldn't map to existing filters (empty if everything matched) #[serde(skip_serializing_if = "String::is_empty")] notes: String, + /// The listing mode used for this response (historical/buy/rent) + listing_type: String, } /// Strip markdown code fences (```json ... ``` or ``` ... ```) from LLM output. @@ -268,6 +272,37 @@ pub fn build_system_prompt( modes_list, )); + // Listing modes section + parts.push( + "\n--- LISTING MODES ---\n\ + There are three listing modes that control which property data is shown:\n\ + - \"historical\": Historical sales from Land Registry (default). Uses features like \ + \"Last known price\", \"Estimated current price\", \"Price per sqm\".\n\ + - \"buy\": Properties currently listed for sale. Uses features like \"Asking price\", \ + \"Asking price per sqm\".\n\ + - \"rent\": Properties currently listed for rent. Uses features like \ + \"Asking rent (monthly)\".\n\ + \n\ + When the user mentions buying, purchasing, for-sale properties, or asking prices, \ + set listing_type to \"buy\".\n\ + When the user mentions renting, letting, rental properties, or monthly rent, \ + set listing_type to \"rent\".\n\ + When the user doesn't specify or mentions historical prices/past sales, \ + omit listing_type to keep the current mode.\n\ + \n\ + Features marked with [mode] below are only available in that mode. \ + Features without a mode annotation work in all modes. \ + ONLY use features valid for the chosen listing_type.\n\ + If the user mentions price and the mode is \"buy\", use \"Asking price\" (not \"Last known price\").\n\ + If the user mentions rent/price and the mode is \"rent\", use \"Asking rent (monthly)\".\n\ + \n\ + Feature equivalences across modes:\n\ + - \"Estimated current price\" (historical) ↔ \"Asking price\" (buy)\n\ + - \"Est. price per sqm\" (historical) ↔ \"Asking price per sqm\" (buy)\n\ + - \"Estimated monthly rent\" (historical) ↔ \"Asking rent (monthly)\" (rent)" + .to_string(), + ); + // Feature catalogue parts.push("\n--- AVAILABLE FEATURES ---\n".to_string()); for group in &features.groups { @@ -285,11 +320,17 @@ pub fn build_system_prompt( description, prefix, suffix, + modes, .. } => { + let mode_str = if modes.is_empty() { + String::new() + } else { + format!(" [{}]", modes.join("/")) + }; parts.push(format!( - "- \"{}\" (numeric, {}{:.0}{} to {}{:.0}{}): {}", - name, prefix, min, suffix, prefix, max, suffix, description + "- \"{}\"{} (numeric, {}{:.0}{} to {}{:.0}{}): {}", + name, mode_str, prefix, min, suffix, prefix, max, suffix, description )); } FeatureInfo::Enum { @@ -298,6 +339,10 @@ pub fn build_system_prompt( description, .. } => { + // Skip Listing status — handled via listing_type field + if name == "Listing status" { + continue; + } parts.push(format!( "- \"{}\" (enum, values: [{}]): {}", name, @@ -381,10 +426,37 @@ pub fn build_system_prompt( .to_string(), ); + // Examples showing listing mode switching + parts.push( + "\nUser: \"2 bed flat to rent under £1500/month\"\n\ + Output: {\"listing_type\": \"rent\", \ + \"numeric_filters\": [{\"name\": \"Asking rent (monthly)\", \"bound\": \"max\", \"value\": 1500}], \ + \"enum_filters\": [{\"name\": \"Property type\", \"values\": [\"Flats/Maisonettes\"]}], \ + \"travel_time_filters\": [], \ + \"notes\": \"\"}" + .to_string(), + ); + + parts.push( + "\nUser: \"3 bed house to buy under 500k with good schools\"\n\ + Output: {\"listing_type\": \"buy\", \ + \"numeric_filters\": [{\"name\": \"Asking price\", \"bound\": \"max\", \"value\": 500000}, \ + {\"name\": \"Good+ primary schools within 5km\", \"bound\": \"min\", \"value\": 5}], \ + \"enum_filters\": [{\"name\": \"Property type\", \ + \"values\": [\"Detached\", \"Semi-Detached\", \"Terraced\"]}], \ + \"travel_time_filters\": [], \ + \"notes\": \"\"}" + .to_string(), + ); + // Output format reminder parts.push( "\n--- OUTPUT FORMAT ---\n\ - {\"numeric_filters\": [...], \"enum_filters\": [...], \"travel_time_filters\": [{\"mode\": \"...\", \"slug\": \"...\", \"label\": \"...\", \"bound\": \"min\"|\"max\", \"value\": N}, ...], \"notes\": \"...\"}\n\ + {\"listing_type\": \"buy\"|\"rent\" (OPTIONAL — only when switching mode), \ + \"numeric_filters\": [...], \"enum_filters\": [...], \ + \"travel_time_filters\": [{\"mode\": \"...\", \"slug\": \"...\", \"label\": \"...\", \ + \"bound\": \"min\"|\"max\", \"value\": N}, ...], \"notes\": \"...\"}\n\ + - listing_type: include only when the user explicitly wants to buy or rent. Omit to keep current mode.\n\ - travel_time_filters: use ONLY slugs returned by search_destinations. If a place isn't found, mention it in notes.\n\ Respond with ONLY the JSON object. No explanation." .to_string(), @@ -409,19 +481,12 @@ async fn fetch_ai_usage( state: &AppState, user_id: &str, ) -> Result<(u64, u64), (StatusCode, String)> { - let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await - .map_err(|err| { + let token = get_superuser_token(state).await.map_err(|err| { warn!("Failed to auth superuser for AI usage check: {err}"); (StatusCode::BAD_GATEWAY, "Internal error".into()) })?; + let pb_url = state.pocketbase_url.trim_end_matches('/'); let url = format!("{pb_url}/api/collections/users/records/{user_id}"); let resp = state .http_client @@ -460,15 +525,7 @@ async fn fetch_ai_usage( /// Update the user's AI token usage in PocketBase. /// Best-effort — logs warnings on failure but does not propagate errors. async fn update_ai_usage(state: &AppState, user_id: &str, tokens_used: u64, week: u64) { - let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = match auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await - { + let token = match get_superuser_token(state).await { Ok(tk) => tk, Err(err) => { warn!("Failed to auth superuser for AI usage update: {err}"); @@ -476,6 +533,7 @@ async fn update_ai_usage(state: &AppState, user_id: &str, tokens_used: u64, week } }; + let pb_url = state.pocketbase_url.trim_end_matches('/'); let url = format!("{pb_url}/api/collections/users/records/{user_id}"); let res = state .http_client @@ -533,9 +591,17 @@ pub async fn post_ai_filters( let tools = build_tool_declarations(&state); - // Build user message with optional context for conversational refinement + // Resolve current listing mode from request + let current_mode = req.listing_type.as_deref().unwrap_or("historical"); + let current_mode = match current_mode { + "historical" | "buy" | "rent" => current_mode, + _ => "historical", + }; + + // Build user message with listing mode and optional context for conversational refinement let user_text = if let Some(ref ctx) = req.context { let mut msg = String::new(); + msg.push_str(&format!("Current listing mode: {}\n", current_mode)); msg.push_str("Currently active filters:\n"); msg.push_str(&serde_json::to_string(&ctx.filters).unwrap_or_default()); if !ctx.travel_time.is_empty() { @@ -553,7 +619,10 @@ pub async fn post_ai_filters( msg.push_str(&format!("\nUser request: {}", req.query)); msg } else { - req.query.clone() + format!( + "Current listing mode: {}\nUser request: {}", + current_mode, req.query + ) }; let mut contents = vec![json!({ @@ -679,7 +748,17 @@ pub async fn post_ai_filters( } }; - let filters = validate_and_convert(&raw, &state.features_response); + // Resolve listing_type: LLM output > request > "historical" + let listing_type = raw + .get("listing_type") + .and_then(|val| val.as_str()) + .unwrap_or(current_mode); + let listing_type = match listing_type { + "historical" | "buy" | "rent" => listing_type, + _ => current_mode, + }; + + let mut filters = validate_and_convert(&raw, &state.features_response, listing_type); let travel_time_filters = validate_travel_time_filters(&raw, &state); let notes = raw .get("notes") @@ -687,6 +766,16 @@ pub async fn post_ai_filters( .unwrap_or("") .to_string(); + // Auto-inject Listing status filter for the chosen mode + let listing_value = match listing_type { + "buy" => "For sale", + "rent" => "For rent", + _ => "Historical sale", + }; + if let Value::Object(ref mut map) = filters { + map.insert("Listing status".to_string(), json!([listing_value])); + } + // Update usage with total accumulated tokens let new_total = tokens_used + total_tokens_accumulated; update_ai_usage(&state, &user.id, new_total, current_week).await; @@ -698,6 +787,7 @@ pub async fn post_ai_filters( filters, travel_time_filters, notes, + listing_type: listing_type.to_string(), })); } @@ -787,10 +877,10 @@ fn validate_travel_time_filters(raw: &Value, state: &AppState) -> Vec Value { +fn validate_and_convert(raw: &Value, features: &FeaturesResponse, listing_type: &str) -> Value { let mut result = serde_json::Map::new(); - // Build lookup maps from feature metadata + // Build lookup maps from feature metadata, filtering by listing mode let mut numeric_features: rustc_hash::FxHashMap<&str, (f32, f32)> = rustc_hash::FxHashMap::default(); let mut enum_features: rustc_hash::FxHashMap<&str, &[String]> = @@ -799,11 +889,23 @@ fn validate_and_convert(raw: &Value, features: &FeaturesResponse) -> Value { for group in &features.groups { for feature in &group.features { match feature { - FeatureInfo::Numeric { name, min, max, .. } => { - numeric_features.insert(name, (*min, *max)); + FeatureInfo::Numeric { + name, + min, + max, + modes, + .. + } => { + // Only include features valid for the chosen listing mode + if modes.is_empty() || modes.contains(&listing_type) { + numeric_features.insert(name, (*min, *max)); + } } FeatureInfo::Enum { name, values, .. } => { - enum_features.insert(name, values); + // Skip Listing status — handled via auto-injection + if name != "Listing status" { + enum_features.insert(name, values); + } } } } diff --git a/server-rs/src/routes/checkout.rs b/server-rs/src/routes/checkout.rs index 2fda1b2..34bf338 100644 --- a/server-rs/src/routes/checkout.rs +++ b/server-rs/src/routes/checkout.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; use tracing::{info, warn}; use crate::auth::OptionalUser; -use crate::pocketbase::auth_superuser; +use crate::pocketbase::get_superuser_token; use crate::state::{AppState, SharedState}; use super::pricing::{count_licensed_users, price_for_count}; @@ -88,6 +88,8 @@ pub async fn post_checkout( state.stripe_referral_coupon_id.clone(), )); info!(code = %code, "Applying referral coupon to checkout"); + } else { + warn!(code = %code, "Referral code validation failed, proceeding without discount"); } } @@ -131,15 +133,9 @@ pub async fn post_checkout( /// Grant a license by updating the user's subscription to "licensed" in PocketBase. async fn grant_license(state: &AppState, user_id: &str) -> anyhow::Result<()> { - let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await?; + let token = get_superuser_token(state).await?; + let pb_url = state.pocketbase_url.trim_end_matches('/'); let url = format!("{pb_url}/api/collections/users/records/{user_id}"); let resp = state .http_client diff --git a/server-rs/src/routes/invites.rs b/server-rs/src/routes/invites.rs index 10d8324..f71056f 100644 --- a/server-rs/src/routes/invites.rs +++ b/server-rs/src/routes/invites.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; use tracing::{info, warn}; use crate::auth::OptionalUser; -use crate::pocketbase::auth_superuser; +use crate::pocketbase::get_superuser_token; use crate::state::SharedState; #[derive(Serialize)] @@ -118,14 +118,7 @@ pub async fn post_invites( let code = generate_invite_code(); let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = match auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await - { + let token = match get_superuser_token(&state).await { Ok(t) => t, Err(err) => { warn!("Failed to auth as PocketBase superuser: {err}"); @@ -202,14 +195,7 @@ pub async fn get_invite( let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = match auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await - { + let token = match get_superuser_token(&state).await { Ok(t) => t, Err(err) => { warn!("Failed to auth as PocketBase superuser: {err}"); @@ -325,14 +311,7 @@ pub async fn post_redeem_invite( let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = match auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await - { + let token = match get_superuser_token(&state).await { Ok(t) => t, Err(err) => { warn!("Failed to auth as PocketBase superuser: {err}"); @@ -500,14 +479,7 @@ pub async fn get_invites( let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = match auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await - { + let token = match get_superuser_token(&state).await { Ok(t) => t, Err(err) => { warn!("Failed to auth as PocketBase superuser: {err}"); diff --git a/server-rs/src/routes/newsletter.rs b/server-rs/src/routes/newsletter.rs index 28a71bc..19971de 100644 --- a/server-rs/src/routes/newsletter.rs +++ b/server-rs/src/routes/newsletter.rs @@ -8,7 +8,7 @@ use serde::Deserialize; use tracing::warn; use crate::auth::OptionalUser; -use crate::pocketbase::auth_superuser; +use crate::pocketbase::get_superuser_token; use crate::state::SharedState; #[derive(Deserialize)] @@ -27,16 +27,7 @@ pub async fn patch_newsletter( None => return StatusCode::UNAUTHORIZED.into_response(), }; - let pb_url = state.pocketbase_url.trim_end_matches('/'); - - let token = match auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await - { + let token = match get_superuser_token(&state).await { Ok(t) => t, Err(err) => { warn!("Failed to authenticate as PocketBase superuser: {err}"); @@ -44,6 +35,7 @@ pub async fn patch_newsletter( } }; + let pb_url = state.pocketbase_url.trim_end_matches('/'); let url = format!("{pb_url}/api/collections/users/records/{}", user.id); let res = state .http_client diff --git a/server-rs/src/routes/postcodes.rs b/server-rs/src/routes/postcodes.rs index 118948b..9332a10 100644 --- a/server-rs/src/routes/postcodes.rs +++ b/server-rs/src/routes/postcodes.rs @@ -281,7 +281,7 @@ pub async fn get_postcodes( histogram!("postcodes_response_count").record(features.len() as f64); - let truncated = features.len() > MAX_CELLS_PER_REQUEST; + let truncated = features.len() >= MAX_CELLS_PER_REQUEST; let t_total = t0.elapsed(); info!( postcodes_before_filter, diff --git a/server-rs/src/routes/pricing.rs b/server-rs/src/routes/pricing.rs index e7428c6..872ee24 100644 --- a/server-rs/src/routes/pricing.rs +++ b/server-rs/src/routes/pricing.rs @@ -7,7 +7,7 @@ use axum::Json; use serde::Serialize; use tracing::warn; -use crate::pocketbase::auth_superuser; +use crate::pocketbase::get_superuser_token; use crate::state::{AppState, SharedState}; /// Pricing tiers: (cumulative user cap, price in pence). @@ -45,15 +45,9 @@ pub fn price_for_count(count: u64) -> u64 { /// Count users with subscription="licensed" in PocketBase. pub async fn count_licensed_users(state: &AppState) -> anyhow::Result { - let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await?; + let token = get_superuser_token(state).await?; + let pb_url = state.pocketbase_url.trim_end_matches('/'); let filter = "subscription=\"licensed\""; let url = format!( "{pb_url}/api/collections/users/records?filter={}&perPage=1", diff --git a/server-rs/src/routes/reload.rs b/server-rs/src/routes/reload.rs index f8cd79d..0d12069 100644 --- a/server-rs/src/routes/reload.rs +++ b/server-rs/src/routes/reload.rs @@ -147,6 +147,7 @@ fn rebuild_data(shared: &SharedState, start: Instant) -> anyhow::Result<(usize, poi_category_groups: Arc::clone(&old.poi_category_groups), travel_time_store: Arc::clone(&old.travel_time_store), token_cache: Arc::clone(&old.token_cache), + superuser_token_cache: Arc::clone(&old.superuser_token_cache), // Config (cheap clone) screenshot_url: old.screenshot_url.clone(), diff --git a/server-rs/src/routes/shorten.rs b/server-rs/src/routes/shorten.rs index 17905c0..2d82bf1 100644 --- a/server-rs/src/routes/shorten.rs +++ b/server-rs/src/routes/shorten.rs @@ -8,7 +8,7 @@ use rand::Rng; use serde::{Deserialize, Serialize}; use tracing::warn; -use crate::pocketbase::auth_superuser; +use crate::pocketbase::get_superuser_token; use crate::state::SharedState; const CODE_LEN: usize = 8; @@ -42,14 +42,7 @@ pub async fn post_shorten(State(shared): State>, Json(req): Jso let state = shared.load_state(); let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = match auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await - { + let token = match get_superuser_token(&state).await { Ok(t) => t, Err(err) => { warn!("PocketBase superuser auth failed: {err}"); @@ -102,14 +95,7 @@ pub async fn get_short_url(State(shared): State>, Path(code): P let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = match auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await - { + let token = match get_superuser_token(&state).await { Ok(t) => t, Err(err) => { warn!("PocketBase superuser auth failed: {err}"); diff --git a/server-rs/src/routes/stripe_webhook.rs b/server-rs/src/routes/stripe_webhook.rs index 7ffe5fe..39da697 100644 --- a/server-rs/src/routes/stripe_webhook.rs +++ b/server-rs/src/routes/stripe_webhook.rs @@ -8,7 +8,7 @@ use hmac::{Hmac, Mac}; use sha2::Sha256; use tracing::{info, warn}; -use crate::pocketbase::auth_superuser; +use crate::pocketbase::get_superuser_token; use crate::state::SharedState; type HmacSha256 = Hmac; @@ -31,6 +31,19 @@ fn verify_signature(payload: &[u8], sig_header: &str, secret: &str) -> bool { _ => return false, }; + // Reject webhooks older than 5 minutes to prevent replay attacks + if let Ok(ts_secs) = ts.parse::() { + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() as i64; + if (now - ts_secs).abs() > 300 { + return false; + } + } else { + return false; + } + // Compute expected signature: HMAC-SHA256(secret, "TIMESTAMP.PAYLOAD") let signed_payload = format!("{ts}.{}", String::from_utf8_lossy(payload)); let mut mac = match HmacSha256::new_from_slice(secret.as_bytes()) { @@ -94,15 +107,7 @@ pub async fn post_stripe_webhook( } // Update user subscription to "licensed" via PocketBase superuser auth - let pb_url = state.pocketbase_url.trim_end_matches('/'); - let token = match auth_superuser( - &state.http_client, - pb_url, - &state.pocketbase_admin_email, - &state.pocketbase_admin_password, - ) - .await - { + let token = match get_superuser_token(&state).await { Ok(t) => t, Err(err) => { warn!("Failed to auth as PocketBase superuser in webhook: {err}"); @@ -110,6 +115,7 @@ pub async fn post_stripe_webhook( } }; + let pb_url = state.pocketbase_url.trim_end_matches('/'); let url = format!("{pb_url}/api/collections/users/records/{user_id}"); let res = state .http_client diff --git a/server-rs/src/state.rs b/server-rs/src/state.rs index 6b8bca8..45a383d 100644 --- a/server-rs/src/state.rs +++ b/server-rs/src/state.rs @@ -9,6 +9,7 @@ use crate::auth::TokenCache; use crate::data::{ POICategoryGroup, POIData, PlaceData, PostcodeData, PropertyData, TravelTimeStore, }; +use crate::pocketbase::SuperuserTokenCache; use crate::routes::FeaturesResponse; use crate::utils::GridIndex; @@ -44,6 +45,8 @@ pub struct AppState { pub travel_time_store: Arc, /// Token validation cache (60s TTL) pub token_cache: Arc, + /// Cached PocketBase superuser token (10min TTL) to avoid rate-limiting + pub superuser_token_cache: Arc, // --- Config (cheap to clone) --- /// URL of the screenshot service (e.g. http://screenshot:8002)