diff --git a/.forgejo/workflows/docker-publish.yml b/.forgejo/workflows/docker-publish.yml index 19542d8..c86ee42 100644 --- a/.forgejo/workflows/docker-publish.yml +++ b/.forgejo/workflows/docker-publish.yml @@ -39,8 +39,10 @@ jobs: host="127.0.0.1:13000" fi repo=$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]') + owner="${repo%%/*}" { echo "host=${host}" + echo "owner=${owner}" echo "image=${host}/${repo}" echo "screenshot_image=${host}/${repo}-screenshot" } >> "$GITHUB_OUTPUT" @@ -49,8 +51,8 @@ jobs: uses: https://github.com/docker/login-action@v3 with: registry: ${{ steps.registry.outputs.host }} - username: ${{ gitea.actor }} - password: ${{ secrets.GITEA_TOKEN }} + username: ${{ steps.registry.outputs.owner }} + password: ${{ secrets.FORGEJO_PACKAGE_TOKEN }} - name: Extract metadata (main) id: meta diff --git a/.gitignore b/.gitignore index 14ca6fe..013b7bc 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ **/dist server-rs/target .task +.tmp/ frontend/public/assets/* !frontend/public/assets/fonts/ !frontend/public/assets/fonts/** diff --git a/docker-compose.yml b/docker-compose.yml index 1817045..4079e21 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -29,8 +29,7 @@ services: - .:/app - cargo-home:/usr/local/cargo - cargo-target:/app/server-rs/target - - ./property-data:/app/data:ro - - ./property-data/travel-times:/app/data/travel-times:ro + - ./property-data2:/app/data:ro - ./finder/data:/app/finder-data:ro environment: POCKETBASE_URL: http://pocketbase:8090 @@ -51,7 +50,7 @@ services: BUGSINK_ENVIRONMENT: ${BUGSINK_ENVIRONMENT:-development} BUGSINK_RELEASE: ${BUGSINK_RELEASE:-} BUGSINK_SEND_DEFAULT_PII: ${BUGSINK_SEND_DEFAULT_PII:-false} - ACTUAL_LISTINGS_PATH: /app/finder-data/online_listings_buy_filtered.parquet + ACTUAL_LISTINGS_PATH: /app/finder-data/online_listings_buy.parquet depends_on: screenshot: condition: service_healthy diff --git a/finder/constants.py b/finder/constants.py index c456f62..a4870b3 100644 --- a/finder/constants.py +++ b/finder/constants.py @@ -20,11 +20,6 @@ TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead" SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search" RIGHTMOVE_BASE = "https://www.rightmove.co.uk" -# home.co.uk -HOMECOUK_BASE = "https://home.co.uk" -HOMECOUK_API_BASE = f"{HOMECOUK_BASE}/api" -HOMECOUK_PER_PAGE = 30 # max supported by the API - # Zoopla ZOOPLA_BASE = "https://www.zoopla.co.uk" @@ -108,13 +103,13 @@ PROPERTY_TYPE_MAP = { "House Boat": "Other", "Barn": "Other", "Serviced Apartments": "Other", - # Space-separated variants (from home.co.uk underscore/hyphen normalization) + # Space-separated variants from legacy provider normalization. "Semi Detached": "Semi-Detached", "Semi Detached Bungalow": "Semi-Detached", "End Of Terrace": "Terraced", "End Terrace": "Terraced", "Block Of Apartments": "Other", - # Lowercase variants (from home.co.uk / Rightmove APIs) + # Lowercase variants from listing APIs. "house": "Detached", "bungalow": "Other", "townhouse": "Terraced", diff --git a/finder/homecouk.py b/finder/homecouk.py deleted file mode 100644 index 60d421d..0000000 --- a/finder/homecouk.py +++ /dev/null @@ -1,461 +0,0 @@ -import json -import logging -import os -import random -import re -import time -from urllib.parse import unquote - -from curl_cffi.requests import Session -from curl_cffi.requests.errors import RequestsError - -from constants import ( - DELAY_BETWEEN_PAGES, - HOMECOUK_API_BASE, - HOMECOUK_BASE, - HOMECOUK_PER_PAGE, - MAX_BEDROOMS, - PROPERTY_TYPE_MAP, - RETRY_BASE_DELAY, -) -from spatial import PostcodeSpatialIndex -from transform import ( - normalize_postcode, - normalize_sub_type, - parse_int_value, - validate_floor_area, -) - -log = logging.getLogger("homecouk") - - -class CookiesExpiredError(Exception): - """Raised when home.co.uk returns 403, indicating cookies need refresh.""" - - -class PaginationError(Exception): - """Raised when home.co.uk pagination cannot be completed.""" - - -# Channel mapping: internal name → URL path segment -HOMECOUK_URL_SEGMENT = "for-sale" - - -def load_cookies() -> tuple[dict[str, str], str] | None: - """Get home.co.uk cookies + user-agent. - - Environment cookies are optional. When they are not present, bootstrap a - regular local session by visiting home.co.uk with curl_cffi's Chrome - impersonation and reusing the cookies set by the site. - """ - user_agent = os.environ.get( - "HOMECOUK_USER_AGENT", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/145.0.0.0 Safari/537.36", - ) - - env_cookies = { - name: value - for name, value in { - "cf_clearance": os.environ.get("HOMECOUK_CF_CLEARANCE", ""), - "homecouk_session": os.environ.get("HOMECOUK_SESSION", ""), - "XSRF-TOKEN": os.environ.get("HOMECOUK_XSRF_TOKEN", ""), - }.items() - if value - } - if env_cookies.get("homecouk_session"): - return env_cookies, user_agent - - session = Session(impersonate="chrome") - session.headers.update( - { - "User-Agent": user_agent, - "Accept": ( - "text/html,application/xhtml+xml,application/xml;q=0.9," - "*/*;q=0.8" - ), - } - ) - - for url in (HOMECOUK_BASE, f"{HOMECOUK_BASE}/for-sale/br1/"): - try: - response = session.get(url, timeout=30) - except RequestsError as exc: - log.warning("home.co.uk cookie bootstrap failed for %s: %s", url, exc) - continue - if response.status_code == 403: - raise CookiesExpiredError("home.co.uk returned HTTP 403 during bootstrap") - if response.status_code >= 400: - log.warning( - "home.co.uk cookie bootstrap got HTTP %d from %s", - response.status_code, - url, - ) - - cookies = session.cookies.get_dict() - if cookies.get("homecouk_session") and cookies.get("XSRF-TOKEN"): - log.info("home.co.uk local session bootstrapped") - return cookies, user_agent - - log.warning("home.co.uk did not provide session cookies during bootstrap") - return None - - -def make_client(cookies: dict[str, str], user_agent: str) -> Session: - """Create a curl_cffi Session configured for home.co.uk API calls. - Uses Chrome TLS impersonation so browser-derived cookies remain valid.""" - session = Session(impersonate="chrome") - session.headers.update( - { - "User-Agent": user_agent, - "Accept": "application/json, text/plain, */*", - "x-requested-with": "XMLHttpRequest", - } - ) - # Laravel CSRF: the XSRF-TOKEN cookie value must also be sent as the - # X-XSRF-TOKEN request header (URL-decoded). Without this header, the - # server rejects every request with 419/403. - xsrf = cookies.get("XSRF-TOKEN") - if xsrf: - session.headers["X-XSRF-TOKEN"] = unquote(xsrf) - for name, value in cookies.items(): - session.cookies.set(name, value, domain="home.co.uk") - return session - - -def fetch_page( - client: Session, url: str, params: dict, max_retries: int = 3 -) -> dict | None: - """GET JSON with retries on 429/5xx. Returns None on permanent failure. - 403 means cookies expired — raises CookiesExpiredError immediately.""" - for attempt in range(max_retries): - try: - resp = client.get(url, params=params, timeout=30) - if resp.status_code == 200: - try: - return resp.json() - except json.JSONDecodeError: - log.error( - "Non-JSON response from %s (got %s)", - url, - resp.headers.get("content-type", "?"), - ) - return None - if resp.status_code == 403: - raise CookiesExpiredError("HTTP 403 — cookies likely expired") - if resp.status_code in (429, 500, 502, 503, 504): - delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) - log.warning( - "HTTP %d from %s, retry %d/%d in %.1fs", - resp.status_code, - url, - attempt + 1, - max_retries, - delay, - ) - time.sleep(delay) - continue - log.error("HTTP %d from %s (non-retryable)", resp.status_code, url) - return None - except CookiesExpiredError: - raise - except RequestsError as e: - delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) - log.warning( - "%s from %s, retry %d/%d in %.1fs", - type(e).__name__, - url, - attempt + 1, - max_retries, - delay, - ) - time.sleep(delay) - log.error("All %d retries exhausted for %s", max_retries, url) - return None - - -def _coerce_positive_int(value) -> int | None: - parsed = parse_int_value(value) - if parsed is None or parsed <= 0: - return None - return parsed - - -def _property_identity(prop: dict, page: int, index: int) -> str: - for key in ("listing_id", "property_id", "id"): - value = prop.get(key) - if value: - return f"{key}:{value}" - return ( - f"page:{page}:index:{index}:" - f"{prop.get('display_address') or prop.get('address') or ''}:" - f"{prop.get('price') or prop.get('latest_price') or ''}" - ) - - -def parse_floor_area(description: str | None) -> float | None: - """Try to extract floor area from description text like '789 sq.ft.' or '73 sq.m.'.""" - if not description: - return None - m = re.search( - r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))", - description, - re.IGNORECASE, - ) - if m: - sqft = float(m.group(1).replace(",", "")) - return validate_floor_area(round(sqft * 0.092903, 1)) - m = re.search( - r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))", - description, - re.IGNORECASE, - ) - if m: - return validate_floor_area(round(float(m.group(1).replace(",", "")), 1)) - return None - - -def parse_tenure(prop: dict) -> str | None: - """Extract tenure from home.co.uk property data. - - Checks multiple sources in priority order: - 1. Dedicated 'tenure' or 'tenure_type' field in the API response - 2. Free-text search in the description for 'freehold' / 'leasehold' - 3. Free-text search in features lists - - home.co.uk aggregates listings from estate agents, so tenure is often - embedded in the description text rather than a structured field. - """ - # 1. Check dedicated tenure fields (in case the API adds them) - for key in ("tenure", "tenure_type", "tenureType"): - val = prop.get(key) - if val and isinstance(val, str): - lower = val.lower().strip() - if "leasehold" in lower: - return "Leasehold" - if "freehold" in lower: - return "Freehold" - - # 2. Check description text — estate agents often include tenure here - description = prop.get("description") or "" - if description: - lower_desc = description.lower() - if re.search(r"\bleasehold\b", lower_desc): - return "Leasehold" - if re.search(r"\bfreehold\b", lower_desc): - # Matches "Freehold" and "Share of Freehold" (both = freehold ownership) - return "Freehold" - - # 3. Check features / key_features lists if present - for key in ("features", "key_features", "keyFeatures"): - features = prop.get(key) - if features and isinstance(features, list): - for feat in features: - if not isinstance(feat, str): - continue - lower_feat = feat.lower() - if "leasehold" in lower_feat: - return "Leasehold" - if "freehold" in lower_feat: - return "Freehold" - - return None - - -def map_property_type(raw_type: str | None) -> str: - """Map home.co.uk property type to canonical type.""" - if not raw_type: - return "Other" - canonical = PROPERTY_TYPE_MAP.get(raw_type) - if canonical: - return canonical - # Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc. - # Try common patterns - lower = raw_type.lower() - excluded_flat_like = ( - "block of apartment", - "house of multiple occupation", - "private halls", - "retirement", - "serviced apartment", - ) - if any(term in lower for term in excluded_flat_like): - return "Other" - if ( - "flat" in lower - or "apartment" in lower - or "maisonette" in lower - or "studio" in lower - ): - return "Flats/Maisonettes" - if "detached" in lower and "semi" not in lower: - return "Detached" - if "semi" in lower: - return "Semi-Detached" - if "terrace" in lower or "mews" in lower: - return "Terraced" - log.debug("Unknown property type: %r — mapping to Other", raw_type) - return "Other" - - -def transform_property( - prop: dict, - pc_index: PostcodeSpatialIndex, -) -> dict | None: - """Transform a raw home.co.uk property dict into our output schema.""" - lat = prop.get("latitude") - lng = prop.get("longitude") - if lat is None or lng is None: - return None - - # Validate coordinates are in England - if not (49 <= lat <= 56 and -7 <= lng <= 2): - log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng) - return None - - price = parse_int_value(prop.get("price")) or parse_int_value( - prop.get("latest_price") - ) - if not price or price <= 0: - return None - - # Home.co.uk provides postcodes directly, but fall back to spatial index - postcode = prop.get("postcode") - if not postcode: - postcode = pc_index.nearest(lat, lng) - if not postcode: - log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng) - return None - - raw_beds = parse_int_value(prop.get("bedrooms")) or 0 - raw_baths = parse_int_value(prop.get("bathrooms")) or 0 - bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0 - bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0 - if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: - log.warning( - "home.co.uk %s: implausible beds=%d baths=%d (capped to 0)", - prop.get("listing_id") or prop.get("property_id") or "?", - raw_beds, raw_baths, - ) - - listing_type = prop.get("listing_property_type") or prop.get("property_type") or "" - address = prop.get("display_address") or prop.get("address") or "" - - # Derive price qualifier from reduction info - price_qualifier = "" - if prop.get("is_reduced"): - pct = prop.get("reduction_percent", 0) - if pct: - price_qualifier = f"Reduced by {pct}%" - else: - price_qualifier = "Reduced" - - listing_id = prop.get("listing_id") or prop.get("property_id") or "" - - return { - "id": f"hk_{listing_id}", # prefix to avoid collision with Rightmove IDs - "Bedrooms": bedrooms, - "Bathrooms": bathrooms, - "Number of bedrooms & living rooms": bedrooms + bathrooms, - "lon": lng, - "lat": lat, - "Postcode": normalize_postcode(postcode), - "Address per Property Register": address, - "Leasehold/Freehold": parse_tenure(prop), - "Property type": map_property_type(listing_type), - "Property sub-type": normalize_sub_type(listing_type), - "price": price, - "price_frequency": "", - "Price qualifier": price_qualifier, - "Total floor area (sqm)": parse_floor_area(prop.get("description")), - "Listing URL": f"{HOMECOUK_BASE}/property/{listing_id}", - "Listing features": [], # not available from home.co.uk - "first_visible_date": prop.get("added_date") or "", - } - - -def search_outcode( - client: Session, - outcode: str, - pc_index: PostcodeSpatialIndex, - max_properties: int | None = None, -) -> list[dict]: - """Paginate through sale search results for one outcode.""" - url_segment = HOMECOUK_URL_SEGMENT - url = f"{HOMECOUK_API_BASE}/{url_segment}/{outcode.lower()}/" - properties = [] - page = 1 - last_page: int | None = None - total_results: int | None = None - seen_ids: set[str] = set() - - while True: - params = { - "page": str(page), - "sort": "date_desc", - "per_page": str(HOMECOUK_PER_PAGE), - } - - # Set referer to match the page URL pattern - client.headers["referer"] = ( - f"https://home.co.uk/{url_segment}/{outcode.lower()}/" - f"?page={page}&sort=date_desc&per_page={HOMECOUK_PER_PAGE}" - ) - - data = fetch_page(client, url, params) - if not data: - raise PaginationError(f"home.co.uk {outcode} page {page} failed to load") - - pagination = data.get("pagination", {}) or {} - if last_page is None: - last_page = _coerce_positive_int(pagination.get("last_page")) - if total_results is None: - total_results = _coerce_positive_int(pagination.get("total")) - - raw_props = data.get("properties", []) - if not raw_props: - if total_results and page <= (last_page or page): - raise PaginationError( - f"home.co.uk {outcode} page {page} returned no properties " - f"before the advertised end" - ) - break - - page_ids = { - _property_identity(prop, page, idx) for idx, prop in enumerate(raw_props) - } - if page_ids and page_ids.issubset(seen_ids): - raise PaginationError( - f"home.co.uk {outcode} page {page} repeated previously seen results" - ) - seen_ids.update(page_ids) - - for prop in raw_props: - try: - transformed = transform_property(prop, pc_index) - except Exception as exc: - log.warning( - "home.co.uk %s property %s failed to transform: %s", - outcode, - prop.get("listing_id") or prop.get("property_id") or "?", - exc, - ) - continue - if transformed: - properties.append(transformed) - if max_properties is not None and len(properties) >= max_properties: - return properties - - if last_page is not None: - if page >= last_page: - break - elif total_results is not None and len(seen_ids) >= total_results: - break - elif len(raw_props) < HOMECOUK_PER_PAGE: - break - - page += 1 - time.sleep(DELAY_BETWEEN_PAGES) - - return properties diff --git a/finder/listing_filters.py b/finder/listing_filters.py deleted file mode 100644 index f1eca04..0000000 --- a/finder/listing_filters.py +++ /dev/null @@ -1,63 +0,0 @@ -"""Shared target filters for manual buy-listing scrapes.""" - -import math -from typing import Any - -BUY_MAX_PRICE = 1_000_000 -BUY_MIN_BEDROOMS = 2 -BUY_MAX_BEDROOMS = 5 -BUY_ALLOWED_BATHROOMS = frozenset({2, 3}) -BUY_MIN_FLOOR_AREA_SQM = 90.0 -BUY_MAX_FLOOR_AREA_SQM = 170.0 -BUY_PROPERTY_TYPES = frozenset({"Flats/Maisonettes"}) - -BUY_MIN_FLOOR_AREA_SQFT = round(BUY_MIN_FLOOR_AREA_SQM / 0.092903) -BUY_MAX_FLOOR_AREA_SQFT = round(BUY_MAX_FLOOR_AREA_SQM / 0.092903) - - -def _number(value: Any) -> float | None: - if value is None: - return None - try: - number = float(value) - except (TypeError, ValueError): - return None - if not math.isfinite(number): - return None - return number - - -def _int(value: Any) -> int | None: - number = _number(value) - if number is None or not number.is_integer(): - return None - return int(number) - - -def matches_strict_buy_listing_filter(prop: dict) -> bool: - """Exact filter used to guard scraped/output datasets.""" - if "price" in prop: - price = _number(prop.get("price")) - else: - price = _number(prop.get("Asking price")) - if price is None or price <= 0 or price >= BUY_MAX_PRICE: - return False - - bedrooms = _int(prop.get("Bedrooms")) - if bedrooms is None or ( - bedrooms < BUY_MIN_BEDROOMS or bedrooms > BUY_MAX_BEDROOMS - ): - return False - - property_type = prop.get("Property type") - if property_type not in BUY_PROPERTY_TYPES: - return False - - bathrooms = _int(prop.get("Bathrooms")) - if bathrooms not in BUY_ALLOWED_BATHROOMS: - return False - - floor_area = _number(prop.get("Total floor area (sqm)")) - if floor_area is None: - return False - return BUY_MIN_FLOOR_AREA_SQM <= floor_area <= BUY_MAX_FLOOR_AREA_SQM diff --git a/finder/main.py b/finder/main.py index ede5b1f..49fd755 100644 --- a/finder/main.py +++ b/finder/main.py @@ -5,10 +5,10 @@ import tempfile import time from pathlib import Path -from constants import DATA_DIR +from constants import DATA_DIR, REPO_DIR -SOURCE_CHOICES = ("rightmove", "homecouk", "zoopla", "all") +SOURCE_CHOICES = ("rightmove", "zoopla", "all") TEST_MAX_PROPERTIES_PER_SOURCE = 100 TEST_OUTCODES = ( "E1", @@ -28,14 +28,16 @@ log = logging.getLogger("finder") def configure_standalone_runtime() -> None: """Keep browser/cache/temp files on the project volume for local runs.""" - runtime_dir = DATA_DIR / ".runtime" + runtime_dir = REPO_DIR / ".tmp" / "finder" cache_dir = runtime_dir / "cache" temp_dir = runtime_dir / "tmp" cache_dir.mkdir(parents=True, exist_ok=True) temp_dir.mkdir(parents=True, exist_ok=True) - os.environ.setdefault("XDG_CACHE_HOME", str(cache_dir)) - os.environ.setdefault("TMPDIR", str(temp_dir)) + os.environ["XDG_CACHE_HOME"] = str(cache_dir) + os.environ["TMPDIR"] = str(temp_dir) + os.environ["TEMP"] = str(temp_dir) + os.environ["TMP"] = str(temp_dir) tempfile.tempdir = str(temp_dir) @@ -47,7 +49,7 @@ def parse_args() -> argparse.Namespace: "--source", choices=SOURCE_CHOICES, default="all", - help="Portal to scrape. 'all' runs Rightmove, home.co.uk, and Zoopla.", + help="Portal to scrape. 'all' runs Rightmove and Zoopla.", ) parser.add_argument( "--output-dir", @@ -89,7 +91,7 @@ def configure_logging() -> None: def selected_sources(source: str) -> list[str]: if source == "all": - return ["rightmove", "homecouk", "zoopla"] + return ["rightmove", "zoopla"] return [source] diff --git a/finder/pyproject.toml b/finder/pyproject.toml index 9023939..c183df8 100644 --- a/finder/pyproject.toml +++ b/finder/pyproject.toml @@ -4,7 +4,6 @@ version = "0.1.0" requires-python = ">=3.12" dependencies = [ "httpx", - "curl_cffi", "polars", "fake-useragent>=2.2.0", "playwright>=1.58.0", diff --git a/finder/rightmove.py b/finder/rightmove.py index 0a3d7a2..883c68a 100644 --- a/finder/rightmove.py +++ b/finder/rightmove.py @@ -10,15 +10,6 @@ from constants import ( TYPEAHEAD_URL, ) from http_client import fetch_with_retry -from listing_filters import ( - BUY_ALLOWED_BATHROOMS, - BUY_MAX_BEDROOMS, - BUY_MAX_FLOOR_AREA_SQFT, - BUY_MAX_PRICE, - BUY_MIN_BEDROOMS, - BUY_MIN_FLOOR_AREA_SQFT, - matches_strict_buy_listing_filter, -) from spatial import PostcodeSpatialIndex from transform import transform_property @@ -31,24 +22,6 @@ outcode_cache: dict[str, str] = {} # Requesting index >= 1008 returns HTTP 400. _MAX_INDEX = 1008 -_BASE_BUY_SEARCH_PARAMS = { - "propertyTypes": "flat", - "minBedrooms": str(BUY_MIN_BEDROOMS), - "maxBedrooms": str(BUY_MAX_BEDROOMS), - "minBathrooms": str(min(BUY_ALLOWED_BATHROOMS)), - "maxBathrooms": str(max(BUY_ALLOWED_BATHROOMS)), - "minSize": str(BUY_MIN_FLOOR_AREA_SQFT), - "maxSize": str(BUY_MAX_FLOOR_AREA_SQFT), - "maxPrice": str(BUY_MAX_PRICE - 1), -} - - -def _buy_search_params(extra_params: dict | None = None) -> dict: - params = dict(_BASE_BUY_SEARCH_PARAMS) - if extra_params: - params.update(extra_params) - return params - def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None: """Look up Rightmove's internal ID for an outcode via typeahead API.""" @@ -77,7 +50,6 @@ def _paginate( outcode: str, channel_cfg: dict, pc_index: PostcodeSpatialIndex, - extra_params: dict | None = None, max_properties: int | None = None, ) -> tuple[list[dict], int]: """Paginate through search results. Returns (properties, result_count).""" @@ -94,9 +66,6 @@ def _paginate( "channel": channel_cfg["channel"], "transactionType": channel_cfg["transactionType"], } - if extra_params: - params.update(extra_params) - data = fetch_with_retry(client, SEARCH_URL, params) if not data: log.warning( @@ -123,7 +92,7 @@ def _paginate( exc, ) continue - if transformed and matches_strict_buy_listing_filter(transformed): + if transformed: properties.append(transformed) if max_properties is not None and len(properties) >= max_properties: return properties, result_count @@ -137,7 +106,7 @@ def _paginate( break if index >= _MAX_INDEX: log.warning( - "%s/%s: %d filtered results exceed Rightmove's %d-result page cap", + "%s/%s: %d results exceed Rightmove's %d-result page cap", outcode, channel_cfg["channel"], result_count, @@ -158,18 +127,13 @@ def search_outcode( pc_index: PostcodeSpatialIndex, max_properties: int | None = None, ) -> list[dict]: - """Paginate through search results for one outcode+channel. Returns transformed properties. - - Search requests set the supported Rightmove filters directly: flats, - 2-5 bedrooms, 2-3 bathrooms, 969-1830 sq ft, and asking price below £1m. - """ + """Paginate through unfiltered sale results for one outcode+channel.""" properties, _ = _paginate( client, outcode_id, outcode, channel_cfg, pc_index, - extra_params=_buy_search_params(), max_properties=max_properties, ) diff --git a/finder/scraper.py b/finder/scraper.py index 14bbd87..5f8b568 100644 --- a/finder/scraper.py +++ b/finder/scraper.py @@ -14,12 +14,7 @@ from constants import ( LONDON_OUTCODE_PREFIXES, ) -from homecouk import CookiesExpiredError -from homecouk import load_cookies as load_homecouk_cookies -from homecouk import make_client as make_homecouk_client -from homecouk import search_outcode as homecouk_search_outcode from http_client import make_client -from listing_filters import matches_strict_buy_listing_filter from rightmove import resolve_outcode_id from rightmove import search_outcode as rightmove_search_outcode from spatial import PostcodeSpatialIndex @@ -30,7 +25,7 @@ from zoopla import search_outcode as zoopla_search_outcode log = logging.getLogger("rightmove") -SOURCE_ORDER = ("rightmove", "homecouk", "zoopla") +SOURCE_ORDER = ("rightmove", "zoopla") SALE_CHANNEL = CHANNELS[0] LONDON_AREAS = sorted({prefix.upper() for prefix in LONDON_OUTCODE_PREFIXES}) OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z0-9]?)") @@ -260,16 +255,7 @@ def _store_properties( dropped_outside_area, ) - eligible = [prop for prop in londonish if matches_strict_buy_listing_filter(prop)] - dropped_non_matching = len(londonish) - len(eligible) - if dropped_non_matching: - log.debug( - "%s dropped %d properties outside the strict buy-listing filters", - source, - dropped_non_matching, - ) - - selected = eligible if remaining is None else eligible[:remaining] + selected = londonish if remaining is None else londonish[:remaining] results[source].extend(selected) return len(selected) @@ -290,6 +276,8 @@ def _launch_zoopla_with_retries(attempts: int = 3): for attempt in range(1, attempts + 1): try: return launch_zoopla_browser() + except TurnstileError: + raise except Exception as exc: last_error = exc log.warning( @@ -304,13 +292,6 @@ def _launch_zoopla_with_retries(attempts: int = 3): raise last_error -def _new_homecouk_client(): - cookie_data = load_homecouk_cookies() - if not cookie_data: - return None - return make_homecouk_client(*cookie_data) - - def _scrape_rightmove( outcodes: list[str], pc_index: PostcodeSpatialIndex, @@ -368,74 +349,6 @@ def _scrape_rightmove( client.close() -def _scrape_homecouk( - outcodes: list[str], - pc_index: PostcodeSpatialIndex, - results: dict[str, list[dict]], - errors: list[str], - max_properties_per_source: int | None, -) -> None: - client = _new_homecouk_client() - if client is None: - log.warning("home.co.uk skipped: could not bootstrap a local session") - return - - try: - for outcode in outcodes: - if _source_remaining(results, "homecouk", max_properties_per_source) == 0: - log.info("home.co.uk cap reached") - return - - for attempt in range(2): - try: - # home.co.uk cannot express the full filter set at source. - # Fetch the outcode page set first; _store_properties applies - # the strict filter and source cap after transformation. - props = homecouk_search_outcode( - client, - outcode, - pc_index, - max_properties=None, - ) - added = _store_properties( - results, - "homecouk", - props, - max_properties_per_source, - ) - log.info("home.co.uk %s: +%d", outcode, added) - break - except CookiesExpiredError as exc: - if attempt == 1: - _record_error(errors, "homecouk", outcode, exc) - break - - log.warning( - "home.co.uk cookies expired at %s; refreshing local session", - outcode, - ) - try: - client.close() - except Exception: - pass - client = _new_homecouk_client() - if client is None: - _record_error( - errors, - "homecouk", - outcode, - RuntimeError("could not refresh local session"), - ) - return - except Exception as exc: - _record_error(errors, "homecouk", outcode, exc) - break - - time.sleep(DELAY_BETWEEN_OUTCODES) - finally: - client.close() - - def _scrape_zoopla( outcodes: list[str], pc_index: PostcodeSpatialIndex, @@ -459,9 +372,8 @@ def _scrape_zoopla( for attempt in range(2): try: - # Zoopla source-side filters are unverified here. Fetch the - # outcode page set first; _store_properties applies the - # strict filter and source cap after transformation. + # Fetch the outcode page set first; _store_properties applies + # the London-ish postcode filter and source cap after transformation. props, _ = zoopla_search_outcode( page, outcode, @@ -539,15 +451,6 @@ def run_scrape( max_properties_per_source, ) - if "homecouk" in selected_sources: - _scrape_homecouk( - selected_outcodes, - pc_index, - results, - errors, - max_properties_per_source, - ) - if "zoopla" in selected_sources: if pc_coords is None: pc_coords = build_postcode_coords() @@ -567,20 +470,10 @@ def run_scrape( else: if output_path.exists(): output_path.unlink() - log.warning("No strict properties to write to %s", output_path) - - filtered = [prop for prop in merged if matches_strict_buy_listing_filter(prop)] - filtered_output_path = output_base / "online_listings_buy_filtered.parquet" - if filtered: - write_parquet(filtered, filtered_output_path) - else: - if filtered_output_path.exists(): - filtered_output_path.unlink() - log.warning("No strict-filtered properties to write to %s", filtered_output_path) + log.warning("No London-ish properties to write to %s", output_path) counts = { "total": len(merged), - "filtered_total": len(filtered), "deduped": deduped, "sources": source_counts, } @@ -588,9 +481,8 @@ def run_scrape( f"{source}:{source_counts[source]}" for source in SOURCE_ORDER ) log.info( - "Sale scrape complete: %d unique, %d strict-filtered (%s deduped:%d)", + "Sale scrape complete: %d unique (%s deduped:%d)", len(merged), - len(filtered), source_summary, deduped, ) @@ -603,7 +495,6 @@ def run_scrape( }, "counts": counts, "path": str(output_path), - "filtered_path": str(filtered_output_path), "errors": errors, "elapsed_seconds": round(time.time() - started_at, 3), } diff --git a/finder/transform.py b/finder/transform.py index a55fdad..e210bee 100644 --- a/finder/transform.py +++ b/finder/transform.py @@ -76,7 +76,7 @@ def normalize_sub_type(sub_type: str | None) -> str: """Normalize property sub-type for consistent storage. Fixes delimiter inconsistencies (underscores/hyphens → spaces) from - home.co.uk and truncates Zoopla description fragments that were + legacy listing data and truncates Zoopla description fragments that were accidentally captured as sub-types. """ if not sub_type: @@ -200,31 +200,13 @@ def transform_property( price_obj = prop.get("price", {}) amount = parse_int_value(price_obj.get("amount")) - if not amount: - return None - price = amount - if price <= 0: - return None + price = amount or 0 display_prices = price_obj.get("displayPrices", []) price_qualifier = ( display_prices[0].get("displayPriceQualifier", "") if display_prices else "" ) - # POA / Auction listings have unreliable prices — treat as no price - pq_lower = price_qualifier.lower() - non_comparable_price_terms = ( - "poa", - "auction", - "shared ownership", - "shared equity", - "part buy", - "part rent", - "from", - ) - if any(term in pq_lower for term in non_comparable_price_terms): - return None - sub_type = prop.get("propertySubType", "") raw_beds = parse_int_value(prop.get("bedrooms")) or 0 raw_baths = parse_int_value(prop.get("bathrooms")) or 0 diff --git a/finder/uv.lock b/finder/uv.lock index d3d31e9..553f48b 100644 --- a/finder/uv.lock +++ b/finder/uv.lock @@ -72,63 +72,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" }, ] -[[package]] -name = "cffi" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pycparser", marker = "implementation_name != 'PyPy'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" }, - { url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" }, - { url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" }, - { url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" }, - { url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" }, - { url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" }, - { url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" }, - { url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" }, - { url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" }, - { url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, upload-time = "2025-09-08T23:22:57.188Z" }, - { url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" }, - { url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" }, - { url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" }, - { url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" }, - { url = "https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" }, - { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" }, - { url = "https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" }, - { url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" }, - { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" }, - { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" }, - { url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" }, - { url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" }, - { url = "https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" }, - { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" }, - { url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" }, - { url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" }, - { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" }, - { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" }, - { url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" }, - { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" }, - { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" }, - { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" }, - { url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, upload-time = "2025-09-08T23:23:44.61Z" }, - { url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" }, - { url = "https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" }, - { url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" }, - { url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" }, - { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" }, - { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" }, - { url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" }, - { url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" }, - { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" }, - { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" }, - { url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" }, - { url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" }, - { url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" }, -] - [[package]] name = "charset-normalizer" version = "3.4.6" @@ -223,29 +166,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] -[[package]] -name = "curl-cffi" -version = "0.14.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "cffi" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9b/c9/0067d9a25ed4592b022d4558157fcdb6e123516083700786d38091688767/curl_cffi-0.14.0.tar.gz", hash = "sha256:5ffbc82e59f05008ec08ea432f0e535418823cda44178ee518906a54f27a5f0f", size = 162633, upload-time = "2025-12-16T03:25:07.931Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/aa/f0/0f21e9688eaac85e705537b3a87a5588d0cefb2f09d83e83e0e8be93aa99/curl_cffi-0.14.0-cp39-abi3-macosx_14_0_arm64.whl", hash = "sha256:e35e89c6a69872f9749d6d5fda642ed4fc159619329e99d577d0104c9aad5893", size = 3087277, upload-time = "2025-12-16T03:24:49.607Z" }, - { url = "https://files.pythonhosted.org/packages/ba/a3/0419bd48fce5b145cb6a2344c6ac17efa588f5b0061f212c88e0723da026/curl_cffi-0.14.0-cp39-abi3-macosx_15_0_x86_64.whl", hash = "sha256:5945478cd28ad7dfb5c54473bcfb6743ee1d66554d57951fdf8fc0e7d8cf4e45", size = 5804650, upload-time = "2025-12-16T03:24:51.518Z" }, - { url = "https://files.pythonhosted.org/packages/e2/07/a238dd062b7841b8caa2fa8a359eb997147ff3161288f0dd46654d898b4d/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c42e8fa3c667db9ccd2e696ee47adcd3cd5b0838d7282f3fc45f6c0ef3cfdfa7", size = 8231918, upload-time = "2025-12-16T03:24:52.862Z" }, - { url = "https://files.pythonhosted.org/packages/7c/d2/ce907c9b37b5caf76ac08db40cc4ce3d9f94c5500db68a195af3513eacbc/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:060fe2c99c41d3cb7f894de318ddf4b0301b08dca70453d769bd4e74b36b8483", size = 8654624, upload-time = "2025-12-16T03:24:54.579Z" }, - { url = "https://files.pythonhosted.org/packages/f2/ae/6256995b18c75e6ef76b30753a5109e786813aa79088b27c8eabb1ef85c9/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b158c41a25388690dd0d40b5bc38d1e0f512135f17fdb8029868cbc1993d2e5b", size = 8010654, upload-time = "2025-12-16T03:24:56.507Z" }, - { url = "https://files.pythonhosted.org/packages/fb/10/ff64249e516b103cb762e0a9dca3ee0f04cf25e2a1d5d9838e0f1273d071/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_i686.whl", hash = "sha256:1439fbef3500fb723333c826adf0efb0e2e5065a703fb5eccce637a2250db34a", size = 7781969, upload-time = "2025-12-16T03:24:57.885Z" }, - { url = "https://files.pythonhosted.org/packages/51/76/d6f7bb76c2d12811aa7ff16f5e17b678abdd1b357b9a8ac56310ceccabd5/curl_cffi-0.14.0-cp39-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e7176f2c2d22b542e3cf261072a81deb018cfa7688930f95dddef215caddb469", size = 7969133, upload-time = "2025-12-16T03:24:59.261Z" }, - { url = "https://files.pythonhosted.org/packages/23/7c/cca39c0ed4e1772613d3cba13091c0e9d3b89365e84b9bf9838259a3cd8f/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:03f21ade2d72978c2bb8670e9b6de5260e2755092b02d94b70b906813662998d", size = 9080167, upload-time = "2025-12-16T03:25:00.946Z" }, - { url = "https://files.pythonhosted.org/packages/75/03/a942d7119d3e8911094d157598ae0169b1c6ca1bd3f27d7991b279bcc45b/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:58ebf02de64ee5c95613209ddacb014c2d2f86298d7080c0a1c12ed876ee0690", size = 9520464, upload-time = "2025-12-16T03:25:02.922Z" }, - { url = "https://files.pythonhosted.org/packages/a2/77/78900e9b0833066d2274bda75cba426fdb4cef7fbf6a4f6a6ca447607bec/curl_cffi-0.14.0-cp39-abi3-win_amd64.whl", hash = "sha256:6e503f9a103f6ae7acfb3890c843b53ec030785a22ae7682a22cc43afb94123e", size = 1677416, upload-time = "2025-12-16T03:25:04.902Z" }, - { url = "https://files.pythonhosted.org/packages/5c/7c/d2ba86b0b3e1e2830bd94163d047de122c69a8df03c5c7c36326c456ad82/curl_cffi-0.14.0-cp39-abi3-win_arm64.whl", hash = "sha256:2eed50a969201605c863c4c31269dfc3e0da52916086ac54553cfa353022425c", size = 1425067, upload-time = "2025-12-16T03:25:06.454Z" }, -] - [[package]] name = "cython" version = "3.2.4" @@ -274,7 +194,6 @@ version = "0.1.0" source = { virtual = "." } dependencies = [ { name = "camoufox" }, - { name = "curl-cffi" }, { name = "fake-useragent" }, { name = "httpx" }, { name = "playwright" }, @@ -284,7 +203,6 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "camoufox", specifier = ">=0.4.11" }, - { name = "curl-cffi" }, { name = "fake-useragent", specifier = ">=2.2.0" }, { name = "httpx" }, { name = "playwright", specifier = ">=1.58.0" }, @@ -639,15 +557,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/eb/936f5eeae196e8c8aaabe5f7d98891be8a5bbc741d50ce5c60f55575ad29/polars_runtime_32-1.39.0-cp310-abi3-win_arm64.whl", hash = "sha256:d69abde5f148566860bbe910010847bd7791e72f7c8063a4d2c462246a33a72a", size = 41885761, upload-time = "2026-03-12T14:23:16.773Z" }, ] -[[package]] -name = "pycparser" -version = "3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" }, -] - [[package]] name = "pyee" version = "13.0.1" diff --git a/finder/zoopla.py b/finder/zoopla.py index e2a2363..8c88f18 100644 --- a/finder/zoopla.py +++ b/finder/zoopla.py @@ -1,8 +1,8 @@ """Zoopla (zoopla.co.uk) scraper — sale properties. Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which -blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth -patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably. +blocks non-browser HTTP clients and even Playwright with stealth patches. Only +Camoufox (an anti-fingerprinting Firefox fork) passes reliably. Zoopla uses Next.js App Router with React Server Components (RSC). Search result data is server-rendered in an RSC stream, not available via @@ -19,11 +19,20 @@ Architecture: """ import logging +import os import re +import sys import time +from pathlib import Path from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse -from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE +from constants import ( + DATA_DIR, + DELAY_BETWEEN_PAGES, + MAX_BEDROOMS, + PROPERTY_TYPE_MAP, + ZOOPLA_BASE, +) from spatial import PostcodeSpatialIndex from transform import normalize_sub_type, parse_int_value, validate_floor_area @@ -255,11 +264,120 @@ _DISMISS_COOKIES_JS = """() => { # --------------------------------------------------------------------------- +_FALSE_ENV_VALUES = {"0", "false", "no", "off"} +_TRUE_ENV_VALUES = {"1", "true", "yes", "on"} + + +def _env_bool_or_virtual(name: str, default: bool | str) -> bool | str: + raw = os.environ.get(name) + if raw is None: + return default + + value = raw.strip().lower() + if value == "virtual": + return "virtual" + if value in _TRUE_ENV_VALUES: + return True + if value in _FALSE_ENV_VALUES: + return False + raise ValueError( + f"{name} must be one of 1/0, true/false, yes/no, on/off, or virtual" + ) + + +def _visible_display_available() -> bool: + if sys.platform.startswith("linux"): + return bool(os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY")) + return True + + +def _zoopla_headless_mode() -> bool | str: + # Prefer a visible browser by default so Cloudflare can be completed by the + # person running the scrape. In display-less Linux shells, keep startup + # headless and fail fast with an actionable error if a challenge appears. + default: bool | str = not _visible_display_available() + return _env_bool_or_virtual("ZOOPLA_HEADLESS", default) + + +def _zoopla_profile_dir() -> Path: + raw = os.environ.get("ZOOPLA_PROFILE_DIR") + if raw: + return Path(raw).expanduser().resolve() + return (DATA_DIR / ".runtime" / "zoopla-profile").expanduser().resolve() + + +def _challenge_timeout_seconds() -> int: + raw = os.environ.get("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS") + if raw is None: + return 300 + try: + timeout = int(raw) + except ValueError as exc: + raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be an integer") from exc + if timeout < 1: + raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be greater than zero") + return timeout + + +def _is_turnstile_challenge(page) -> bool: + try: + if "just a moment" in page.title().lower(): + return True + except Exception: + pass + + try: + return bool( + page.query_selector( + 'iframe[src*="challenges.cloudflare.com"], ' + 'input[name="cf-turnstile-response"]' + ) + ) + except Exception: + return False + + +def _wait_for_turnstile(page, headless_mode: bool | str) -> None: + if not _is_turnstile_challenge(page): + return + + profile_dir = _zoopla_profile_dir() + if headless_mode is True or headless_mode == "virtual": + raise TurnstileError( + "Cloudflare Turnstile requires a visible browser session. " + "Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0; " + f"the solved session will be saved in {profile_dir}." + ) + + timeout = _challenge_timeout_seconds() + log.warning( + "Cloudflare Turnstile challenge shown. Complete it in the Zoopla browser " + "window; waiting up to %ds. Profile: %s", + timeout, + profile_dir, + ) + try: + page.bring_to_front() + except Exception: + pass + + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + time.sleep(3) + if not _is_turnstile_challenge(page): + log.info("Cloudflare challenge resolved") + return + + raise TurnstileError( + f"Cloudflare Turnstile was not completed after {timeout}s" + ) + + def launch_browser(): """Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile, and dismiss cookie consent. Returns (browser, page) tuple. - Raises TurnstileError if Cloudflare cannot be passed within two minutes. + Raises TurnstileError if Cloudflare cannot be completed. Caller must close browser when done.""" from camoufox.pkgman import camoufox_path @@ -269,61 +387,50 @@ def launch_browser(): from camoufox.sync_api import Camoufox - log.info("Launching Camoufox browser for Zoopla...") - camoufox = Camoufox(headless=True) + headless_mode = _zoopla_headless_mode() + profile_dir = _zoopla_profile_dir() + profile_dir.mkdir(parents=True, exist_ok=True) + + log.info( + "Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...", + headless_mode, + profile_dir, + ) + camoufox = Camoufox( + headless=headless_mode, + persistent_context=True, + user_data_dir=str(profile_dir), + locale=["en-GB", "en"], + enable_cache=True, + ) raw_browser = camoufox.__enter__() browser = _ManagedCamoufoxBrowser(camoufox, raw_browser) - page = browser.new_page() + page = raw_browser.pages[0] if raw_browser.pages else raw_browser.new_page() - log.info("Navigating to Zoopla homepage...") - page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000) + try: + log.info("Navigating to Zoopla homepage...") + page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000) + _wait_for_turnstile(page, headless_mode) - # Wait for Cloudflare Turnstile to resolve. - # Try clicking the Turnstile checkbox if present (helps in some cases). - for i in range(40): - if "Just a moment" not in page.title(): - break - # Attempt to click the Turnstile checkbox in the challenge iframe - for frame in page.frames: - if "challenges.cloudflare.com" in frame.url: - try: - iframe_el = page.query_selector('iframe[src*="challenges.cloudflare"]') - if iframe_el: - box = iframe_el.bounding_box() - if box: - page.mouse.click(box["x"] + 30, box["y"] + box["height"] / 2) - except Exception: - pass - break - time.sleep(3) - else: - page.close() - browser.close() - raise TurnstileError("Cloudflare Turnstile did not resolve after 120s") + log.info("Zoopla browser ready — title: %s", page.title()) + time.sleep(2) - log.info("Cloudflare passed — title: %s", page.title()) - time.sleep(2) - - # Dismiss cookie consent - page.evaluate(_DISMISS_COOKIES_JS) - time.sleep(1) + # Dismiss cookie consent + page.evaluate(_DISMISS_COOKIES_JS) + time.sleep(1) + except Exception: + try: + page.close() + finally: + browser.close() + raise return browser, page def _ensure_not_challenged(page) -> None: """Check if current page is a Cloudflare challenge and wait/raise.""" - if "Just a moment" not in page.title(): - return - - log.warning("Cloudflare challenge detected mid-session, waiting...") - for i in range(40): - time.sleep(3) - if "Just a moment" not in page.title(): - log.info("Cloudflare challenge resolved") - return - - raise TurnstileError("Cloudflare re-challenge did not resolve after 120s") + _wait_for_turnstile(page, _zoopla_headless_mode()) # --------------------------------------------------------------------------- @@ -704,9 +811,7 @@ def transform_property( Zoopla search cards do not include coordinates, so we resolve lat/lng from postcodes extracted from the address text.""" - price = parse_int_value(raw.get("price")) - if not price or price <= 0: - return None + price = parse_int_value(raw.get("price")) or 0 address = raw.get("address", "") @@ -856,7 +961,7 @@ def search_outcode( sample = raw_listings[0] if raw_listings else {} log.debug( "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform " - "(no price/postcode/coords). Sample raw: price=%s address=%r", + "(no postcode/coords). Sample raw: price=%s address=%r", outcode, "BUY", len(raw_listings), dropped, sample.get("price"), sample.get("address", ""), ) diff --git a/frontend/src/components/map/JourneyInstructions.tsx b/frontend/src/components/map/JourneyInstructions.tsx index 90da9bb..914c72f 100644 --- a/frontend/src/components/map/JourneyInstructions.tsx +++ b/frontend/src/components/map/JourneyInstructions.tsx @@ -68,6 +68,34 @@ const ROUTE_COLORS: Record = { }; const NON_TUBE_NAMES = new Set(['DLR', 'London Overground', 'Elizabeth line']); +const GOOGLE_MAPS_DEPARTURE_TIME_ZONE = 'Europe/London'; +const londonDateFormatter = new Intl.DateTimeFormat('en-GB', { + timeZone: GOOGLE_MAPS_DEPARTURE_TIME_ZONE, + year: 'numeric', + month: '2-digit', + day: '2-digit', +}); +const londonDateTimeFormatter = new Intl.DateTimeFormat('en-GB', { + timeZone: GOOGLE_MAPS_DEPARTURE_TIME_ZONE, + year: 'numeric', + month: '2-digit', + day: '2-digit', + hour: '2-digit', + minute: '2-digit', + second: '2-digit', + hour12: false, + hourCycle: 'h23', +}); + +function dateTimeParts(formatter: Intl.DateTimeFormat, date: Date): Record { + const parts: Record = {}; + formatter.formatToParts(date).forEach((part) => { + if (part.type !== 'literal') { + parts[part.type] = Number(part.value); + } + }); + return parts; +} /** Strip trailing parenthesized GTFS route IDs and NaPTAN stop codes (e.g. "(6757261)", "(9400ZZLUCGT1)") */ function stripId(label: string): string { @@ -87,15 +115,48 @@ function getRouteDisplay(mode: string): { label: string; color: string; darkText return { label: clean, color: '#6b7280', darkText: false }; } -/** Returns a Unix timestamp for the next Monday at 07:30 local time. */ +function londonOffsetMs(utcMs: number): number { + const parts = dateTimeParts(londonDateTimeFormatter, new Date(utcMs)); + const londonAsUtcMs = Date.UTC( + parts.year, + parts.month - 1, + parts.day, + parts.hour, + parts.minute, + parts.second + ); + return londonAsUtcMs - utcMs; +} + +function londonTimeToUtcMs( + year: number, + month: number, + day: number, + hour: number, + minute: number +): number { + const localAsUtcMs = Date.UTC(year, month - 1, day, hour, minute, 0, 0); + const offsetMs = londonOffsetMs(localAsUtcMs); + const utcMs = localAsUtcMs - offsetMs; + const correctedOffsetMs = londonOffsetMs(utcMs); + return correctedOffsetMs === offsetMs ? utcMs : localAsUtcMs - correctedOffsetMs; +} + +/** Returns a Unix timestamp for the next Monday at 07:30 Europe/London time. */ function nextMondayAt730(): number { const now = new Date(); - const day = now.getDay(); // 0=Sun … 6=Sat + const today = dateTimeParts(londonDateFormatter, now); + const day = new Date(Date.UTC(today.year, today.month - 1, today.day)).getUTCDay(); const daysUntil = day === 0 ? 1 : day === 1 ? 7 : 8 - day; - const monday = new Date(now); - monday.setDate(now.getDate() + daysUntil); - monday.setHours(7, 30, 0, 0); - return Math.floor(monday.getTime() / 1000); + const monday = new Date(Date.UTC(today.year, today.month - 1, today.day + daysUntil)); + const utcMs = londonTimeToUtcMs( + monday.getUTCFullYear(), + monday.getUTCMonth() + 1, + monday.getUTCDate(), + 7, + 30 + ); + return Math.floor(utcMs / 1000); } function googleMapsDestination( diff --git a/frontend/src/components/map/MapPage.tsx b/frontend/src/components/map/MapPage.tsx index bf45ace..7b6a8da 100644 --- a/frontend/src/components/map/MapPage.tsx +++ b/frontend/src/components/map/MapPage.tsx @@ -419,6 +419,7 @@ export default function MapPage({ const { listings: actualListings } = useActualListings(mapData.bounds, { filterParam: actualListingsFilterParam, travelParam: actualListingsTravelParam, + shareCode, }); const [isAreaGroupExpanded, toggleAreaGroup] = useCollapsibleGroups(true); diff --git a/frontend/src/hooks/useActualListings.ts b/frontend/src/hooks/useActualListings.ts index e8c056e..572e275 100644 --- a/frontend/src/hooks/useActualListings.ts +++ b/frontend/src/hooks/useActualListings.ts @@ -7,11 +7,12 @@ const DEBOUNCE_MS = 200; interface UseActualListingsOptions { filterParam?: string; travelParam?: string; + shareCode?: string; } export function useActualListings( bounds: Bounds | null, - { filterParam = '', travelParam = '' }: UseActualListingsOptions = {} + { filterParam = '', travelParam = '', shareCode = '' }: UseActualListingsOptions = {} ) { const [listings, setListings] = useState([]); const debounceRef = useRef | null>(null); @@ -38,11 +39,15 @@ export function useActualListings( const params = new URLSearchParams({ bounds: boundsStr }); if (filterParam) params.set('filters', filterParam); if (travelParam) params.set('travel', travelParam); + if (shareCode) params.set('share', shareCode); const res = await fetch( apiUrl('actual-listings', params), authHeaders({ signal: abortControllerRef.current.signal }) ); - if (!res.ok) throw new Error(`Actual listings fetch failed: HTTP ${res.status}`); + if (!res.ok) { + if (requestIdRef.current === requestId) setListings([]); + throw new Error(`Actual listings fetch failed: HTTP ${res.status}`); + } const json: ActualListingsResponse = await res.json(); if (requestIdRef.current !== requestId) return; setListings(json.listings || []); @@ -57,7 +62,7 @@ export function useActualListings( }; // listings intentionally excluded — it's internal state, not an input. // eslint-disable-next-line react-hooks/exhaustive-deps - }, [bounds, filterParam, travelParam]); + }, [bounds, filterParam, travelParam, shareCode]); return { listings }; } diff --git a/frontend/src/hooks/useDeckLayers.ts b/frontend/src/hooks/useDeckLayers.ts index f15cfc5..fe52717 100644 --- a/frontend/src/hooks/useDeckLayers.ts +++ b/frontend/src/hooks/useDeckLayers.ts @@ -109,9 +109,6 @@ export function useDeckLayers({ listings: actualListings, zoom, isDark, - hexagonData: data, - postcodeData, - usePostcodeView, }); // --- Refs for deck.gl accessors --- diff --git a/frontend/src/hooks/useListingLayers.ts b/frontend/src/hooks/useListingLayers.ts index 3e0f0b7..7d7c97b 100644 --- a/frontend/src/hooks/useListingLayers.ts +++ b/frontend/src/hooks/useListingLayers.ts @@ -1,9 +1,8 @@ import { useCallback, useMemo, useRef, useState } from 'react'; import type { Layer, PickingInfo } from '@deck.gl/core'; import { ScatterplotLayer, TextLayer } from '@deck.gl/layers'; -import { getResolution, latLngToCell } from 'h3-js'; -import type { ActualListing, HexagonData, PostcodeFeature } from '../types'; +import type { ActualListing } from '../types'; import { trackEvent } from '../lib/analytics'; const PRICE_LABEL_MIN_ZOOM = 14; @@ -19,14 +18,6 @@ interface UseListingLayersProps { listings: ActualListing[]; zoom: number; isDark: boolean; - hexagonData: HexagonData[]; - postcodeData: PostcodeFeature[]; - usePostcodeView: boolean; -} - -function normalizePostcode(value: string | undefined | null): string { - if (!value) return ''; - return value.replace(/\s+/g, '').toUpperCase(); } function formatShortPrice(price: number): string { @@ -35,57 +26,9 @@ function formatShortPrice(price: number): string { return `£${price}`; } -export function useListingLayers({ - listings, - zoom, - isDark, - hexagonData, - postcodeData, - usePostcodeView, -}: UseListingLayersProps) { +export function useListingLayers({ listings, zoom, isDark }: UseListingLayersProps) { const [popupInfo, setPopupInfo] = useState(null); - // Split into two memos so the inactive view's data changes don't invalidate - // the active filtered list. (e.g. in postcode view, hexagonData updates must - // not retrigger filtering / downstream layer rebuilds.) - const postcodeFilteredListings = useMemo(() => { - if (!usePostcodeView || listings.length === 0) return null; - const allowed = new Set(); - for (const feature of postcodeData) { - if (feature.properties.count > 0) { - allowed.add(normalizePostcode(feature.properties.postcode)); - } - } - if (allowed.size === 0) return []; - return listings.filter((listing) => allowed.has(normalizePostcode(listing.postcode))); - }, [listings, postcodeData, usePostcodeView]); - - const hexFilteredListings = useMemo(() => { - if (usePostcodeView || listings.length === 0) return null; - const allowed = new Set(); - let cellResolution: number | null = null; - for (const cell of hexagonData) { - if (cell.count > 0) { - allowed.add(cell.h3); - if (cellResolution == null) cellResolution = getResolution(cell.h3); - } - } - if (allowed.size === 0 || cellResolution == null) return []; - const resolutionForLookup = cellResolution; - return listings.filter((listing) => { - try { - return allowed.has(latLngToCell(listing.lat, listing.lon, resolutionForLookup)); - } catch { - return false; - } - }); - }, [listings, hexagonData, usePostcodeView]); - - const visibleListings = useMemo(() => { - if (listings.length === 0) return listings; - return (usePostcodeView ? postcodeFilteredListings : hexFilteredListings) ?? []; - }, [listings, usePostcodeView, postcodeFilteredListings, hexFilteredListings]); - const handleHover = useCallback((info: PickingInfo) => { if (info.object && info.x !== undefined && info.y !== undefined) { setPopupInfo({ x: info.x, y: info.y, listing: info.object }); @@ -119,21 +62,21 @@ export function useListingLayers({ () => new ScatterplotLayer({ id: 'actual-listing-shadow', - data: visibleListings, + data: listings, getPosition: (d) => [d.lon, d.lat], getRadius: 8, radiusUnits: 'pixels', getFillColor: isDark ? [0, 0, 0, 80] : [0, 0, 0, 40], pickable: false, }), - [visibleListings, isDark] + [listings, isDark] ); const pinLayer = useMemo( () => new ScatterplotLayer({ id: 'actual-listing-pin', - data: visibleListings, + data: listings, getPosition: (d) => [d.lon, d.lat], getRadius: 7, radiusUnits: 'pixels', @@ -148,12 +91,12 @@ export function useListingLayers({ onHover: stableHover, onClick: stableClick, }), - [visibleListings, stableHover, stableClick] + [listings, stableHover, stableClick] ); const priceLabelLayer = useMemo(() => { if (zoom < PRICE_LABEL_MIN_ZOOM) return null; - const labeled = visibleListings.filter((l) => l.asking_price && l.asking_price > 0); + const labeled = listings.filter((l) => l.asking_price && l.asking_price > 0); return new TextLayer({ id: 'actual-listing-price', data: labeled, @@ -174,11 +117,11 @@ export function useListingLayers({ sizeMaxPixels: 14, pickable: false, }); - }, [visibleListings, zoom, isDark]); + }, [listings, zoom, isDark]); const detailLabelLayer = useMemo(() => { if (zoom < ADDRESS_LABEL_MIN_ZOOM) return null; - const labeled = visibleListings.filter((l) => l.address || l.bedrooms != null); + const labeled = listings.filter((l) => l.address || l.bedrooms != null); return new TextLayer({ id: 'actual-listing-detail', data: labeled, @@ -205,7 +148,7 @@ export function useListingLayers({ sizeMaxPixels: 12, pickable: false, }); - }, [visibleListings, zoom, isDark]); + }, [listings, zoom, isDark]); const listingLayers = useMemo(() => { const layers: Layer[] = [pinShadowLayer, pinLayer]; diff --git a/frontend/src/lib/consts.ts b/frontend/src/lib/consts.ts index aea4f33..59ed271 100644 --- a/frontend/src/lib/consts.ts +++ b/frontend/src/lib/consts.ts @@ -132,7 +132,6 @@ export const POI_GROUP_COLORS: Record = { export const POI_CATEGORY_LOGOS: Record = { Airport: '/assets/twemoji/2708.png', Aldi: '/assets/poi-icons/logos/aldi.svg', - 'Allendale Co-operative Society': '/assets/poi-icons/logos/coop.svg', Amazon: '/assets/poi-icons/brands_2024/amazon_fresh.svg', Asda: '/assets/poi-icons/logos/asda.svg', 'Asda Express': '/assets/poi-icons/logos/asda.svg', @@ -148,26 +147,18 @@ export const POI_CATEGORY_LOGOS: Record = { 'Bus stop': '/assets/twemoji/1f68f.png', 'Butcher & Fishmonger': '/assets/twemoji/1f969.png', Centra: '/assets/poi-icons/logos/centra.svg', - 'Central England Co-operative': '/assets/poi-icons/logos/coop.svg', - 'Chelmsford Star Co-operative Society': '/assets/poi-icons/logos/coop.svg', - 'Clydebank Co-operative': '/assets/poi-icons/logos/coop.svg', 'Co-op': '/assets/poi-icons/logos/coop.svg', - 'Coniston Co-operative Society': '/assets/poi-icons/logos/coop.svg', COOK: '/assets/poi-icons/brands_2024/cook.svg', 'Convenience Store': '/assets/twemoji/1f3ea.png', Costco: '/assets/poi-icons/logos/costco.svg', 'Deli & Specialty': '/assets/twemoji/1f9c6.png', 'Dunnes Stores': '/assets/poi-icons/brands_2024/dunnes_stores.svg', - 'East of England Co-operative': '/assets/poi-icons/logos/coop.svg', Farmfoods: '/assets/poi-icons/brands_2023/supermarkets/farmfoods.svg', Ferry: '/assets/twemoji/26f4.png', Greengrocer: '/assets/twemoji/1f96c.png', - 'Heart of England Co-operative': '/assets/poi-icons/logos/coop.svg', 'Heron Foods': '/assets/poi-icons/brands_2023/supermarkets/heron_foods.svg', Iceland: '/assets/poi-icons/brands_2024/iceland.svg', Lidl: '/assets/poi-icons/logos/lidl.svg', - 'Langdale Co-operative Society': '/assets/poi-icons/logos/coop.svg', - 'Lincolnshire Co-operative': '/assets/poi-icons/logos/coop.svg', Makro: '/assets/poi-icons/brands_2024/makro.svg', 'M&S': '/assets/poi-icons/brands_2024/mns.svg', 'M&S Clothing': '/assets/poi-icons/brands_2024/mns.svg', @@ -175,7 +166,6 @@ export const POI_CATEGORY_LOGOS: Record = { 'M&S Hospital': '/assets/poi-icons/brands_2024/mns.svg', 'M&S MSA': '/assets/poi-icons/brands_2024/mns.svg', 'M&S Outlet': '/assets/poi-icons/brands_2024/mns.svg', - 'Midcounties Co-operative': '/assets/poi-icons/logos/coop.svg', Morrisons: '/assets/poi-icons/logos/morrisons.svg', 'Morrisons Daily': '/assets/poi-icons/brands_2024/morrisons_daily.svg', 'Off-Licence': '/assets/twemoji/1f377.png', @@ -183,16 +173,12 @@ export const POI_CATEGORY_LOGOS: Record = { 'Rail station': '/assets/twemoji/1f686.png', "Sainsbury's": '/assets/poi-icons/logos/sainsburys.svg', "Sainsbury's Local": '/assets/poi-icons/brands_2024/sainsburys_local.svg', - 'Scottish Midland Co-operative': '/assets/poi-icons/logos/coop.svg', Spar: '/assets/poi-icons/logos/spar.svg', Supermarket: '/assets/twemoji/1f6d2.png', - 'Tamworth Co-operative Society': '/assets/poi-icons/logos/coop.svg', Tesco: '/assets/poi-icons/logos/tesco.svg', 'Tesco Express': '/assets/poi-icons/logos/tesco_express.svg', 'Tesco Extra': '/assets/poi-icons/logos/tesco_extra.svg', 'Taxi rank': '/assets/twemoji/1f695.png', - 'The Radstock Co-operative Society': '/assets/poi-icons/logos/coop.svg', - 'The Southern Co-operative': '/assets/poi-icons/logos/coop.svg', 'The Food Warehouse': '/assets/poi-icons/logos/the_food_warehouse.png', 'Tube station': '/assets/poi-icons/public_transport/london_tube.svg', Waitrose: '/assets/poi-icons/logos/waitrose.svg', diff --git a/pipeline/__init__.py b/pipeline/__init__.py index e69de29..3329b9d 100644 --- a/pipeline/__init__.py +++ b/pipeline/__init__.py @@ -0,0 +1,3 @@ +from .local_temp import configure_tempfile_defaults + +configure_tempfile_defaults() diff --git a/pipeline/download/arcgis.py b/pipeline/download/arcgis.py index 059f7de..418d9bf 100644 --- a/pipeline/download/arcgis.py +++ b/pipeline/download/arcgis.py @@ -3,6 +3,7 @@ import tempfile import polars as pl from pathlib import Path +from pipeline.local_temp import local_tmp_dir from pipeline.utils import download, extract_zip URL = "https://www.arcgis.com/sharing/rest/content/items/36b718ad00de49afb9ad364f8b815b9e/data" @@ -40,7 +41,7 @@ def main() -> None: ) args = parser.parse_args() - with tempfile.TemporaryDirectory() as cache_dir: + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir: download_path = Path(cache_dir) / "arcgis_data.zip" extract_path = Path(cache_dir) / "arcgis_extracted" diff --git a/pipeline/download/broadband.py b/pipeline/download/broadband.py index b0db77e..dda2098 100644 --- a/pipeline/download/broadband.py +++ b/pipeline/download/broadband.py @@ -7,6 +7,7 @@ from pathlib import Path import httpx +from pipeline.local_temp import local_tmp_dir from pipeline.utils import download, extract_zip # Ofcom Connected Nations 2025 - Fixed broadband performance (output area & local authority level) @@ -84,7 +85,7 @@ def main() -> None: ) args = parser.parse_args() - with tempfile.TemporaryDirectory() as cache_dir: + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir: cache = Path(cache_dir) zip_path = cache / "broadband_performance.zip" extract_dir = cache / "extracted" diff --git a/pipeline/download/deprivation_data.py b/pipeline/download/deprivation_data.py index b4ad3eb..068ade4 100644 --- a/pipeline/download/deprivation_data.py +++ b/pipeline/download/deprivation_data.py @@ -3,6 +3,7 @@ import tempfile import polars as pl from pathlib import Path +from pipeline.local_temp import local_tmp_dir from pipeline.utils import download URL = "https://assets.publishing.service.gov.uk/media/691ded34513046b952c500bd/File_5_IoD2025_Scores_for_the_Indices_of_Deprivation.xlsx" @@ -33,7 +34,7 @@ def main() -> None: ) args = parser.parse_args() - with tempfile.TemporaryDirectory() as cache_dir: + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir: xlsx_path = Path(cache_dir) / "IoD2025_Scores.xlsx" download(URL, xlsx_path, timeout=60) convert_to_parquet(xlsx_path, args.output) diff --git a/pipeline/download/geolytix_retail_points.py b/pipeline/download/geolytix_retail_points.py index 3520105..b6118cf 100644 --- a/pipeline/download/geolytix_retail_points.py +++ b/pipeline/download/geolytix_retail_points.py @@ -8,6 +8,7 @@ from zipfile import ZipFile import polars as pl +from pipeline.local_temp import local_tmp_dir from pipeline.utils.download import download @@ -70,7 +71,9 @@ def download_geolytix_retail_points(output_path: Path) -> None: """Download the GEOLYTIX ZIP, extract the latest CSV, and write parquet.""" output_path.parent.mkdir(parents=True, exist_ok=True) - with TemporaryDirectory(prefix="geolytix_retail_points_") as tmp: + with TemporaryDirectory( + prefix="geolytix_retail_points_", dir=local_tmp_dir() + ) as tmp: zip_path = Path(tmp) / "geolytix_retail_points.zip" download(GEOLYTIX_RETAIL_POINTS_URL, zip_path, timeout=300) df = read_latest_csv(zip_path) diff --git a/pipeline/download/noise.py b/pipeline/download/noise.py index e382f50..f2e85a3 100644 --- a/pipeline/download/noise.py +++ b/pipeline/download/noise.py @@ -31,6 +31,8 @@ from pyproj import Transformer from rasterio.transform import rowcol from scipy.ndimage import maximum_filter +from pipeline.local_temp import local_tmp_dir + # Noise sources: # (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles) # Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500 @@ -437,7 +439,7 @@ def main() -> None: result = postcodes.select("postcode") - with tempfile.TemporaryDirectory() as tmp: + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp: for ( label, col_name, diff --git a/pipeline/download/ofsted.py b/pipeline/download/ofsted.py index 7618bd8..62979c5 100644 --- a/pipeline/download/ofsted.py +++ b/pipeline/download/ofsted.py @@ -3,6 +3,7 @@ import tempfile import polars as pl from pathlib import Path +from pipeline.local_temp import local_tmp_dir from pipeline.utils import download # Management information - state-funded schools - latest inspections (as at 28 Feb 2026) @@ -36,7 +37,7 @@ def main() -> None: ) args = parser.parse_args() - with tempfile.TemporaryDirectory() as cache_dir: + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir: csv_path = Path(cache_dir) / "ofsted_latest_inspections.csv" download(URL, csv_path, timeout=60) convert_to_parquet(csv_path, args.output) diff --git a/pipeline/download/os_greenspace.py b/pipeline/download/os_greenspace.py index da2be08..734edf4 100644 --- a/pipeline/download/os_greenspace.py +++ b/pipeline/download/os_greenspace.py @@ -25,6 +25,7 @@ from pyproj import Transformer from shapely.errors import GEOSException from shapely.geometry import shape as to_shapely +from pipeline.local_temp import local_tmp_dir from pipeline.utils.download import download, extract_zip logger = logging.getLogger(__name__) @@ -171,7 +172,7 @@ def _read_site_centroids( def download_greenspace(output: Path) -> None: output.parent.mkdir(parents=True, exist_ok=True) - with tempfile.TemporaryDirectory() as cache_dir: + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir: zip_path = Path(cache_dir) / "greenspace.zip" extract_dir = Path(cache_dir) / "extracted" diff --git a/pipeline/download/pois.py b/pipeline/download/pois.py index c369375..5d1b0aa 100644 --- a/pipeline/download/pois.py +++ b/pipeline/download/pois.py @@ -11,6 +11,7 @@ from shapely.geometry import Point from shapely.wkb import loads as load_wkb from tqdm import tqdm +from pipeline.local_temp import local_tmp_dir from pipeline.utils.england_geometry import ( ENGLAND_BBOX_EAST, ENGLAND_BBOX_NORTH, @@ -184,7 +185,7 @@ def main() -> None: england_polygon = load_england_polygon(args.boundary) - tmp_dir = Path(mkdtemp(prefix="pois_")) + tmp_dir = Path(mkdtemp(prefix="pois_", dir=local_tmp_dir())) with tqdm( unit=" elements", unit_scale=True, diff --git a/pipeline/download/postcodes.py b/pipeline/download/postcodes.py index 28099e8..289562b 100644 --- a/pipeline/download/postcodes.py +++ b/pipeline/download/postcodes.py @@ -12,6 +12,7 @@ import tarfile import tempfile from pathlib import Path +from pipeline.local_temp import local_tmp_dir from pipeline.utils import download URL = "https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2" @@ -37,7 +38,7 @@ def main() -> None: ) args = parser.parse_args() - with tempfile.TemporaryDirectory() as cache_dir: + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir: cache = Path(cache_dir) archive_path = cache / "gb-postcodes-v5.tar.bz2" extract_dir = cache / "extracted" diff --git a/pipeline/download/price_paid.py b/pipeline/download/price_paid.py index a186636..6b44572 100644 --- a/pipeline/download/price_paid.py +++ b/pipeline/download/price_paid.py @@ -3,6 +3,7 @@ import tempfile import polars as pl from pathlib import Path +from pipeline.local_temp import local_tmp_dir from pipeline.utils import download URL = "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv" @@ -55,7 +56,7 @@ def main() -> None: ) args = parser.parse_args() - with tempfile.TemporaryDirectory() as cache_dir: + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir: csv_path = Path(cache_dir) / "price-paid-complete.csv" download(URL, csv_path) diff --git a/pipeline/download/rental_prices.py b/pipeline/download/rental_prices.py index b7f98d8..e9a8041 100644 --- a/pipeline/download/rental_prices.py +++ b/pipeline/download/rental_prices.py @@ -13,6 +13,7 @@ from pathlib import Path import polars as pl +from pipeline.local_temp import local_tmp_dir from pipeline.utils import download URL = "https://www.ons.gov.uk/file?uri=/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics/25march2026/priceindexofprivaterentsukmonthlypricestatistics.xlsx" @@ -114,7 +115,7 @@ def main() -> None: ) args = parser.parse_args() - with tempfile.TemporaryDirectory() as cache_dir: + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir: xlsx_path = Path(cache_dir) / "pipr_monthly.xlsx" download(URL, xlsx_path, timeout=120) convert_to_parquet(xlsx_path, args.output) diff --git a/pipeline/download/transit_network.py b/pipeline/download/transit_network.py index 4cbf5f8..a7d384b 100644 --- a/pipeline/download/transit_network.py +++ b/pipeline/download/transit_network.py @@ -36,6 +36,8 @@ from pathlib import Path from tqdm import tqdm +from pipeline.local_temp import local_tmp_dir + ENGLAND_PBF_URL = ( "https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf" ) @@ -164,7 +166,10 @@ def clean_gtfs(src: Path, dst: Path) -> None: ) tmp = tempfile.NamedTemporaryFile( - mode="wb", delete=False, suffix=".txt" + mode="wb", + delete=False, + suffix=".txt", + dir=local_tmp_dir(), ) tmp.write(header) @@ -388,7 +393,10 @@ def convert_high_freq_to_frequency_based( trip_id_idx = cols.index("trip_id") tmp = tempfile.NamedTemporaryFile( - mode="wb", delete=False, suffix=".txt" + mode="wb", + delete=False, + suffix=".txt", + dir=local_tmp_dir(), ) tmp.write(header) for line in f: @@ -408,7 +416,10 @@ def convert_high_freq_to_frequency_based( trip_id_idx = cols.index("trip_id") tmp = tempfile.NamedTemporaryFile( - mode="wb", delete=False, suffix=".txt" + mode="wb", + delete=False, + suffix=".txt", + dir=local_tmp_dir(), ) tmp.write(header) for line in f: @@ -451,8 +462,8 @@ def download_tfl_transxchange(raw_dir: Path) -> Path: def download_naptan() -> None: - """Download NaPTAN stops to /tmp/Stops.csv (needed by transxchange2gtfs).""" - dest = Path("/tmp/Stops.csv") + """Download NaPTAN stops to the local temp dir for transxchange2gtfs.""" + dest = local_tmp_dir() / "Stops.csv" if dest.exists(): print(f"NaPTAN Stops.csv already exists: {dest}") return @@ -661,7 +672,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None: ) tmp = tempfile.NamedTemporaryFile( - mode="wb", delete=False, suffix=".txt" + mode="wb", + delete=False, + suffix=".txt", + dir=local_tmp_dir(), ) tmp.write(header) @@ -718,7 +732,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None: lon_idx = cols.index("stop_lon") tmp = tempfile.NamedTemporaryFile( - mode="wb", delete=False, suffix=".txt" + mode="wb", + delete=False, + suffix=".txt", + dir=local_tmp_dir(), ) tmp.write(header) @@ -749,7 +766,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None: rt_idx = cols.index("route_type") tmp = tempfile.NamedTemporaryFile( - mode="wb", delete=False, suffix=".txt" + mode="wb", + delete=False, + suffix=".txt", + dir=local_tmp_dir(), ) tmp.write(header) @@ -774,7 +794,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None: trip_id_idx = cols.index("trip_id") tmp = tempfile.NamedTemporaryFile( - mode="wb", delete=False, suffix=".txt" + mode="wb", + delete=False, + suffix=".txt", + dir=local_tmp_dir(), ) tmp.write(header) @@ -797,7 +820,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None: end_idx = cols.index("end_date") tmp = tempfile.NamedTemporaryFile( - mode="wb", delete=False, suffix=".txt" + mode="wb", + delete=False, + suffix=".txt", + dir=local_tmp_dir(), ) tmp.write(header) diff --git a/pipeline/download/transxchange2gtfs_shim.js b/pipeline/download/transxchange2gtfs_shim.js index e1952e8..1c0a1c2 100644 --- a/pipeline/download/transxchange2gtfs_shim.js +++ b/pipeline/download/transxchange2gtfs_shim.js @@ -15,6 +15,16 @@ if (!pkgDirArg || converterArgs.length < 2) { } const pkgDir = path.resolve(pkgDirArg); +const defaultTmpDir = path.resolve(__dirname, "..", "..", ".tmp"); +const localTmpDir = + process.env.TMPDIR || process.env.TEMP || process.env.TMP || defaultTmpDir; +const stopsCsv = path.join(localTmpDir, "Stops.csv"); +const converterTmpPrefix = path.join(localTmpDir, "transxchange2gtfs_"); +const converterTmpPatch = + `static TMP = ${JSON.stringify(converterTmpPrefix)}` + + ` + process.pid + ${JSON.stringify(path.sep)};`; + +fs.mkdirSync(localTmpDir, { recursive: true }); function replaceOnce(relativePath, before, after) { const file = path.join(pkgDir, relativePath); @@ -37,6 +47,26 @@ function replaceOnce(relativePath, before, after) { // GTFS shapes are optional for R5 routing. Clear shape references and omit // shapes.txt so missing route geometry does not drop otherwise usable trips. function patchPackage() { + replaceOnce( + "dist/Container.js", + "static TMP = `/tmp/transxchange2gtfs_${process.pid}/`;", + converterTmpPatch, + ); + replaceOnce( + "dist/Container.js", + 'fs.existsSync("/tmp/Stops.csv")', + `fs.existsSync(${JSON.stringify(stopsCsv)})`, + ); + replaceOnce( + "dist/Container.js", + 'fs.createReadStream("/tmp/Stops.csv", "utf8")', + `fs.createReadStream(${JSON.stringify(stopsCsv)}, "utf8")`, + ); + replaceOnce( + "dist/converter/GetStopData.js", + 'fs.createWriteStream("/tmp/Stops.csv")', + `fs.createWriteStream(${JSON.stringify(stopsCsv)})`, + ); replaceOnce( "dist/transxchange/TransXChangeJourneyStream.js", "distanceSoFarM += routeLink.Distance;", diff --git a/pipeline/download/uprn_lookup.py b/pipeline/download/uprn_lookup.py index ce644fd..3f2c804 100644 --- a/pipeline/download/uprn_lookup.py +++ b/pipeline/download/uprn_lookup.py @@ -13,6 +13,7 @@ from pathlib import Path import polars as pl +from pipeline.local_temp import local_tmp_dir from pipeline.utils import download, extract_zip URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data" @@ -62,7 +63,7 @@ def main() -> None: ) args = parser.parse_args() - with tempfile.TemporaryDirectory() as cache_dir: + with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir: zip_path = Path(cache_dir) / "uprn_lookup.zip" extract_path = Path(cache_dir) / "uprn_extracted" diff --git a/pipeline/transform/join_epc_pp.py b/pipeline/transform/join_epc_pp.py index 98cfd45..7ceccce 100644 --- a/pipeline/transform/join_epc_pp.py +++ b/pipeline/transform/join_epc_pp.py @@ -10,6 +10,8 @@ import pyarrow as pa import pyarrow.csv as pa_csv import pyarrow.parquet as pq +from pipeline.local_temp import local_tmp_dir + from ..utils import ( fuzzy_join_on_postcode, normalize_address_key, @@ -192,7 +194,9 @@ def main(): ) args = parser.parse_args() - with tempfile.TemporaryDirectory(prefix="epc_certificates_") as tmpdir: + with tempfile.TemporaryDirectory( + prefix="epc_certificates_", dir=local_tmp_dir() + ) as tmpdir: _run(args.epc, args.price_paid, args.output, Path(tmpdir)) diff --git a/pipeline/transform/postcode_boundaries/uprn.py b/pipeline/transform/postcode_boundaries/uprn.py index ae78b36..a1f4bdc 100644 --- a/pipeline/transform/postcode_boundaries/uprn.py +++ b/pipeline/transform/postcode_boundaries/uprn.py @@ -3,6 +3,8 @@ from pathlib import Path import numpy as np import polars as pl +from pipeline.local_temp import local_tmp_dir + from .memory import release_memory @@ -17,7 +19,9 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int] print("Loading UPRN lookup...") # Sort via streaming sink to avoid polars doubling memory during in-memory sort - with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp: + with tempfile.NamedTemporaryFile( + suffix=".parquet", delete=False, dir=local_tmp_dir() + ) as tmp: tmp_path = Path(tmp.name) ( pl.scan_parquet(uprn_path) diff --git a/pipeline/transform/test_transform_poi.py b/pipeline/transform/test_transform_poi.py index ca9ad92..c9a067f 100644 --- a/pipeline/transform/test_transform_poi.py +++ b/pipeline/transform/test_transform_poi.py @@ -79,6 +79,39 @@ def test_transform_grocery_retail_points_keeps_fascia_icon_category(): ] +def test_transform_grocery_retail_points_merges_cooperative_societies(): + raw = pl.DataFrame( + { + "id": [101, 102, 103], + "retailer": [ + "Central England Co-operative", + "Lincolnshire Co-operative", + "The Southern Co-operative", + ], + "fascia": [ + "Central England Co-operative", + "The Co-operative Food", + None, + ], + "store_name": [ + "Central Co-op Test", + "Lincolnshire Co-op Test", + "Southern Co-op Test", + ], + "long_wgs": [-0.141, -0.142, -0.143], + "lat_wgs": [51.515, 51.516, 51.517], + } + ) + + pois = transform_grocery_retail_points(raw, min_chain_locations=1) + + assert pois.select("category", "icon_category").to_dicts() == [ + {"category": "Co-op", "icon_category": "Co-op"}, + {"category": "Co-op", "icon_category": "Co-op"}, + {"category": "Co-op", "icon_category": "Co-op"}, + ] + + def test_transform_grocery_retail_points_accepts_base_fascias(): raw = pl.DataFrame( { diff --git a/pipeline/transform/transform_poi.py b/pipeline/transform/transform_poi.py index 552bb75..617fc45 100644 --- a/pipeline/transform/transform_poi.py +++ b/pipeline/transform/transform_poi.py @@ -623,6 +623,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [ "shop/outpost", "shop/pawnbroker", "shop/photo", + "shop/photo_studio", "shop/plant_hire", "shop/printer_ink", "shop/printing", @@ -843,6 +844,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [ [ "healthcare/physiotherapist", "healthcare/podiatrist", + "healthcare/occupational_therapist", ], ), ( @@ -1171,7 +1173,6 @@ GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = { "Heron": "Heron Foods", "Marks and Spencer": "M&S", "Sainsburys": "Sainsbury's", - "The Co-operative Group": "Co-op", } @@ -1238,6 +1239,8 @@ def normalize_grocery_retailer(retailer: str | None) -> str: if retailer is None: return "" retailer = retailer.strip() + if retailer in COOP_RETAILERS: + return "Co-op" return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer) diff --git a/pipeline/utils/fuzzy_join.py b/pipeline/utils/fuzzy_join.py index 8ea3a96..2870739 100644 --- a/pipeline/utils/fuzzy_join.py +++ b/pipeline/utils/fuzzy_join.py @@ -9,6 +9,8 @@ import polars as pl from thefuzz import fuzz from tqdm import tqdm +from pipeline.local_temp import local_tmp_dir + _NUMBER_RE = re.compile(r"\d+") _POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$" MIN_FUZZY_SCORE = 60 @@ -57,7 +59,7 @@ def fuzzy_join_on_postcode( have null right columns. """ - tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_") + tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_", dir=local_tmp_dir()) left_path = Path(tmpdir) / "left.parquet" right_path = Path(tmpdir) / "right.parquet" diff --git a/property-data/.gitignore b/property-data/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/property-data/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/server-rs/src/data/actual_listings.rs b/server-rs/src/data/actual_listings.rs index f7d2c70..4b3289f 100644 --- a/server-rs/src/data/actual_listings.rs +++ b/server-rs/src/data/actual_listings.rs @@ -6,6 +6,8 @@ use polars::prelude::*; use serde::Serialize; use tracing::info; +use crate::consts::{NAN_U16, QUANT_SCALE}; +use crate::data::{PropertyData, QuantRef}; use crate::utils::{normalize_postcode, GridIndex, InternedColumn}; const GRID_CELL_SIZE: f32 = 0.01; @@ -52,15 +54,22 @@ pub struct ActualListingData { pub listing_status: InternedColumn, pub listing_date_iso: Vec>, pub features: Vec>, + /// Row-major feature matrix aligned with PropertyData::feature_names. + /// + /// Rows start from a best-effort address/postcode join to the historical property + /// dataset, then live listing fields such as asking price and property type are + /// overlaid where available. This lets the listings endpoint use the same filter + /// execution path as the property endpoints. + pub filter_feature_data: Vec, pub grid: GridIndex, } impl ActualListingData { - pub fn load(parquet_path: &Path) -> Result { - super::run_polars_io(|| Self::load_inner(parquet_path)) + pub fn load(parquet_path: &Path, property_data: &PropertyData) -> Result { + super::run_polars_io(|| Self::load_inner(parquet_path, Some(property_data))) } - fn load_inner(parquet_path: &Path) -> Result { + fn load_inner(parquet_path: &Path, property_data: Option<&PropertyData>) -> Result { info!("Loading actual listings from {:?}", parquet_path); let pl_path = PlRefPath::try_from_path(parquet_path) .context("Failed to normalize actual listings parquet path")?; @@ -99,6 +108,18 @@ impl ActualListingData { let price_qualifier = InternedColumn::build(&opt_to_string(&price_qualifier_raw)); let listing_status = InternedColumn::build(&opt_to_string(&listing_status_raw)); + let filter_feature_data = build_filter_feature_data( + property_data, + &postcode, + &address, + &property_type_raw, + &leasehold_freehold_raw, + &rooms_total, + &floor_area_sqm, + &asking_price, + &asking_price_per_sqm, + ); + let grid = GridIndex::build(&lat, &lon, GRID_CELL_SIZE); info!(rows = row_count, "Actual listings loaded"); @@ -122,6 +143,7 @@ impl ActualListingData { listing_status, listing_date_iso, features, + filter_feature_data, grid, }) } @@ -150,6 +172,201 @@ impl ActualListingData { } } +#[allow(clippy::too_many_arguments)] +fn build_filter_feature_data( + property_data: Option<&PropertyData>, + postcode: &[String], + address: &[Option], + property_type: &[Option], + leasehold_freehold: &[Option], + rooms_total: &[Option], + floor_area_sqm: &[Option], + asking_price: &[Option], + asking_price_per_sqm: &[Option], +) -> Vec { + let Some(property_data) = property_data else { + return Vec::new(); + }; + + let num_features = property_data.num_features; + let mut feature_data = vec![NAN_U16; postcode.len() * num_features]; + let mut joined_rows = 0usize; + + for (row, postcode_value) in postcode.iter().enumerate() { + let Some(address_value) = address[row] + .as_deref() + .map(str::trim) + .filter(|v| !v.is_empty()) + else { + continue; + }; + + let query = format!("{address_value} {postcode_value}"); + let Some(&property_row) = property_data.search_addresses(&query, 1).first() else { + continue; + }; + if property_data.postcode(property_row) != postcode_value { + continue; + } + + let dst = row * num_features; + let src = property_row * num_features; + feature_data[dst..dst + num_features] + .copy_from_slice(&property_data.feature_data[src..src + num_features]); + joined_rows += 1; + } + + let quant = property_data.quant_ref(); + overlay_numeric_feature( + &mut feature_data, + property_data, + &quant, + "Total floor area (sqm)", + floor_area_sqm.iter().copied(), + false, + ); + overlay_numeric_feature( + &mut feature_data, + property_data, + &quant, + "Number of bedrooms & living rooms", + rooms_total.iter().map(|value| value.map(|v| v as f32)), + false, + ); + overlay_numeric_feature( + &mut feature_data, + property_data, + &quant, + "Estimated current price", + asking_price.iter().map(|value| value.map(|v| v as f32)), + true, + ); + overlay_numeric_feature( + &mut feature_data, + property_data, + &quant, + "Last known price", + asking_price.iter().map(|value| value.map(|v| v as f32)), + true, + ); + overlay_numeric_feature( + &mut feature_data, + property_data, + &quant, + "Est. price per sqm", + asking_price_per_sqm.iter().copied(), + true, + ); + overlay_numeric_feature( + &mut feature_data, + property_data, + &quant, + "Price per sqm", + asking_price_per_sqm.iter().copied(), + true, + ); + overlay_enum_feature( + &mut feature_data, + property_data, + "Property type", + property_type.iter().map(Option::as_deref), + false, + ); + overlay_enum_feature( + &mut feature_data, + property_data, + "Leasehold/Freehold", + leasehold_freehold.iter().map(Option::as_deref), + false, + ); + + info!( + rows = postcode.len(), + joined_rows, "Actual listings joined to property feature matrix" + ); + + feature_data +} + +fn feature_index(property_data: &PropertyData, name: &str) -> Option { + property_data + .feature_names + .iter() + .position(|candidate| candidate == name) +} + +fn overlay_numeric_feature( + feature_data: &mut [u16], + property_data: &PropertyData, + quant: &QuantRef<'_>, + name: &str, + values: I, + clear_missing: bool, +) where + I: IntoIterator>, +{ + let Some(feat_idx) = feature_index(property_data, name) else { + return; + }; + if feat_idx >= property_data.num_numeric { + return; + } + + let num_features = property_data.num_features; + for (row, value) in values.into_iter().enumerate() { + let dst = row * num_features + feat_idx; + match value { + Some(value) => feature_data[dst] = encode_numeric_value(quant, feat_idx, value), + None if clear_missing => feature_data[dst] = NAN_U16, + None => {} + } + } +} + +fn overlay_enum_feature<'a, I>( + feature_data: &mut [u16], + property_data: &PropertyData, + name: &str, + values: I, + clear_missing: bool, +) where + I: IntoIterator>, +{ + let Some(feat_idx) = feature_index(property_data, name) else { + return; + }; + let Some(enum_values) = property_data.enum_values.get(&feat_idx) else { + return; + }; + + let num_features = property_data.num_features; + for (row, value) in values.into_iter().enumerate() { + let dst = row * num_features + feat_idx; + let encoded = value + .map(str::trim) + .filter(|text| !text.is_empty()) + .and_then(|text| enum_values.iter().position(|candidate| candidate == text)) + .map(|position| position as u16); + match encoded { + Some(value) => feature_data[dst] = value, + None if clear_missing => feature_data[dst] = NAN_U16, + None => {} + } + } +} + +fn encode_numeric_value(quant: &QuantRef<'_>, feat_idx: usize, value: f32) -> u16 { + if !value.is_finite() { + return NAN_U16; + } + let range = quant.quant_range[feat_idx]; + if range <= 0.0 { + return 0; + } + let normalized = (value - quant.quant_min[feat_idx]) / range; + (normalized * QUANT_SCALE).round().clamp(0.0, QUANT_SCALE) as u16 +} + fn opt_to_string(values: &[Option]) -> Vec { values .iter() @@ -311,7 +528,7 @@ mod tests { return; }; - let data = ActualListingData::load(&path).expect("listings load"); + let data = ActualListingData::load_inner(&path, None).expect("listings load"); assert!(!data.lat.is_empty()); assert_eq!(data.lat.len(), data.lon.len()); assert_eq!(data.lat.len(), data.postcode.len()); diff --git a/server-rs/src/data/poi.rs b/server-rs/src/data/poi.rs index 5968704..0e497ab 100644 --- a/server-rs/src/data/poi.rs +++ b/server-rs/src/data/poi.rs @@ -30,16 +30,6 @@ const GROCERY_DASHBOARD_CATEGORIES: &[&str] = &[ "Budgens", "Centra", "Co-op", - "Central England Co-operative", - "Chelmsford Star Co-operative Society", - "East of England Co-operative", - "Heart of England Co-operative", - "Lincolnshire Co-operative", - "Midcounties Co-operative", - "Scottish Midland Co-operative", - "Tamworth Co-operative Society", - "The Radstock Co-operative Society", - "The Southern Co-operative", "COOK", "Costco", "Dunnes Stores", @@ -104,10 +94,35 @@ fn add_category_filter_index( } } +fn canonical_poi_category(category: &str) -> &str { + match category { + "Allendale Co-operative Society" + | "Central England Co-operative" + | "Channel Islands Co-operative Society" + | "Chelmsford Star Co-operative Society" + | "Clydebank Co-operative" + | "Coniston Co-operative Society" + | "Co-op Food" + | "East of England Co-operative" + | "Heart of England Co-operative" + | "Langdale Co-operative Society" + | "Lincolnshire Co-operative" + | "Midcounties Co-operative" + | "Scottish Midland Co-operative" + | "Tamworth Co-operative Society" + | "The Co-operative Food" + | "The Co-operative Food PFS" + | "The Co-operative Group" + | "The Radstock Co-operative Society" + | "The Southern Co-operative" => "Co-op", + _ => category, + } +} + pub fn resolve_poi_category_filter(category_values: &[String], categories: &str) -> FxHashSet { let mut selected = FxHashSet::default(); for part in categories.split(',') { - let category = part.trim(); + let category = canonical_poi_category(part.trim()); if category.is_empty() { continue; } @@ -200,12 +215,18 @@ impl POIData { let id_raw: Vec = extract_str_col(&df, "id")?; let name = extract_str_col(&df, "name")?; - let category_raw = extract_str_col(&df, "category")?; + let category_raw: Vec = extract_str_col(&df, "category")? + .into_iter() + .map(|category| canonical_poi_category(&category).to_string()) + .collect(); let group_raw = extract_str_col(&df, "group")?; let lat = extract_f32_col(&df, "lat")?; let lng = extract_f32_col(&df, "lng")?; let emoji_raw = extract_str_col(&df, "emoji")?; - let icon_category_raw = extract_str_col(&df, "icon_category")?; + let icon_category_raw: Vec = extract_str_col(&df, "icon_category")? + .into_iter() + .map(|category| canonical_poi_category(&category).to_string()) + .collect(); // Pack POI IDs into a contiguous buffer let total_id_bytes: usize = id_raw.iter().map(|s| s.len()).sum(); @@ -351,4 +372,19 @@ mod tests { assert!(selected.is_empty()); } + + #[test] + fn coop_category_aliases_resolve_to_single_category() { + let values = vec!["Co-op".to_string(), "Tesco".to_string()]; + + let selected = resolve_poi_category_filter( + &values, + "Central England Co-operative,The Southern Co-operative", + ); + + assert!(selected.contains(&0)); + assert_eq!(selected.len(), 1); + assert_eq!(canonical_poi_category("Lincolnshire Co-operative"), "Co-op"); + assert_eq!(canonical_poi_category("Tesco"), "Tesco"); + } } diff --git a/server-rs/src/features.rs b/server-rs/src/features.rs index 975315b..fd51b6c 100644 --- a/server-rs/src/features.rs +++ b/server-rs/src/features.rs @@ -1014,22 +1014,6 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[ }, ]; -/// Feature names that describe an individual property (price, size, type, etc.) rather -/// than the surrounding area. Use this to skip filters that should not exclude live -/// listings on the map even though they hide aggregated property rows. -pub fn property_level_feature_names() -> Vec<&'static str> { - const PROPERTY_GROUPS: &[&str] = &["Properties", "Property prices"]; - FEATURE_GROUPS - .iter() - .filter(|group| PROPERTY_GROUPS.contains(&group.name)) - .flat_map(|group| group.features.iter()) - .map(|feature| match feature { - Feature::Numeric(c) => c.name, - Feature::Enum(c) => c.name, - }) - .collect() -} - /// Flat ordered list of all numeric feature names (follows group order). pub fn all_numeric_feature_names() -> Vec<&'static str> { FEATURE_GROUPS diff --git a/server-rs/src/main.rs b/server-rs/src/main.rs index 95dd7a8..f0e9018 100644 --- a/server-rs/src/main.rs +++ b/server-rs/src/main.rs @@ -541,7 +541,7 @@ async fn main() -> anyhow::Result<()> { bail!("Actual listings parquet not found: {}", path.display()); } info!("Loading actual listings from {}", path.display()); - let listings = data::ActualListingData::load(path)?; + let listings = data::ActualListingData::load(path, &property_data)?; trim_allocator("actual listings load"); info!(rows = listings.lat.len(), "Actual listings loaded"); Some(Arc::new(listings)) diff --git a/server-rs/src/routes/actual_listings.rs b/server-rs/src/routes/actual_listings.rs index 525cfa9..f4e1cb0 100644 --- a/server-rs/src/routes/actual_listings.rs +++ b/server-rs/src/routes/actual_listings.rs @@ -1,16 +1,20 @@ use std::sync::Arc; use axum::extract::{Query, State}; -use axum::response::Json; +use axum::response::{IntoResponse, Json, Response}; +use axum::Extension; use rustc_hash::FxHashSet; use serde::{Deserialize, Serialize}; use tracing::info; use crate::api_error::ApiError; +use crate::auth::OptionalUser; +use crate::consts::NAN_U16; use crate::data::ActualListing; -use crate::features::property_level_feature_names; +use crate::licensing::{check_license_bounds, resolve_share_code}; use crate::parsing::{ parse_filters_with_poi, require_bounds, row_passes_filters, row_passes_poi_filters, + ParsedEnumFilter, ParsedFilter, }; use crate::state::{AppState, SharedState}; @@ -25,6 +29,8 @@ pub struct ActualListingsParams { travel: Option, /// Number of results to skip. Defaults to 0. offset: Option, + /// Share-link code; grants bbox-scoped access for unlicensed users. + share: Option, } #[derive(Serialize)] @@ -35,10 +41,24 @@ pub struct ActualListingsResponse { pub truncated: bool, } +const LISTING_LEVEL_FILTER_FEATURES: &[&str] = &[ + "Property type", + "Leasehold/Freehold", + "Total floor area (sqm)", + "Number of bedrooms & living rooms", + "Estimated current price", + "Last known price", + "Est. price per sqm", + "Price per sqm", +]; + +const KEEP_UNKNOWN_LISTING_FILTER_FEATURES: &[&str] = &["Total floor area (sqm)"]; + pub async fn get_actual_listings( State(shared): State>, + Extension(user): Extension, Query(params): Query, -) -> Result, ApiError> { +) -> Result, Response> { let state = shared.load_state(); let offset = params.offset.unwrap_or(0); let Some(actual_listings) = state.actual_listings.clone() else { @@ -49,11 +69,15 @@ pub async fn get_actual_listings( truncated: false, })); }; - let (south, west, north, east) = require_bounds(params.bounds).map_err(ApiError::from)?; + let (south, west, north, east) = + require_bounds(params.bounds).map_err(IntoResponse::into_response)?; + + let share_bounds = resolve_share_code(&state, params.share.as_deref()).await; + check_license_bounds(&user.0, (south, west, north, east), share_bounds)?; let quant = state.data.quant_ref(); let poi_quant = state.data.poi_metrics.quant_ref(); - let (mut parsed_filters, mut parsed_enum_filters, parsed_poi_filters) = parse_filters_with_poi( + let (parsed_filters, parsed_enum_filters, parsed_poi_filters) = parse_filters_with_poi( params.filters.as_deref(), &state.feature_name_to_index, &state.data.enum_values, @@ -61,40 +85,38 @@ pub async fn get_actual_listings( &state.data.poi_metrics.name_to_index, &poi_quant, ) - .map_err(ApiError::BadRequest)?; + .map_err(|err| ApiError::BadRequest(err).into_response())?; - // Drop property-level filters (price, sqm, build year, beds, type, etc.) so they - // don't hide live listings — those are individual-property concerns the user can - // judge from the pin itself. We only keep area/postcode-level filters here. - let property_level_idxs: FxHashSet = property_level_feature_names() - .into_iter() - .filter_map(|name| state.feature_name_to_index.get(name).copied()) - .collect(); - parsed_filters.retain(|f| !property_level_idxs.contains(&f.feat_idx)); - parsed_enum_filters.retain(|f| !property_level_idxs.contains(&f.feat_idx)); + let travel_entries = parse_optional_travel(params.travel.as_deref()) + .map_err(|err| ApiError::BadRequest(err).into_response())?; - let travel_entries = - parse_optional_travel(params.travel.as_deref()).map_err(ApiError::BadRequest)?; + let listing_level_feature_idxs = listing_level_filter_feature_idxs(&state); + let keep_unknown_listing_filter_idxs = keep_unknown_listing_filter_feature_idxs(&state); + let (listing_filters, postcode_filters) = + split_numeric_filters(parsed_filters, &listing_level_feature_idxs); + let (listing_enum_filters, postcode_enum_filters) = + split_enum_filters(parsed_enum_filters, &listing_level_feature_idxs); - let has_area_filters = !parsed_filters.is_empty() - || !parsed_enum_filters.is_empty() + let has_postcode_filters = !postcode_filters.is_empty() + || !postcode_enum_filters.is_empty() || !parsed_poi_filters.is_empty() || !travel_entries.is_empty(); + let has_listing_filters = !listing_filters.is_empty() || !listing_enum_filters.is_empty(); let state_clone = state.clone(); let response = tokio::task::spawn_blocking(move || -> Result { let t0 = std::time::Instant::now(); - let passing_postcodes = if has_area_filters { + let passing_postcodes = if has_postcode_filters { Some(compute_passing_postcodes( &state_clone, south, west, north, east, - &parsed_filters, - &parsed_enum_filters, + &postcode_filters, + &postcode_enum_filters, &parsed_poi_filters, &travel_entries, )?) @@ -116,6 +138,18 @@ pub async fn get_actual_listings( return None; } } + if has_listing_filters + && !row_passes_listing_filters( + row, + &listing_filters, + &listing_enum_filters, + &actual_listings.filter_feature_data, + state_clone.data.num_features, + &keep_unknown_listing_filter_idxs, + ) + { + return None; + } Some(row) }) .collect(); @@ -142,7 +176,8 @@ pub async fn get_actual_listings( total = total_matching, total_in_bounds, offset, - filtered = passing_postcodes.is_some(), + postcode_filtered = passing_postcodes.is_some(), + listing_filtered = has_listing_filters, ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0), "GET /api/actual-listings" ); @@ -155,12 +190,82 @@ pub async fn get_actual_listings( }) }) .await - .map_err(|error| ApiError::Internal(error.to_string()))? - .map_err(ApiError::Internal)?; + .map_err(|error| ApiError::Internal(error.to_string()).into_response())? + .map_err(|err| ApiError::Internal(err).into_response())?; Ok(Json(response)) } +fn listing_level_filter_feature_idxs(state: &AppState) -> FxHashSet { + feature_idxs(state, LISTING_LEVEL_FILTER_FEATURES) +} + +fn keep_unknown_listing_filter_feature_idxs(state: &AppState) -> FxHashSet { + feature_idxs(state, KEEP_UNKNOWN_LISTING_FILTER_FEATURES) +} + +fn feature_idxs(state: &AppState, names: &[&str]) -> FxHashSet { + names + .iter() + .filter_map(|name| state.feature_name_to_index.get(*name).copied()) + .collect() +} + +fn split_numeric_filters( + filters: Vec, + listing_level_feature_idxs: &FxHashSet, +) -> (Vec, Vec) { + let mut listing_filters = Vec::new(); + let mut postcode_filters = Vec::new(); + for filter in filters { + if listing_level_feature_idxs.contains(&filter.feat_idx) { + listing_filters.push(filter); + } else { + postcode_filters.push(filter); + } + } + (listing_filters, postcode_filters) +} + +fn split_enum_filters( + filters: Vec, + listing_level_feature_idxs: &FxHashSet, +) -> (Vec, Vec) { + let mut listing_filters = Vec::new(); + let mut postcode_filters = Vec::new(); + for filter in filters { + if listing_level_feature_idxs.contains(&filter.feat_idx) { + listing_filters.push(filter); + } else { + postcode_filters.push(filter); + } + } + (listing_filters, postcode_filters) +} + +fn row_passes_listing_filters( + row: usize, + filters: &[ParsedFilter], + enum_filters: &[ParsedEnumFilter], + feature_data: &[u16], + num_features: usize, + keep_unknown_filter_idxs: &FxHashSet, +) -> bool { + let base = row * num_features; + + filters.iter().all(|filter| { + let raw = feature_data[base + filter.feat_idx]; + if raw == NAN_U16 { + keep_unknown_filter_idxs.contains(&filter.feat_idx) + } else { + raw >= filter.min_u16 && raw <= filter.max_u16 + } + }) && enum_filters.iter().all(|filter| { + let raw = feature_data[base + filter.feat_idx]; + raw != NAN_U16 && filter.allowed.contains(&raw) + }) +} + #[allow(clippy::too_many_arguments)] fn compute_passing_postcodes( state: &AppState, @@ -224,3 +329,111 @@ fn compute_passing_postcodes( Ok(passing) } + +#[cfg(test)] +mod tests { + use super::*; + + fn numeric_filter(feat_idx: usize) -> ParsedFilter { + ParsedFilter { + feat_idx, + min_u16: 0, + max_u16: 100, + } + } + + fn enum_filter(feat_idx: usize) -> ParsedEnumFilter { + ParsedEnumFilter { + feat_idx, + allowed: [0u16].into_iter().collect(), + } + } + + #[test] + fn splits_actual_listing_filters_by_listing_native_features() { + let listing_level_feature_idxs: FxHashSet = [1usize, 3].into_iter().collect(); + + let (listing_filters, postcode_filters) = split_numeric_filters( + vec![numeric_filter(0), numeric_filter(1), numeric_filter(3)], + &listing_level_feature_idxs, + ); + assert_eq!( + listing_filters + .iter() + .map(|filter| filter.feat_idx) + .collect::>(), + vec![1, 3] + ); + assert_eq!( + postcode_filters + .iter() + .map(|filter| filter.feat_idx) + .collect::>(), + vec![0] + ); + + let (listing_enum_filters, postcode_enum_filters) = split_enum_filters( + vec![enum_filter(2), enum_filter(3)], + &listing_level_feature_idxs, + ); + assert_eq!( + listing_enum_filters + .iter() + .map(|filter| filter.feat_idx) + .collect::>(), + vec![3] + ); + assert_eq!( + postcode_enum_filters + .iter() + .map(|filter| filter.feat_idx) + .collect::>(), + vec![2] + ); + } + + #[test] + fn listing_floor_area_filter_keeps_unknown_values() { + let floor_area_filter = ParsedFilter { + feat_idx: 0, + min_u16: 10, + max_u16: 20, + }; + let keep_unknown_filter_idxs: FxHashSet = [0usize].into_iter().collect(); + + assert!(row_passes_listing_filters( + 0, + &[floor_area_filter], + &[], + &[NAN_U16], + 1, + &keep_unknown_filter_idxs + )); + + assert!(!row_passes_listing_filters( + 0, + &[ParsedFilter { + feat_idx: 0, + min_u16: 10, + max_u16: 20, + }], + &[], + &[9], + 1, + &keep_unknown_filter_idxs + )); + + assert!(row_passes_listing_filters( + 0, + &[ParsedFilter { + feat_idx: 0, + min_u16: 10, + max_u16: 20, + }], + &[], + &[15], + 1, + &keep_unknown_filter_idxs + )); + } +}