All good
Some checks failed
CI / Check (push) Has been cancelled
Build and publish Docker image / build-and-push (push) Has been cancelled

This commit is contained in:
Andras Schmelczer 2026-05-18 21:20:10 +01:00
parent 6ea544a0f6
commit 6cc7288126
45 changed files with 929 additions and 1043 deletions

View file

@ -39,8 +39,10 @@ jobs:
host="127.0.0.1:13000"
fi
repo=$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]')
owner="${repo%%/*}"
{
echo "host=${host}"
echo "owner=${owner}"
echo "image=${host}/${repo}"
echo "screenshot_image=${host}/${repo}-screenshot"
} >> "$GITHUB_OUTPUT"
@ -49,8 +51,8 @@ jobs:
uses: https://github.com/docker/login-action@v3
with:
registry: ${{ steps.registry.outputs.host }}
username: ${{ gitea.actor }}
password: ${{ secrets.GITEA_TOKEN }}
username: ${{ steps.registry.outputs.owner }}
password: ${{ secrets.FORGEJO_PACKAGE_TOKEN }}
- name: Extract metadata (main)
id: meta

1
.gitignore vendored
View file

@ -5,6 +5,7 @@
**/dist
server-rs/target
.task
.tmp/
frontend/public/assets/*
!frontend/public/assets/fonts/
!frontend/public/assets/fonts/**

View file

@ -29,8 +29,7 @@ services:
- .:/app
- cargo-home:/usr/local/cargo
- cargo-target:/app/server-rs/target
- ./property-data:/app/data:ro
- ./property-data/travel-times:/app/data/travel-times:ro
- ./property-data2:/app/data:ro
- ./finder/data:/app/finder-data:ro
environment:
POCKETBASE_URL: http://pocketbase:8090
@ -51,7 +50,7 @@ services:
BUGSINK_ENVIRONMENT: ${BUGSINK_ENVIRONMENT:-development}
BUGSINK_RELEASE: ${BUGSINK_RELEASE:-}
BUGSINK_SEND_DEFAULT_PII: ${BUGSINK_SEND_DEFAULT_PII:-false}
ACTUAL_LISTINGS_PATH: /app/finder-data/online_listings_buy_filtered.parquet
ACTUAL_LISTINGS_PATH: /app/finder-data/online_listings_buy.parquet
depends_on:
screenshot:
condition: service_healthy

View file

@ -20,11 +20,6 @@ TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
# home.co.uk
HOMECOUK_BASE = "https://home.co.uk"
HOMECOUK_API_BASE = f"{HOMECOUK_BASE}/api"
HOMECOUK_PER_PAGE = 30 # max supported by the API
# Zoopla
ZOOPLA_BASE = "https://www.zoopla.co.uk"
@ -108,13 +103,13 @@ PROPERTY_TYPE_MAP = {
"House Boat": "Other",
"Barn": "Other",
"Serviced Apartments": "Other",
# Space-separated variants (from home.co.uk underscore/hyphen normalization)
# Space-separated variants from legacy provider normalization.
"Semi Detached": "Semi-Detached",
"Semi Detached Bungalow": "Semi-Detached",
"End Of Terrace": "Terraced",
"End Terrace": "Terraced",
"Block Of Apartments": "Other",
# Lowercase variants (from home.co.uk / Rightmove APIs)
# Lowercase variants from listing APIs.
"house": "Detached",
"bungalow": "Other",
"townhouse": "Terraced",

View file

@ -1,461 +0,0 @@
import json
import logging
import os
import random
import re
import time
from urllib.parse import unquote
from curl_cffi.requests import Session
from curl_cffi.requests.errors import RequestsError
from constants import (
DELAY_BETWEEN_PAGES,
HOMECOUK_API_BASE,
HOMECOUK_BASE,
HOMECOUK_PER_PAGE,
MAX_BEDROOMS,
PROPERTY_TYPE_MAP,
RETRY_BASE_DELAY,
)
from spatial import PostcodeSpatialIndex
from transform import (
normalize_postcode,
normalize_sub_type,
parse_int_value,
validate_floor_area,
)
log = logging.getLogger("homecouk")
class CookiesExpiredError(Exception):
    """Signals an HTTP 403 from home.co.uk, which this module treats as expired cookies."""
class PaginationError(Exception):
    """Signals that paging through home.co.uk search results could not be finished."""
# URL path segment used for sale-listing searches
HOMECOUK_URL_SEGMENT = "for-sale"
def load_cookies() -> tuple[dict[str, str], str] | None:
"""Get home.co.uk cookies + user-agent.
Returns a ``(cookies, user_agent)`` pair, or ``None`` when no session
cookies could be obtained.
Environment cookies are optional. When they are not present, bootstrap a
regular local session by visiting home.co.uk with curl_cffi's Chrome
impersonation and reusing the cookies set by the site.
Raises ``CookiesExpiredError`` if a bootstrap request is answered with
HTTP 403.
"""
# HOMECOUK_USER_AGENT overrides the built-in desktop Chrome user-agent string.
user_agent = os.environ.get(
"HOMECOUK_USER_AGENT",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/145.0.0.0 Safari/537.36",
)
# Keep only the cookie values that are actually set in the environment;
# unset/empty variables are dropped by the ``if value`` filter.
env_cookies = {
name: value
for name, value in {
"cf_clearance": os.environ.get("HOMECOUK_CF_CLEARANCE", ""),
"homecouk_session": os.environ.get("HOMECOUK_SESSION", ""),
"XSRF-TOKEN": os.environ.get("HOMECOUK_XSRF_TOKEN", ""),
}.items()
if value
}
# Environment cookies are used only when the session cookie itself is present.
if env_cookies.get("homecouk_session"):
return env_cookies, user_agent
# NOTE(review): this bootstrap session is never closed in this function —
# consider closing it once the cookies have been copied out.
session = Session(impersonate="chrome")
session.headers.update(
{
"User-Agent": user_agent,
"Accept": (
"text/html,application/xhtml+xml,application/xml;q=0.9,"
"*/*;q=0.8"
),
}
)
# Visit the home page and one search page so the site can set its cookies.
for url in (HOMECOUK_BASE, f"{HOMECOUK_BASE}/for-sale/br1/"):
try:
response = session.get(url, timeout=30)
except RequestsError as exc:
log.warning("home.co.uk cookie bootstrap failed for %s: %s", url, exc)
continue
# A 403 aborts the bootstrap instead of moving on to the next URL.
if response.status_code == 403:
raise CookiesExpiredError("home.co.uk returned HTTP 403 during bootstrap")
# Other 4xx/5xx responses are only logged.
if response.status_code >= 400:
log.warning(
"home.co.uk cookie bootstrap got HTTP %d from %s",
response.status_code,
url,
)
# NOTE(review): indentation is lost in this view, so it is unclear whether the
# cookie check below runs after every URL or once after the loop — confirm
# against the original file.
cookies = session.cookies.get_dict()
# Both the session cookie and the XSRF token are required for a usable session.
if cookies.get("homecouk_session") and cookies.get("XSRF-TOKEN"):
log.info("home.co.uk local session bootstrapped")
return cookies, user_agent
log.warning("home.co.uk did not provide session cookies during bootstrap")
return None
def make_client(cookies: dict[str, str], user_agent: str) -> Session:
    """Build a curl_cffi ``Session`` ready for home.co.uk API requests.

    The session impersonates Chrome (per the original note, so that
    browser-derived cookies remain valid), sends JSON/XHR-style headers, and
    carries every entry of ``cookies`` scoped to the ``home.co.uk`` domain.
    """
    client = Session(impersonate="chrome")

    request_headers = {
        "User-Agent": user_agent,
        "Accept": "application/json, text/plain, */*",
        "x-requested-with": "XMLHttpRequest",
    }
    client.headers.update(request_headers)

    # Laravel-style CSRF protection: the XSRF-TOKEN cookie must be echoed back,
    # URL-decoded, in the X-XSRF-TOKEN header, otherwise the server answers
    # every request with 419/403.
    csrf_token = cookies.get("XSRF-TOKEN")
    if csrf_token:
        client.headers["X-XSRF-TOKEN"] = unquote(csrf_token)

    for cookie_name in cookies:
        client.cookies.set(cookie_name, cookies[cookie_name], domain="home.co.uk")

    return client
def fetch_page(
    client: Session, url: str, params: dict, max_retries: int = 3
) -> dict | None:
    """GET ``url`` and return the decoded JSON body, retrying transient failures.

    HTTP 429/500/502/503/504 responses and ``RequestsError`` exceptions are
    retried with exponential backoff plus jitter, for at most ``max_retries``
    attempts in total.

    Returns:
        The parsed JSON payload, or ``None`` on permanent failure: a 200
        response whose body is not JSON, a non-retryable status code, or all
        attempts used up.

    Raises:
        CookiesExpiredError: immediately on HTTP 403, which means the cookies
            have expired; this is never retried.
    """
    retryable_statuses = (429, 500, 502, 503, 504)

    for attempt in range(max_retries):
        # CookiesExpiredError is not a RequestsError, so it propagates out of
        # this try block without needing an explicit re-raise clause.
        try:
            resp = client.get(url, params=params, timeout=30)
            if resp.status_code == 200:
                try:
                    return resp.json()
                except json.JSONDecodeError:
                    log.error(
                        "Non-JSON response from %s (got %s)",
                        url,
                        resp.headers.get("content-type", "?"),
                    )
                    return None
            if resp.status_code == 403:
                raise CookiesExpiredError("HTTP 403 — cookies likely expired")
            if resp.status_code not in retryable_statuses:
                log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
                return None
            failure = f"HTTP {resp.status_code}"
        except RequestsError as e:
            failure = type(e).__name__

        # No retry follows the last attempt, so do not announce one and do not
        # waste a full backoff sleep before giving up.
        if attempt + 1 >= max_retries:
            log.warning(
                "%s from %s on final attempt %d/%d",
                failure,
                url,
                attempt + 1,
                max_retries,
            )
            break

        delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
        log.warning(
            "%s from %s, retry %d/%d in %.1fs",
            failure,
            url,
            attempt + 1,
            max_retries,
            delay,
        )
        time.sleep(delay)

    log.error("All %d retries exhausted for %s", max_retries, url)
    return None
def _coerce_positive_int(value) -> int | None:
    """Parse ``value`` with ``parse_int_value``; return it only when strictly positive."""
    number = parse_int_value(value)
    rejected = number is None or number <= 0
    return None if rejected else number
def _property_identity(prop: dict, page: int, index: int) -> str:
for key in ("listing_id", "property_id", "id"):
value = prop.get(key)
if value:
return f"{key}:{value}"
return (
f"page:{page}:index:{index}:"
f"{prop.get('display_address') or prop.get('address') or ''}:"
f"{prop.get('price') or prop.get('latest_price') or ''}"
)
# A number followed by an imperial area unit ("789 sq.ft.", "1,200 square feet",
# "950 ft2"). The number must start with a digit so that stray punctuation such
# as ", sq ft" is not captured as an empty figure.
_FLOOR_AREA_SQFT_RE = re.compile(
    r"(\d[\d,]*(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))",
    re.IGNORECASE,
)
# The same for metric units ("73 sq.m.", "80 square metres", "65 m²").
_FLOOR_AREA_SQM_RE = re.compile(
    r"(\d[\d,]*(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))",
    re.IGNORECASE,
)
# Square metres per square foot.
_SQM_PER_SQFT = 0.092903


def parse_floor_area(description: str | None) -> float | None:
    """Extract a floor area in square metres from free-text like '789 sq.ft.' or '73 sq.m.'.

    Imperial figures are looked for first and converted to square metres;
    metric figures are used as they are. The value is rounded to one decimal
    place and passed through ``validate_floor_area``.

    Returns ``None`` when ``description`` is empty or holds no recognisable
    area figure.
    """
    if not description:
        return None

    imperial = _FLOOR_AREA_SQFT_RE.search(description)
    if imperial:
        sqft = float(imperial.group(1).replace(",", ""))
        return validate_floor_area(round(sqft * _SQM_PER_SQFT, 1))

    metric = _FLOOR_AREA_SQM_RE.search(description)
    if metric:
        sqm = float(metric.group(1).replace(",", ""))
        return validate_floor_area(round(sqm, 1))

    return None
def parse_tenure(prop: dict) -> str | None:
"""Extract tenure from home.co.uk property data.
Checks multiple sources in priority order:
1. Dedicated 'tenure' or 'tenure_type' field in the API response
2. Free-text search in the description for 'freehold' / 'leasehold'
3. Free-text search in features lists
home.co.uk aggregates listings from estate agents, so tenure is often
embedded in the description text rather than a structured field.
"""
# 1. Check dedicated tenure fields (in case the API adds them)
for key in ("tenure", "tenure_type", "tenureType"):
val = prop.get(key)
if val and isinstance(val, str):
lower = val.lower().strip()
if "leasehold" in lower:
return "Leasehold"
if "freehold" in lower:
return "Freehold"
# 2. Check description text — estate agents often include tenure here
description = prop.get("description") or ""
if description:
lower_desc = description.lower()
if re.search(r"\bleasehold\b", lower_desc):
return "Leasehold"
if re.search(r"\bfreehold\b", lower_desc):
# Matches "Freehold" and "Share of Freehold" (both = freehold ownership)
return "Freehold"
# 3. Check features / key_features lists if present
for key in ("features", "key_features", "keyFeatures"):
features = prop.get(key)
if features and isinstance(features, list):
for feat in features:
if not isinstance(feat, str):
continue
lower_feat = feat.lower()
if "leasehold" in lower_feat:
return "Leasehold"
if "freehold" in lower_feat:
return "Freehold"
return None
def map_property_type(raw_type: str | None) -> str:
    """Translate a home.co.uk property type into one of the canonical types.

    An exact ``PROPERTY_TYPE_MAP`` entry wins. Otherwise the lower-cased text
    is matched against keyword groups in a fixed order. Anything empty or
    unrecognised becomes ``"Other"``.
    """
    if not raw_type:
        return "Other"

    mapped = PROPERTY_TYPE_MAP.get(raw_type)
    if mapped:
        return mapped

    # home.co.uk uses loose labels ("House", "Flat", "Apartment", ...), so fall
    # back to keyword matching.
    text = raw_type.lower()

    def mentions(*terms: str) -> bool:
        return any(term in text for term in terms)

    # Flat-like labels that must not be counted as ordinary flats.
    if mentions(
        "block of apartment",
        "house of multiple occupation",
        "private halls",
        "retirement",
        "serviced apartment",
    ):
        return "Other"

    if mentions("flat", "apartment", "maisonette", "studio"):
        return "Flats/Maisonettes"

    is_semi = "semi" in text
    if "detached" in text and not is_semi:
        return "Detached"
    if is_semi:
        return "Semi-Detached"

    if mentions("terrace", "mews"):
        return "Terraced"

    log.debug("Unknown property type: %r — mapping to Other", raw_type)
    return "Other"
def transform_property(
    prop: dict,
    pc_index: PostcodeSpatialIndex,
) -> dict | None:
    """Convert one raw home.co.uk property dict into the output schema.

    Returns ``None`` when the listing has no coordinates, lies outside the
    accepted latitude/longitude box, has no positive price, or has no postcode
    (neither supplied nor found via ``pc_index.nearest``).

    Bedroom and bathroom counts outside ``0..MAX_BEDROOMS`` are stored as 0.
    """
    lat, lng = prop.get("latitude"), prop.get("longitude")
    if lat is None or lng is None:
        return None

    # Coordinates must fall inside the bounding box used for England.
    inside_box = 49 <= lat <= 56 and -7 <= lng <= 2
    if not inside_box:
        log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
        return None

    price = parse_int_value(prop.get("price")) or parse_int_value(
        prop.get("latest_price")
    )
    if not price or price <= 0:
        return None

    # home.co.uk normally supplies the postcode; otherwise use the spatial index.
    postcode = prop.get("postcode") or pc_index.nearest(lat, lng)
    if not postcode:
        log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
        return None

    listing_id = prop.get("listing_id") or prop.get("property_id") or ""

    raw_beds = parse_int_value(prop.get("bedrooms")) or 0
    raw_baths = parse_int_value(prop.get("bathrooms")) or 0
    bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
    if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
        log.warning(
            "home.co.uk %s: implausible beds=%d baths=%d (capped to 0)",
            listing_id or "?",
            raw_beds, raw_baths,
        )

    listing_type = prop.get("listing_property_type") or prop.get("property_type") or ""
    address = prop.get("display_address") or prop.get("address") or ""

    # The price qualifier is derived from the reduction details, if any.
    if not prop.get("is_reduced"):
        price_qualifier = ""
    else:
        pct = prop.get("reduction_percent", 0)
        price_qualifier = f"Reduced by {pct}%" if pct else "Reduced"

    transformed = {
        "id": f"hk_{listing_id}",  # prefix to avoid collision with Rightmove IDs
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        # NOTE(review): filled with bedrooms + bathrooms — confirm this is intended.
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": normalize_postcode(postcode),
        "Address per Property Register": address,
        "Leasehold/Freehold": parse_tenure(prop),
        "Property type": map_property_type(listing_type),
        "Property sub-type": normalize_sub_type(listing_type),
        "price": price,
        "price_frequency": "",
        "Price qualifier": price_qualifier,
        "Total floor area (sqm)": parse_floor_area(prop.get("description")),
        "Listing URL": f"{HOMECOUK_BASE}/property/{listing_id}",
        "Listing features": [],  # not available from home.co.uk
        "first_visible_date": prop.get("added_date") or "",
    }
    return transformed
def search_outcode(
client: Session,
outcode: str,
pc_index: PostcodeSpatialIndex,
max_properties: int | None = None,
) -> list[dict]:
"""Paginate through sale search results for one outcode.
Returns the transformed properties collected so far, stopping early once
``max_properties`` (when given) have been gathered.
Raises ``PaginationError`` when a page fails to load, comes back empty
before the advertised end, or contains only results that were already seen.
``CookiesExpiredError`` raised by ``fetch_page`` is not caught here.
"""
url_segment = HOMECOUK_URL_SEGMENT
url = f"{HOMECOUK_API_BASE}/{url_segment}/{outcode.lower()}/"
properties = []
page = 1
# Pagination metadata; each stays None until a page supplies a positive integer.
last_page: int | None = None
total_results: int | None = None
# Identities of every raw result seen so far, used to detect repeated pages.
seen_ids: set[str] = set()
while True:
params = {
"page": str(page),
"sort": "date_desc",
"per_page": str(HOMECOUK_PER_PAGE),
}
# Set referer to match the page URL pattern
client.headers["referer"] = (
f"https://home.co.uk/{url_segment}/{outcode.lower()}/"
f"?page={page}&sort=date_desc&per_page={HOMECOUK_PER_PAGE}"
)
data = fetch_page(client, url, params)
# fetch_page returns None on permanent failure; an empty payload is
# rejected by the same check.
if not data:
raise PaginationError(f"home.co.uk {outcode} page {page} failed to load")
pagination = data.get("pagination", {}) or {}
if last_page is None:
last_page = _coerce_positive_int(pagination.get("last_page"))
if total_results is None:
total_results = _coerce_positive_int(pagination.get("total"))
raw_props = data.get("properties", [])
if not raw_props:
# An empty page is an error only while results are still advertised
# (a known total and a page number not beyond the known last page).
if total_results and page <= (last_page or page):
raise PaginationError(
f"home.co.uk {outcode} page {page} returned no properties "
f"before the advertised end"
)
break
page_ids = {
_property_identity(prop, page, idx) for idx, prop in enumerate(raw_props)
}
# A page consisting solely of already-seen identities is treated as a
# pagination failure rather than as new data.
if page_ids and page_ids.issubset(seen_ids):
raise PaginationError(
f"home.co.uk {outcode} page {page} repeated previously seen results"
)
seen_ids.update(page_ids)
for prop in raw_props:
# A property that fails to transform is logged and skipped so one bad
# record does not abort the whole outcode.
try:
transformed = transform_property(prop, pc_index)
except Exception as exc:
log.warning(
"home.co.uk %s property %s failed to transform: %s",
outcode,
prop.get("listing_id") or prop.get("property_id") or "?",
exc,
)
continue
if transformed:
properties.append(transformed)
# NOTE(review): indentation is lost in this view, so it is unclear whether this
# cap check is nested under `if transformed:` or sits beside it — confirm.
if max_properties is not None and len(properties) >= max_properties:
return properties
# Stop conditions. NOTE(review): with indentation lost, the `elif` branches below
# presumably belong to `if last_page is not None:` (fallbacks used when the API
# gave no last page) — confirm against the original file.
if last_page is not None:
if page >= last_page:
break
elif total_results is not None and len(seen_ids) >= total_results:
break
elif len(raw_props) < HOMECOUK_PER_PAGE:
break
page += 1
# Pause between page requests.
time.sleep(DELAY_BETWEEN_PAGES)
return properties

View file

@ -1,63 +0,0 @@
"""Shared target filters for manual buy-listing scrapes."""
import math
from typing import Any
# Price ceiling: the strict filter rejects prices at or above this value.
BUY_MAX_PRICE = 1_000_000
# Inclusive bedroom range accepted by the strict filter.
BUY_MIN_BEDROOMS = 2
BUY_MAX_BEDROOMS = 5
# Exact bathroom counts accepted by the strict filter.
BUY_ALLOWED_BATHROOMS = frozenset({2, 3})
# Inclusive floor-area range in square metres.
BUY_MIN_FLOOR_AREA_SQM = 90.0
BUY_MAX_FLOOR_AREA_SQM = 170.0
# Canonical property types accepted by the strict filter.
BUY_PROPERTY_TYPES = frozenset({"Flats/Maisonettes"})
# Square-foot equivalents of the floor-area bounds (0.092903 sq m per sq ft),
# rounded to whole square feet.
BUY_MIN_FLOOR_AREA_SQFT = round(BUY_MIN_FLOOR_AREA_SQM / 0.092903)
BUY_MAX_FLOOR_AREA_SQFT = round(BUY_MAX_FLOOR_AREA_SQM / 0.092903)
def _number(value: Any) -> float | None:
if value is None:
return None
try:
number = float(value)
except (TypeError, ValueError):
return None
if not math.isfinite(number):
return None
return number
def _int(value: Any) -> int | None:
    """Return ``value`` as an ``int`` when it is a finite whole number, else ``None``."""
    number = _number(value)
    if number is not None and number.is_integer():
        return int(number)
    return None
def matches_strict_buy_listing_filter(prop: dict) -> bool:
    """Tell whether ``prop`` satisfies every strict buy-listing criterion.

    Checks, in order: price (read from ``"price"`` when that key exists,
    otherwise from ``"Asking price"``), bedroom count, property type, bathroom
    count and floor area. A missing or unparseable value fails its check.
    """
    price_key = "price" if "price" in prop else "Asking price"
    price = _number(prop.get(price_key))
    if price is None or not 0 < price < BUY_MAX_PRICE:
        return False

    bedrooms = _int(prop.get("Bedrooms"))
    if bedrooms is None or not BUY_MIN_BEDROOMS <= bedrooms <= BUY_MAX_BEDROOMS:
        return False

    if prop.get("Property type") not in BUY_PROPERTY_TYPES:
        return False

    if _int(prop.get("Bathrooms")) not in BUY_ALLOWED_BATHROOMS:
        return False

    floor_area = _number(prop.get("Total floor area (sqm)"))
    return (
        floor_area is not None
        and BUY_MIN_FLOOR_AREA_SQM <= floor_area <= BUY_MAX_FLOOR_AREA_SQM
    )

View file

@ -5,10 +5,10 @@ import tempfile
import time
from pathlib import Path
from constants import DATA_DIR
from constants import DATA_DIR, REPO_DIR
SOURCE_CHOICES = ("rightmove", "homecouk", "zoopla", "all")
SOURCE_CHOICES = ("rightmove", "zoopla", "all")
TEST_MAX_PROPERTIES_PER_SOURCE = 100
TEST_OUTCODES = (
"E1",
@ -28,14 +28,16 @@ log = logging.getLogger("finder")
def configure_standalone_runtime() -> None:
"""Keep browser/cache/temp files on the project volume for local runs."""
runtime_dir = DATA_DIR / ".runtime"
runtime_dir = REPO_DIR / ".tmp" / "finder"
cache_dir = runtime_dir / "cache"
temp_dir = runtime_dir / "tmp"
cache_dir.mkdir(parents=True, exist_ok=True)
temp_dir.mkdir(parents=True, exist_ok=True)
os.environ.setdefault("XDG_CACHE_HOME", str(cache_dir))
os.environ.setdefault("TMPDIR", str(temp_dir))
os.environ["XDG_CACHE_HOME"] = str(cache_dir)
os.environ["TMPDIR"] = str(temp_dir)
os.environ["TEMP"] = str(temp_dir)
os.environ["TMP"] = str(temp_dir)
tempfile.tempdir = str(temp_dir)
@ -47,7 +49,7 @@ def parse_args() -> argparse.Namespace:
"--source",
choices=SOURCE_CHOICES,
default="all",
help="Portal to scrape. 'all' runs Rightmove, home.co.uk, and Zoopla.",
help="Portal to scrape. 'all' runs Rightmove and Zoopla.",
)
parser.add_argument(
"--output-dir",
@ -89,7 +91,7 @@ def configure_logging() -> None:
def selected_sources(source: str) -> list[str]:
if source == "all":
return ["rightmove", "homecouk", "zoopla"]
return ["rightmove", "zoopla"]
return [source]

View file

@ -4,7 +4,6 @@ version = "0.1.0"
requires-python = ">=3.12"
dependencies = [
"httpx",
"curl_cffi",
"polars",
"fake-useragent>=2.2.0",
"playwright>=1.58.0",

View file

@ -10,15 +10,6 @@ from constants import (
TYPEAHEAD_URL,
)
from http_client import fetch_with_retry
from listing_filters import (
BUY_ALLOWED_BATHROOMS,
BUY_MAX_BEDROOMS,
BUY_MAX_FLOOR_AREA_SQFT,
BUY_MAX_PRICE,
BUY_MIN_BEDROOMS,
BUY_MIN_FLOOR_AREA_SQFT,
matches_strict_buy_listing_filter,
)
from spatial import PostcodeSpatialIndex
from transform import transform_property
@ -31,24 +22,6 @@ outcode_cache: dict[str, str] = {}
# Requesting index >= 1008 returns HTTP 400.
_MAX_INDEX = 1008
_BASE_BUY_SEARCH_PARAMS = {
"propertyTypes": "flat",
"minBedrooms": str(BUY_MIN_BEDROOMS),
"maxBedrooms": str(BUY_MAX_BEDROOMS),
"minBathrooms": str(min(BUY_ALLOWED_BATHROOMS)),
"maxBathrooms": str(max(BUY_ALLOWED_BATHROOMS)),
"minSize": str(BUY_MIN_FLOOR_AREA_SQFT),
"maxSize": str(BUY_MAX_FLOOR_AREA_SQFT),
"maxPrice": str(BUY_MAX_PRICE - 1),
}
def _buy_search_params(extra_params: dict | None = None) -> dict:
params = dict(_BASE_BUY_SEARCH_PARAMS)
if extra_params:
params.update(extra_params)
return params
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
@ -77,7 +50,6 @@ def _paginate(
outcode: str,
channel_cfg: dict,
pc_index: PostcodeSpatialIndex,
extra_params: dict | None = None,
max_properties: int | None = None,
) -> tuple[list[dict], int]:
"""Paginate through search results. Returns (properties, result_count)."""
@ -94,9 +66,6 @@ def _paginate(
"channel": channel_cfg["channel"],
"transactionType": channel_cfg["transactionType"],
}
if extra_params:
params.update(extra_params)
data = fetch_with_retry(client, SEARCH_URL, params)
if not data:
log.warning(
@ -123,7 +92,7 @@ def _paginate(
exc,
)
continue
if transformed and matches_strict_buy_listing_filter(transformed):
if transformed:
properties.append(transformed)
if max_properties is not None and len(properties) >= max_properties:
return properties, result_count
@ -137,7 +106,7 @@ def _paginate(
break
if index >= _MAX_INDEX:
log.warning(
"%s/%s: %d filtered results exceed Rightmove's %d-result page cap",
"%s/%s: %d results exceed Rightmove's %d-result page cap",
outcode,
channel_cfg["channel"],
result_count,
@ -158,18 +127,13 @@ def search_outcode(
pc_index: PostcodeSpatialIndex,
max_properties: int | None = None,
) -> list[dict]:
"""Paginate through search results for one outcode+channel. Returns transformed properties.
Search requests set the supported Rightmove filters directly: flats,
2-5 bedrooms, 2-3 bathrooms, 969-1830 sq ft, and asking price below £1m.
"""
"""Paginate through unfiltered sale results for one outcode+channel."""
properties, _ = _paginate(
client,
outcode_id,
outcode,
channel_cfg,
pc_index,
extra_params=_buy_search_params(),
max_properties=max_properties,
)

View file

@ -14,12 +14,7 @@ from constants import (
LONDON_OUTCODE_PREFIXES,
)
from homecouk import CookiesExpiredError
from homecouk import load_cookies as load_homecouk_cookies
from homecouk import make_client as make_homecouk_client
from homecouk import search_outcode as homecouk_search_outcode
from http_client import make_client
from listing_filters import matches_strict_buy_listing_filter
from rightmove import resolve_outcode_id
from rightmove import search_outcode as rightmove_search_outcode
from spatial import PostcodeSpatialIndex
@ -30,7 +25,7 @@ from zoopla import search_outcode as zoopla_search_outcode
log = logging.getLogger("rightmove")
SOURCE_ORDER = ("rightmove", "homecouk", "zoopla")
SOURCE_ORDER = ("rightmove", "zoopla")
SALE_CHANNEL = CHANNELS[0]
LONDON_AREAS = sorted({prefix.upper() for prefix in LONDON_OUTCODE_PREFIXES})
OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z0-9]?)")
@ -260,16 +255,7 @@ def _store_properties(
dropped_outside_area,
)
eligible = [prop for prop in londonish if matches_strict_buy_listing_filter(prop)]
dropped_non_matching = len(londonish) - len(eligible)
if dropped_non_matching:
log.debug(
"%s dropped %d properties outside the strict buy-listing filters",
source,
dropped_non_matching,
)
selected = eligible if remaining is None else eligible[:remaining]
selected = londonish if remaining is None else londonish[:remaining]
results[source].extend(selected)
return len(selected)
@ -290,6 +276,8 @@ def _launch_zoopla_with_retries(attempts: int = 3):
for attempt in range(1, attempts + 1):
try:
return launch_zoopla_browser()
except TurnstileError:
raise
except Exception as exc:
last_error = exc
log.warning(
@ -304,13 +292,6 @@ def _launch_zoopla_with_retries(attempts: int = 3):
raise last_error
def _new_homecouk_client():
cookie_data = load_homecouk_cookies()
if not cookie_data:
return None
return make_homecouk_client(*cookie_data)
def _scrape_rightmove(
outcodes: list[str],
pc_index: PostcodeSpatialIndex,
@ -368,74 +349,6 @@ def _scrape_rightmove(
client.close()
def _scrape_homecouk(
outcodes: list[str],
pc_index: PostcodeSpatialIndex,
results: dict[str, list[dict]],
errors: list[str],
max_properties_per_source: int | None,
) -> None:
client = _new_homecouk_client()
if client is None:
log.warning("home.co.uk skipped: could not bootstrap a local session")
return
try:
for outcode in outcodes:
if _source_remaining(results, "homecouk", max_properties_per_source) == 0:
log.info("home.co.uk cap reached")
return
for attempt in range(2):
try:
# home.co.uk cannot express the full filter set at source.
# Fetch the outcode page set first; _store_properties applies
# the strict filter and source cap after transformation.
props = homecouk_search_outcode(
client,
outcode,
pc_index,
max_properties=None,
)
added = _store_properties(
results,
"homecouk",
props,
max_properties_per_source,
)
log.info("home.co.uk %s: +%d", outcode, added)
break
except CookiesExpiredError as exc:
if attempt == 1:
_record_error(errors, "homecouk", outcode, exc)
break
log.warning(
"home.co.uk cookies expired at %s; refreshing local session",
outcode,
)
try:
client.close()
except Exception:
pass
client = _new_homecouk_client()
if client is None:
_record_error(
errors,
"homecouk",
outcode,
RuntimeError("could not refresh local session"),
)
return
except Exception as exc:
_record_error(errors, "homecouk", outcode, exc)
break
time.sleep(DELAY_BETWEEN_OUTCODES)
finally:
client.close()
def _scrape_zoopla(
outcodes: list[str],
pc_index: PostcodeSpatialIndex,
@ -459,9 +372,8 @@ def _scrape_zoopla(
for attempt in range(2):
try:
# Zoopla source-side filters are unverified here. Fetch the
# outcode page set first; _store_properties applies the
# strict filter and source cap after transformation.
# Fetch the outcode page set first; _store_properties applies
# the London-ish postcode filter and source cap after transformation.
props, _ = zoopla_search_outcode(
page,
outcode,
@ -539,15 +451,6 @@ def run_scrape(
max_properties_per_source,
)
if "homecouk" in selected_sources:
_scrape_homecouk(
selected_outcodes,
pc_index,
results,
errors,
max_properties_per_source,
)
if "zoopla" in selected_sources:
if pc_coords is None:
pc_coords = build_postcode_coords()
@ -567,20 +470,10 @@ def run_scrape(
else:
if output_path.exists():
output_path.unlink()
log.warning("No strict properties to write to %s", output_path)
filtered = [prop for prop in merged if matches_strict_buy_listing_filter(prop)]
filtered_output_path = output_base / "online_listings_buy_filtered.parquet"
if filtered:
write_parquet(filtered, filtered_output_path)
else:
if filtered_output_path.exists():
filtered_output_path.unlink()
log.warning("No strict-filtered properties to write to %s", filtered_output_path)
log.warning("No London-ish properties to write to %s", output_path)
counts = {
"total": len(merged),
"filtered_total": len(filtered),
"deduped": deduped,
"sources": source_counts,
}
@ -588,9 +481,8 @@ def run_scrape(
f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
)
log.info(
"Sale scrape complete: %d unique, %d strict-filtered (%s deduped:%d)",
"Sale scrape complete: %d unique (%s deduped:%d)",
len(merged),
len(filtered),
source_summary,
deduped,
)
@ -603,7 +495,6 @@ def run_scrape(
},
"counts": counts,
"path": str(output_path),
"filtered_path": str(filtered_output_path),
"errors": errors,
"elapsed_seconds": round(time.time() - started_at, 3),
}

View file

@ -76,7 +76,7 @@ def normalize_sub_type(sub_type: str | None) -> str:
"""Normalize property sub-type for consistent storage.
Fixes delimiter inconsistencies (underscores/hyphens → spaces) from
home.co.uk and truncates Zoopla description fragments that were
legacy listing data and truncates Zoopla description fragments that were
accidentally captured as sub-types.
"""
if not sub_type:
@ -200,31 +200,13 @@ def transform_property(
price_obj = prop.get("price", {})
amount = parse_int_value(price_obj.get("amount"))
if not amount:
return None
price = amount
if price <= 0:
return None
price = amount or 0
display_prices = price_obj.get("displayPrices", [])
price_qualifier = (
display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
)
# POA / Auction listings have unreliable prices — treat as no price
pq_lower = price_qualifier.lower()
non_comparable_price_terms = (
"poa",
"auction",
"shared ownership",
"shared equity",
"part buy",
"part rent",
"from",
)
if any(term in pq_lower for term in non_comparable_price_terms):
return None
sub_type = prop.get("propertySubType", "")
raw_beds = parse_int_value(prop.get("bedrooms")) or 0
raw_baths = parse_int_value(prop.get("bathrooms")) or 0

91
finder/uv.lock generated
View file

@ -72,63 +72,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" },
]
[[package]]
name = "cffi"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pycparser", marker = "implementation_name != 'PyPy'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" },
{ url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" },
{ url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" },
{ url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" },
{ url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" },
{ url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" },
{ url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" },
{ url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" },
{ url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" },
{ url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, upload-time = "2025-09-08T23:22:57.188Z" },
{ url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" },
{ url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" },
{ url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" },
{ url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" },
{ url = "https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" },
{ url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" },
{ url = "https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" },
{ url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" },
{ url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" },
{ url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" },
{ url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" },
{ url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" },
{ url = "https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" },
{ url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" },
{ url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" },
{ url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" },
{ url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" },
{ url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" },
{ url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" },
{ url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" },
{ url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" },
{ url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" },
{ url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, upload-time = "2025-09-08T23:23:44.61Z" },
{ url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" },
{ url = "https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" },
{ url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" },
{ url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" },
{ url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" },
{ url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" },
{ url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" },
{ url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" },
{ url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" },
{ url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" },
{ url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" },
{ url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" },
{ url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" },
]
[[package]]
name = "charset-normalizer"
version = "3.4.6"
@ -223,29 +166,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
]
[[package]]
name = "curl-cffi"
version = "0.14.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "cffi" },
]
sdist = { url = "https://files.pythonhosted.org/packages/9b/c9/0067d9a25ed4592b022d4558157fcdb6e123516083700786d38091688767/curl_cffi-0.14.0.tar.gz", hash = "sha256:5ffbc82e59f05008ec08ea432f0e535418823cda44178ee518906a54f27a5f0f", size = 162633, upload-time = "2025-12-16T03:25:07.931Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/aa/f0/0f21e9688eaac85e705537b3a87a5588d0cefb2f09d83e83e0e8be93aa99/curl_cffi-0.14.0-cp39-abi3-macosx_14_0_arm64.whl", hash = "sha256:e35e89c6a69872f9749d6d5fda642ed4fc159619329e99d577d0104c9aad5893", size = 3087277, upload-time = "2025-12-16T03:24:49.607Z" },
{ url = "https://files.pythonhosted.org/packages/ba/a3/0419bd48fce5b145cb6a2344c6ac17efa588f5b0061f212c88e0723da026/curl_cffi-0.14.0-cp39-abi3-macosx_15_0_x86_64.whl", hash = "sha256:5945478cd28ad7dfb5c54473bcfb6743ee1d66554d57951fdf8fc0e7d8cf4e45", size = 5804650, upload-time = "2025-12-16T03:24:51.518Z" },
{ url = "https://files.pythonhosted.org/packages/e2/07/a238dd062b7841b8caa2fa8a359eb997147ff3161288f0dd46654d898b4d/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c42e8fa3c667db9ccd2e696ee47adcd3cd5b0838d7282f3fc45f6c0ef3cfdfa7", size = 8231918, upload-time = "2025-12-16T03:24:52.862Z" },
{ url = "https://files.pythonhosted.org/packages/7c/d2/ce907c9b37b5caf76ac08db40cc4ce3d9f94c5500db68a195af3513eacbc/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:060fe2c99c41d3cb7f894de318ddf4b0301b08dca70453d769bd4e74b36b8483", size = 8654624, upload-time = "2025-12-16T03:24:54.579Z" },
{ url = "https://files.pythonhosted.org/packages/f2/ae/6256995b18c75e6ef76b30753a5109e786813aa79088b27c8eabb1ef85c9/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b158c41a25388690dd0d40b5bc38d1e0f512135f17fdb8029868cbc1993d2e5b", size = 8010654, upload-time = "2025-12-16T03:24:56.507Z" },
{ url = "https://files.pythonhosted.org/packages/fb/10/ff64249e516b103cb762e0a9dca3ee0f04cf25e2a1d5d9838e0f1273d071/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_i686.whl", hash = "sha256:1439fbef3500fb723333c826adf0efb0e2e5065a703fb5eccce637a2250db34a", size = 7781969, upload-time = "2025-12-16T03:24:57.885Z" },
{ url = "https://files.pythonhosted.org/packages/51/76/d6f7bb76c2d12811aa7ff16f5e17b678abdd1b357b9a8ac56310ceccabd5/curl_cffi-0.14.0-cp39-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e7176f2c2d22b542e3cf261072a81deb018cfa7688930f95dddef215caddb469", size = 7969133, upload-time = "2025-12-16T03:24:59.261Z" },
{ url = "https://files.pythonhosted.org/packages/23/7c/cca39c0ed4e1772613d3cba13091c0e9d3b89365e84b9bf9838259a3cd8f/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:03f21ade2d72978c2bb8670e9b6de5260e2755092b02d94b70b906813662998d", size = 9080167, upload-time = "2025-12-16T03:25:00.946Z" },
{ url = "https://files.pythonhosted.org/packages/75/03/a942d7119d3e8911094d157598ae0169b1c6ca1bd3f27d7991b279bcc45b/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:58ebf02de64ee5c95613209ddacb014c2d2f86298d7080c0a1c12ed876ee0690", size = 9520464, upload-time = "2025-12-16T03:25:02.922Z" },
{ url = "https://files.pythonhosted.org/packages/a2/77/78900e9b0833066d2274bda75cba426fdb4cef7fbf6a4f6a6ca447607bec/curl_cffi-0.14.0-cp39-abi3-win_amd64.whl", hash = "sha256:6e503f9a103f6ae7acfb3890c843b53ec030785a22ae7682a22cc43afb94123e", size = 1677416, upload-time = "2025-12-16T03:25:04.902Z" },
{ url = "https://files.pythonhosted.org/packages/5c/7c/d2ba86b0b3e1e2830bd94163d047de122c69a8df03c5c7c36326c456ad82/curl_cffi-0.14.0-cp39-abi3-win_arm64.whl", hash = "sha256:2eed50a969201605c863c4c31269dfc3e0da52916086ac54553cfa353022425c", size = 1425067, upload-time = "2025-12-16T03:25:06.454Z" },
]
[[package]]
name = "cython"
version = "3.2.4"
@ -274,7 +194,6 @@ version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "camoufox" },
{ name = "curl-cffi" },
{ name = "fake-useragent" },
{ name = "httpx" },
{ name = "playwright" },
@ -284,7 +203,6 @@ dependencies = [
[package.metadata]
requires-dist = [
{ name = "camoufox", specifier = ">=0.4.11" },
{ name = "curl-cffi" },
{ name = "fake-useragent", specifier = ">=2.2.0" },
{ name = "httpx" },
{ name = "playwright", specifier = ">=1.58.0" },
@ -639,15 +557,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b3/eb/936f5eeae196e8c8aaabe5f7d98891be8a5bbc741d50ce5c60f55575ad29/polars_runtime_32-1.39.0-cp310-abi3-win_arm64.whl", hash = "sha256:d69abde5f148566860bbe910010847bd7791e72f7c8063a4d2c462246a33a72a", size = 41885761, upload-time = "2026-03-12T14:23:16.773Z" },
]
[[package]]
name = "pycparser"
version = "3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" },
]
[[package]]
name = "pyee"
version = "13.0.1"

View file

@ -1,8 +1,8 @@
"""Zoopla (zoopla.co.uk) scraper — sale properties.
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
blocks non-browser HTTP clients and even Playwright with stealth patches. Only
Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
Zoopla uses Next.js App Router with React Server Components (RSC). Search
result data is server-rendered in an RSC stream, not available via
@ -19,11 +19,20 @@ Architecture:
"""
import logging
import os
import re
import sys
import time
from pathlib import Path
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from constants import (
DATA_DIR,
DELAY_BETWEEN_PAGES,
MAX_BEDROOMS,
PROPERTY_TYPE_MAP,
ZOOPLA_BASE,
)
from spatial import PostcodeSpatialIndex
from transform import normalize_sub_type, parse_int_value, validate_floor_area
@ -255,11 +264,120 @@ _DISMISS_COOKIES_JS = """() => {
# ---------------------------------------------------------------------------
_FALSE_ENV_VALUES = {"0", "false", "no", "off"}
_TRUE_ENV_VALUES = {"1", "true", "yes", "on"}
def _env_bool_or_virtual(name: str, default: bool | str) -> bool | str:
raw = os.environ.get(name)
if raw is None:
return default
value = raw.strip().lower()
if value == "virtual":
return "virtual"
if value in _TRUE_ENV_VALUES:
return True
if value in _FALSE_ENV_VALUES:
return False
raise ValueError(
f"{name} must be one of 1/0, true/false, yes/no, on/off, or virtual"
)
def _visible_display_available() -> bool:
if sys.platform.startswith("linux"):
return bool(os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY"))
return True
def _zoopla_headless_mode() -> bool | str:
    """Resolve the headless setting used when launching the Zoopla browser.

    The default is a visible browser whenever a display is available, so the
    person running the scrape can complete a Cloudflare challenge by hand;
    without a display the default is headless. ``ZOOPLA_HEADLESS`` overrides
    the default and may also select ``"virtual"``.
    """
    has_display = _visible_display_available()
    return _env_bool_or_virtual("ZOOPLA_HEADLESS", not has_display)
def _zoopla_profile_dir() -> Path:
raw = os.environ.get("ZOOPLA_PROFILE_DIR")
if raw:
return Path(raw).expanduser().resolve()
return (DATA_DIR / ".runtime" / "zoopla-profile").expanduser().resolve()
def _challenge_timeout_seconds() -> int:
raw = os.environ.get("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS")
if raw is None:
return 300
try:
timeout = int(raw)
except ValueError as exc:
raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be an integer") from exc
if timeout < 1:
raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be greater than zero")
return timeout
def _is_turnstile_challenge(page) -> bool:
try:
if "just a moment" in page.title().lower():
return True
except Exception:
pass
try:
return bool(
page.query_selector(
'iframe[src*="challenges.cloudflare.com"], '
'input[name="cf-turnstile-response"]'
)
)
except Exception:
return False
def _wait_for_turnstile(page, headless_mode: bool | str) -> None:
if not _is_turnstile_challenge(page):
return
profile_dir = _zoopla_profile_dir()
if headless_mode is True or headless_mode == "virtual":
raise TurnstileError(
"Cloudflare Turnstile requires a visible browser session. "
"Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0; "
f"the solved session will be saved in {profile_dir}."
)
timeout = _challenge_timeout_seconds()
log.warning(
"Cloudflare Turnstile challenge shown. Complete it in the Zoopla browser "
"window; waiting up to %ds. Profile: %s",
timeout,
profile_dir,
)
try:
page.bring_to_front()
except Exception:
pass
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
time.sleep(3)
if not _is_turnstile_challenge(page):
log.info("Cloudflare challenge resolved")
return
raise TurnstileError(
f"Cloudflare Turnstile was not completed after {timeout}s"
)
def launch_browser():
"""Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile,
and dismiss cookie consent. Returns (browser, page) tuple.
Raises TurnstileError if Cloudflare cannot be passed within two minutes.
Raises TurnstileError if Cloudflare cannot be completed.
Caller must close browser when done."""
from camoufox.pkgman import camoufox_path
@ -269,61 +387,50 @@ def launch_browser():
from camoufox.sync_api import Camoufox
log.info("Launching Camoufox browser for Zoopla...")
camoufox = Camoufox(headless=True)
headless_mode = _zoopla_headless_mode()
profile_dir = _zoopla_profile_dir()
profile_dir.mkdir(parents=True, exist_ok=True)
log.info(
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
headless_mode,
profile_dir,
)
camoufox = Camoufox(
headless=headless_mode,
persistent_context=True,
user_data_dir=str(profile_dir),
locale=["en-GB", "en"],
enable_cache=True,
)
raw_browser = camoufox.__enter__()
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
page = browser.new_page()
page = raw_browser.pages[0] if raw_browser.pages else raw_browser.new_page()
try:
log.info("Navigating to Zoopla homepage...")
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
_wait_for_turnstile(page, headless_mode)
# Wait for Cloudflare Turnstile to resolve.
# Try clicking the Turnstile checkbox if present (helps in some cases).
for i in range(40):
if "Just a moment" not in page.title():
break
# Attempt to click the Turnstile checkbox in the challenge iframe
for frame in page.frames:
if "challenges.cloudflare.com" in frame.url:
try:
iframe_el = page.query_selector('iframe[src*="challenges.cloudflare"]')
if iframe_el:
box = iframe_el.bounding_box()
if box:
page.mouse.click(box["x"] + 30, box["y"] + box["height"] / 2)
except Exception:
pass
break
time.sleep(3)
else:
page.close()
browser.close()
raise TurnstileError("Cloudflare Turnstile did not resolve after 120s")
log.info("Cloudflare passed — title: %s", page.title())
log.info("Zoopla browser ready — title: %s", page.title())
time.sleep(2)
# Dismiss cookie consent
page.evaluate(_DISMISS_COOKIES_JS)
time.sleep(1)
except Exception:
try:
page.close()
finally:
browser.close()
raise
return browser, page
def _ensure_not_challenged(page) -> None:
"""Check if current page is a Cloudflare challenge and wait/raise."""
if "Just a moment" not in page.title():
return
log.warning("Cloudflare challenge detected mid-session, waiting...")
for i in range(40):
time.sleep(3)
if "Just a moment" not in page.title():
log.info("Cloudflare challenge resolved")
return
raise TurnstileError("Cloudflare re-challenge did not resolve after 120s")
_wait_for_turnstile(page, _zoopla_headless_mode())
# ---------------------------------------------------------------------------
@ -704,9 +811,7 @@ def transform_property(
Zoopla search cards do not include coordinates, so we resolve lat/lng
from postcodes extracted from the address text."""
price = parse_int_value(raw.get("price"))
if not price or price <= 0:
return None
price = parse_int_value(raw.get("price")) or 0
address = raw.get("address", "")
@ -856,7 +961,7 @@ def search_outcode(
sample = raw_listings[0] if raw_listings else {}
log.debug(
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
"(no price/postcode/coords). Sample raw: price=%s address=%r",
"(no postcode/coords). Sample raw: price=%s address=%r",
outcode, "BUY", len(raw_listings), dropped,
sample.get("price"), sample.get("address", ""),
)

View file

@ -68,6 +68,34 @@ const ROUTE_COLORS: Record<string, { color: string; darkText?: boolean }> = {
};
const NON_TUBE_NAMES = new Set(['DLR', 'London Overground', 'Elizabeth line']);
// IANA time zone in which departure times are expressed.
const GOOGLE_MAPS_DEPARTURE_TIME_ZONE = 'Europe/London';
// Formats an instant as its calendar date (year, month, day) in the departure time zone.
const londonDateFormatter = new Intl.DateTimeFormat('en-GB', {
timeZone: GOOGLE_MAPS_DEPARTURE_TIME_ZONE,
year: 'numeric',
month: '2-digit',
day: '2-digit',
});
// Formats an instant as its full wall-clock date and time, down to the second,
// in the departure time zone. Both hour12 and hourCycle request a 24-hour clock.
const londonDateTimeFormatter = new Intl.DateTimeFormat('en-GB', {
timeZone: GOOGLE_MAPS_DEPARTURE_TIME_ZONE,
year: 'numeric',
month: '2-digit',
day: '2-digit',
hour: '2-digit',
minute: '2-digit',
second: '2-digit',
hour12: false,
hourCycle: 'h23',
});
/**
 * Formats `date` with `formatter` and returns its non-literal parts as numbers,
 * keyed by part type (for example `year`, `month`, `day`). Separator text
 * (`literal` parts) is dropped.
 */
function dateTimeParts(formatter: Intl.DateTimeFormat, date: Date): Record<string, number> {
  const fields: Record<string, number> = {};
  for (const { type, value } of formatter.formatToParts(date)) {
    if (type === 'literal') continue;
    fields[type] = Number(value);
  }
  return fields;
}
/** Strip trailing parenthesized GTFS route IDs and NaPTAN stop codes (e.g. "(6757261)", "(9400ZZLUCGT1)") */
function stripId(label: string): string {
@ -87,15 +115,48 @@ function getRouteDisplay(mode: string): { label: string; color: string; darkText
return { label: clean, color: '#6b7280', darkText: false };
}
/** Returns a Unix timestamp for the next Monday at 07:30 local time. */
/**
 * Returns how far Europe/London wall-clock time is ahead of UTC at the given
 * instant, in milliseconds (positive when London is ahead of UTC).
 *
 * The wall-clock fields come from `londonDateTimeFormatter`, which resolves
 * whole seconds only, so the instant is truncated to the second before the
 * comparison; otherwise any millisecond component would leak into the offset.
 */
function londonOffsetMs(utcMs: number): number {
  const wholeSecondUtcMs = Math.floor(utcMs / 1000) * 1000;
  const parts = dateTimeParts(londonDateTimeFormatter, new Date(wholeSecondUtcMs));
  const londonAsUtcMs = Date.UTC(
    parts.year,
    parts.month - 1,
    parts.day,
    // Guard against midnight being reported as hour 24 (hour12 can override
    // hourCycle in some engines); Date.UTC would roll that into the next day.
    parts.hour % 24,
    parts.minute,
    parts.second
  );
  return londonAsUtcMs - wholeSecondUtcMs;
}
/**
 * Converts a Europe/London wall-clock time (1-based `month`, seconds fixed at
 * zero) to a UTC timestamp in milliseconds.
 *
 * The offset is first sampled at the wall-clock value read as if it were UTC.
 * It is then sampled again at the resulting instant; when the two samples
 * disagree, the second one is used.
 */
function londonTimeToUtcMs(
  year: number,
  month: number,
  day: number,
  hour: number,
  minute: number
): number {
  const wallClockAsUtcMs = Date.UTC(year, month - 1, day, hour, minute, 0, 0);
  const firstOffsetMs = londonOffsetMs(wallClockAsUtcMs);
  const candidateUtcMs = wallClockAsUtcMs - firstOffsetMs;
  const offsetAtCandidateMs = londonOffsetMs(candidateUtcMs);
  if (offsetAtCandidateMs !== firstOffsetMs) {
    return wallClockAsUtcMs - offsetAtCandidateMs;
  }
  return candidateUtcMs;
}
/** Returns a Unix timestamp for the next Monday at 07:30 Europe/London time. */
function nextMondayAt730(): number {
const now = new Date();
const day = now.getDay(); // 0=Sun … 6=Sat
const today = dateTimeParts(londonDateFormatter, now);
const day = new Date(Date.UTC(today.year, today.month - 1, today.day)).getUTCDay();
const daysUntil = day === 0 ? 1 : day === 1 ? 7 : 8 - day;
const monday = new Date(now);
monday.setDate(now.getDate() + daysUntil);
monday.setHours(7, 30, 0, 0);
return Math.floor(monday.getTime() / 1000);
const monday = new Date(Date.UTC(today.year, today.month - 1, today.day + daysUntil));
const utcMs = londonTimeToUtcMs(
monday.getUTCFullYear(),
monday.getUTCMonth() + 1,
monday.getUTCDate(),
7,
30
);
return Math.floor(utcMs / 1000);
}
function googleMapsDestination(

View file

@ -419,6 +419,7 @@ export default function MapPage({
const { listings: actualListings } = useActualListings(mapData.bounds, {
filterParam: actualListingsFilterParam,
travelParam: actualListingsTravelParam,
shareCode,
});
const [isAreaGroupExpanded, toggleAreaGroup] = useCollapsibleGroups(true);

View file

@ -7,11 +7,12 @@ const DEBOUNCE_MS = 200;
interface UseActualListingsOptions {
filterParam?: string;
travelParam?: string;
shareCode?: string;
}
export function useActualListings(
bounds: Bounds | null,
{ filterParam = '', travelParam = '' }: UseActualListingsOptions = {}
{ filterParam = '', travelParam = '', shareCode = '' }: UseActualListingsOptions = {}
) {
const [listings, setListings] = useState<ActualListing[]>([]);
const debounceRef = useRef<ReturnType<typeof setTimeout> | null>(null);
@ -38,11 +39,15 @@ export function useActualListings(
const params = new URLSearchParams({ bounds: boundsStr });
if (filterParam) params.set('filters', filterParam);
if (travelParam) params.set('travel', travelParam);
if (shareCode) params.set('share', shareCode);
const res = await fetch(
apiUrl('actual-listings', params),
authHeaders({ signal: abortControllerRef.current.signal })
);
if (!res.ok) throw new Error(`Actual listings fetch failed: HTTP ${res.status}`);
if (!res.ok) {
if (requestIdRef.current === requestId) setListings([]);
throw new Error(`Actual listings fetch failed: HTTP ${res.status}`);
}
const json: ActualListingsResponse = await res.json();
if (requestIdRef.current !== requestId) return;
setListings(json.listings || []);
@ -57,7 +62,7 @@ export function useActualListings(
};
// listings intentionally excluded — it's internal state, not an input.
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [bounds, filterParam, travelParam]);
}, [bounds, filterParam, travelParam, shareCode]);
return { listings };
}

View file

@ -109,9 +109,6 @@ export function useDeckLayers({
listings: actualListings,
zoom,
isDark,
hexagonData: data,
postcodeData,
usePostcodeView,
});
// --- Refs for deck.gl accessors ---

View file

@ -1,9 +1,8 @@
import { useCallback, useMemo, useRef, useState } from 'react';
import type { Layer, PickingInfo } from '@deck.gl/core';
import { ScatterplotLayer, TextLayer } from '@deck.gl/layers';
import { getResolution, latLngToCell } from 'h3-js';
import type { ActualListing, HexagonData, PostcodeFeature } from '../types';
import type { ActualListing } from '../types';
import { trackEvent } from '../lib/analytics';
const PRICE_LABEL_MIN_ZOOM = 14;
@ -19,14 +18,6 @@ interface UseListingLayersProps {
listings: ActualListing[];
zoom: number;
isDark: boolean;
hexagonData: HexagonData[];
postcodeData: PostcodeFeature[];
usePostcodeView: boolean;
}
/**
 * Canonicalises a postcode for comparison: strips every whitespace run and
 * upper-cases the rest. Missing or empty input yields an empty string.
 */
function normalizePostcode(value: string | undefined | null): string {
  return value ? value.replace(/\s+/g, '').toUpperCase() : '';
}
function formatShortPrice(price: number): string {
@ -35,57 +26,9 @@ function formatShortPrice(price: number): string {
return `£${price}`;
}
export function useListingLayers({
listings,
zoom,
isDark,
hexagonData,
postcodeData,
usePostcodeView,
}: UseListingLayersProps) {
export function useListingLayers({ listings, zoom, isDark }: UseListingLayersProps) {
const [popupInfo, setPopupInfo] = useState<ListingPopupInfo | null>(null);
// Split into two memos so the inactive view's data changes don't invalidate
// the active filtered list. (e.g. in postcode view, hexagonData updates must
// not retrigger filtering / downstream layer rebuilds.)
const postcodeFilteredListings = useMemo(() => {
if (!usePostcodeView || listings.length === 0) return null;
const allowed = new Set<string>();
for (const feature of postcodeData) {
if (feature.properties.count > 0) {
allowed.add(normalizePostcode(feature.properties.postcode));
}
}
if (allowed.size === 0) return [];
return listings.filter((listing) => allowed.has(normalizePostcode(listing.postcode)));
}, [listings, postcodeData, usePostcodeView]);
const hexFilteredListings = useMemo(() => {
if (usePostcodeView || listings.length === 0) return null;
const allowed = new Set<string>();
let cellResolution: number | null = null;
for (const cell of hexagonData) {
if (cell.count > 0) {
allowed.add(cell.h3);
if (cellResolution == null) cellResolution = getResolution(cell.h3);
}
}
if (allowed.size === 0 || cellResolution == null) return [];
const resolutionForLookup = cellResolution;
return listings.filter((listing) => {
try {
return allowed.has(latLngToCell(listing.lat, listing.lon, resolutionForLookup));
} catch {
return false;
}
});
}, [listings, hexagonData, usePostcodeView]);
const visibleListings = useMemo(() => {
if (listings.length === 0) return listings;
return (usePostcodeView ? postcodeFilteredListings : hexFilteredListings) ?? [];
}, [listings, usePostcodeView, postcodeFilteredListings, hexFilteredListings]);
const handleHover = useCallback((info: PickingInfo<ActualListing>) => {
if (info.object && info.x !== undefined && info.y !== undefined) {
setPopupInfo({ x: info.x, y: info.y, listing: info.object });
@ -119,21 +62,21 @@ export function useListingLayers({
() =>
new ScatterplotLayer<ActualListing>({
id: 'actual-listing-shadow',
data: visibleListings,
data: listings,
getPosition: (d) => [d.lon, d.lat],
getRadius: 8,
radiusUnits: 'pixels',
getFillColor: isDark ? [0, 0, 0, 80] : [0, 0, 0, 40],
pickable: false,
}),
[visibleListings, isDark]
[listings, isDark]
);
const pinLayer = useMemo(
() =>
new ScatterplotLayer<ActualListing>({
id: 'actual-listing-pin',
data: visibleListings,
data: listings,
getPosition: (d) => [d.lon, d.lat],
getRadius: 7,
radiusUnits: 'pixels',
@ -148,12 +91,12 @@ export function useListingLayers({
onHover: stableHover,
onClick: stableClick,
}),
[visibleListings, stableHover, stableClick]
[listings, stableHover, stableClick]
);
const priceLabelLayer = useMemo(() => {
if (zoom < PRICE_LABEL_MIN_ZOOM) return null;
const labeled = visibleListings.filter((l) => l.asking_price && l.asking_price > 0);
const labeled = listings.filter((l) => l.asking_price && l.asking_price > 0);
return new TextLayer<ActualListing>({
id: 'actual-listing-price',
data: labeled,
@ -174,11 +117,11 @@ export function useListingLayers({
sizeMaxPixels: 14,
pickable: false,
});
}, [visibleListings, zoom, isDark]);
}, [listings, zoom, isDark]);
const detailLabelLayer = useMemo(() => {
if (zoom < ADDRESS_LABEL_MIN_ZOOM) return null;
const labeled = visibleListings.filter((l) => l.address || l.bedrooms != null);
const labeled = listings.filter((l) => l.address || l.bedrooms != null);
return new TextLayer<ActualListing>({
id: 'actual-listing-detail',
data: labeled,
@ -205,7 +148,7 @@ export function useListingLayers({
sizeMaxPixels: 12,
pickable: false,
});
}, [visibleListings, zoom, isDark]);
}, [listings, zoom, isDark]);
const listingLayers = useMemo(() => {
const layers: Layer[] = [pinShadowLayer, pinLayer];

View file

@ -132,7 +132,6 @@ export const POI_GROUP_COLORS: Record<string, [number, number, number]> = {
export const POI_CATEGORY_LOGOS: Record<string, string> = {
Airport: '/assets/twemoji/2708.png',
Aldi: '/assets/poi-icons/logos/aldi.svg',
'Allendale Co-operative Society': '/assets/poi-icons/logos/coop.svg',
Amazon: '/assets/poi-icons/brands_2024/amazon_fresh.svg',
Asda: '/assets/poi-icons/logos/asda.svg',
'Asda Express': '/assets/poi-icons/logos/asda.svg',
@ -148,26 +147,18 @@ export const POI_CATEGORY_LOGOS: Record<string, string> = {
'Bus stop': '/assets/twemoji/1f68f.png',
'Butcher & Fishmonger': '/assets/twemoji/1f969.png',
Centra: '/assets/poi-icons/logos/centra.svg',
'Central England Co-operative': '/assets/poi-icons/logos/coop.svg',
'Chelmsford Star Co-operative Society': '/assets/poi-icons/logos/coop.svg',
'Clydebank Co-operative': '/assets/poi-icons/logos/coop.svg',
'Co-op': '/assets/poi-icons/logos/coop.svg',
'Coniston Co-operative Society': '/assets/poi-icons/logos/coop.svg',
COOK: '/assets/poi-icons/brands_2024/cook.svg',
'Convenience Store': '/assets/twemoji/1f3ea.png',
Costco: '/assets/poi-icons/logos/costco.svg',
'Deli & Specialty': '/assets/twemoji/1f9c6.png',
'Dunnes Stores': '/assets/poi-icons/brands_2024/dunnes_stores.svg',
'East of England Co-operative': '/assets/poi-icons/logos/coop.svg',
Farmfoods: '/assets/poi-icons/brands_2023/supermarkets/farmfoods.svg',
Ferry: '/assets/twemoji/26f4.png',
Greengrocer: '/assets/twemoji/1f96c.png',
'Heart of England Co-operative': '/assets/poi-icons/logos/coop.svg',
'Heron Foods': '/assets/poi-icons/brands_2023/supermarkets/heron_foods.svg',
Iceland: '/assets/poi-icons/brands_2024/iceland.svg',
Lidl: '/assets/poi-icons/logos/lidl.svg',
'Langdale Co-operative Society': '/assets/poi-icons/logos/coop.svg',
'Lincolnshire Co-operative': '/assets/poi-icons/logos/coop.svg',
Makro: '/assets/poi-icons/brands_2024/makro.svg',
'M&S': '/assets/poi-icons/brands_2024/mns.svg',
'M&S Clothing': '/assets/poi-icons/brands_2024/mns.svg',
@ -175,7 +166,6 @@ export const POI_CATEGORY_LOGOS: Record<string, string> = {
'M&S Hospital': '/assets/poi-icons/brands_2024/mns.svg',
'M&S MSA': '/assets/poi-icons/brands_2024/mns.svg',
'M&S Outlet': '/assets/poi-icons/brands_2024/mns.svg',
'Midcounties Co-operative': '/assets/poi-icons/logos/coop.svg',
Morrisons: '/assets/poi-icons/logos/morrisons.svg',
'Morrisons Daily': '/assets/poi-icons/brands_2024/morrisons_daily.svg',
'Off-Licence': '/assets/twemoji/1f377.png',
@ -183,16 +173,12 @@ export const POI_CATEGORY_LOGOS: Record<string, string> = {
'Rail station': '/assets/twemoji/1f686.png',
"Sainsbury's": '/assets/poi-icons/logos/sainsburys.svg',
"Sainsbury's Local": '/assets/poi-icons/brands_2024/sainsburys_local.svg',
'Scottish Midland Co-operative': '/assets/poi-icons/logos/coop.svg',
Spar: '/assets/poi-icons/logos/spar.svg',
Supermarket: '/assets/twemoji/1f6d2.png',
'Tamworth Co-operative Society': '/assets/poi-icons/logos/coop.svg',
Tesco: '/assets/poi-icons/logos/tesco.svg',
'Tesco Express': '/assets/poi-icons/logos/tesco_express.svg',
'Tesco Extra': '/assets/poi-icons/logos/tesco_extra.svg',
'Taxi rank': '/assets/twemoji/1f695.png',
'The Radstock Co-operative Society': '/assets/poi-icons/logos/coop.svg',
'The Southern Co-operative': '/assets/poi-icons/logos/coop.svg',
'The Food Warehouse': '/assets/poi-icons/logos/the_food_warehouse.png',
'Tube station': '/assets/poi-icons/public_transport/london_tube.svg',
Waitrose: '/assets/poi-icons/logos/waitrose.svg',

View file

@ -0,0 +1,3 @@
from .local_temp import configure_tempfile_defaults
configure_tempfile_defaults()

View file

@ -3,6 +3,7 @@ import tempfile
import polars as pl
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download, extract_zip
URL = "https://www.arcgis.com/sharing/rest/content/items/36b718ad00de49afb9ad364f8b815b9e/data"
@ -40,7 +41,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
download_path = Path(cache_dir) / "arcgis_data.zip"
extract_path = Path(cache_dir) / "arcgis_extracted"

View file

@ -7,6 +7,7 @@ from pathlib import Path
import httpx
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download, extract_zip
# Ofcom Connected Nations 2025 - Fixed broadband performance (output area & local authority level)
@ -84,7 +85,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
cache = Path(cache_dir)
zip_path = cache / "broadband_performance.zip"
extract_dir = cache / "extracted"

View file

@ -3,6 +3,7 @@ import tempfile
import polars as pl
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download
URL = "https://assets.publishing.service.gov.uk/media/691ded34513046b952c500bd/File_5_IoD2025_Scores_for_the_Indices_of_Deprivation.xlsx"
@ -33,7 +34,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
xlsx_path = Path(cache_dir) / "IoD2025_Scores.xlsx"
download(URL, xlsx_path, timeout=60)
convert_to_parquet(xlsx_path, args.output)

View file

@ -8,6 +8,7 @@ from zipfile import ZipFile
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.utils.download import download
@ -70,7 +71,9 @@ def download_geolytix_retail_points(output_path: Path) -> None:
"""Download the GEOLYTIX ZIP, extract the latest CSV, and write parquet."""
output_path.parent.mkdir(parents=True, exist_ok=True)
with TemporaryDirectory(prefix="geolytix_retail_points_") as tmp:
with TemporaryDirectory(
prefix="geolytix_retail_points_", dir=local_tmp_dir()
) as tmp:
zip_path = Path(tmp) / "geolytix_retail_points.zip"
download(GEOLYTIX_RETAIL_POINTS_URL, zip_path, timeout=300)
df = read_latest_csv(zip_path)

View file

@ -31,6 +31,8 @@ from pyproj import Transformer
from rasterio.transform import rowcol
from scipy.ndimage import maximum_filter
from pipeline.local_temp import local_tmp_dir
# Noise sources:
# (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
# Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500
@ -437,7 +439,7 @@ def main() -> None:
result = postcodes.select("postcode")
with tempfile.TemporaryDirectory() as tmp:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
for (
label,
col_name,

View file

@ -3,6 +3,7 @@ import tempfile
import polars as pl
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download
# Management information - state-funded schools - latest inspections (as at 28 Feb 2026)
@ -36,7 +37,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
csv_path = Path(cache_dir) / "ofsted_latest_inspections.csv"
download(URL, csv_path, timeout=60)
convert_to_parquet(csv_path, args.output)

View file

@ -25,6 +25,7 @@ from pyproj import Transformer
from shapely.errors import GEOSException
from shapely.geometry import shape as to_shapely
from pipeline.local_temp import local_tmp_dir
from pipeline.utils.download import download, extract_zip
logger = logging.getLogger(__name__)
@ -171,7 +172,7 @@ def _read_site_centroids(
def download_greenspace(output: Path) -> None:
output.parent.mkdir(parents=True, exist_ok=True)
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
zip_path = Path(cache_dir) / "greenspace.zip"
extract_dir = Path(cache_dir) / "extracted"

View file

@ -11,6 +11,7 @@ from shapely.geometry import Point
from shapely.wkb import loads as load_wkb
from tqdm import tqdm
from pipeline.local_temp import local_tmp_dir
from pipeline.utils.england_geometry import (
ENGLAND_BBOX_EAST,
ENGLAND_BBOX_NORTH,
@ -184,7 +185,7 @@ def main() -> None:
england_polygon = load_england_polygon(args.boundary)
tmp_dir = Path(mkdtemp(prefix="pois_"))
tmp_dir = Path(mkdtemp(prefix="pois_", dir=local_tmp_dir()))
with tqdm(
unit=" elements",
unit_scale=True,

View file

@ -12,6 +12,7 @@ import tarfile
import tempfile
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download
URL = "https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2"
@ -37,7 +38,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
cache = Path(cache_dir)
archive_path = cache / "gb-postcodes-v5.tar.bz2"
extract_dir = cache / "extracted"

View file

@ -3,6 +3,7 @@ import tempfile
import polars as pl
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download
URL = "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv"
@ -55,7 +56,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
csv_path = Path(cache_dir) / "price-paid-complete.csv"
download(URL, csv_path)

View file

@ -13,6 +13,7 @@ from pathlib import Path
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download
URL = "https://www.ons.gov.uk/file?uri=/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics/25march2026/priceindexofprivaterentsukmonthlypricestatistics.xlsx"
@ -114,7 +115,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
xlsx_path = Path(cache_dir) / "pipr_monthly.xlsx"
download(URL, xlsx_path, timeout=120)
convert_to_parquet(xlsx_path, args.output)

View file

@ -36,6 +36,8 @@ from pathlib import Path
from tqdm import tqdm
from pipeline.local_temp import local_tmp_dir
ENGLAND_PBF_URL = (
"https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf"
)
@ -164,7 +166,10 @@ def clean_gtfs(src: Path, dst: Path) -> None:
)
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
@ -388,7 +393,10 @@ def convert_high_freq_to_frequency_based(
trip_id_idx = cols.index("trip_id")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
for line in f:
@ -408,7 +416,10 @@ def convert_high_freq_to_frequency_based(
trip_id_idx = cols.index("trip_id")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
for line in f:
@ -451,8 +462,8 @@ def download_tfl_transxchange(raw_dir: Path) -> Path:
def download_naptan() -> None:
"""Download NaPTAN stops to /tmp/Stops.csv (needed by transxchange2gtfs)."""
dest = Path("/tmp/Stops.csv")
"""Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
dest = local_tmp_dir() / "Stops.csv"
if dest.exists():
print(f"NaPTAN Stops.csv already exists: {dest}")
return
@ -661,7 +672,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
)
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
@ -718,7 +732,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
lon_idx = cols.index("stop_lon")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
@ -749,7 +766,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
rt_idx = cols.index("route_type")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
@ -774,7 +794,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
trip_id_idx = cols.index("trip_id")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
@ -797,7 +820,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
end_idx = cols.index("end_date")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)

View file

@ -15,6 +15,16 @@ if (!pkgDirArg || converterArgs.length < 2) {
}
const pkgDir = path.resolve(pkgDirArg);
const defaultTmpDir = path.resolve(__dirname, "..", "..", ".tmp");
const localTmpDir =
process.env.TMPDIR || process.env.TEMP || process.env.TMP || defaultTmpDir;
const stopsCsv = path.join(localTmpDir, "Stops.csv");
const converterTmpPrefix = path.join(localTmpDir, "transxchange2gtfs_");
const converterTmpPatch =
`static TMP = ${JSON.stringify(converterTmpPrefix)}` +
` + process.pid + ${JSON.stringify(path.sep)};`;
fs.mkdirSync(localTmpDir, { recursive: true });
function replaceOnce(relativePath, before, after) {
const file = path.join(pkgDir, relativePath);
@ -37,6 +47,26 @@ function replaceOnce(relativePath, before, after) {
// GTFS shapes are optional for R5 routing. Clear shape references and omit
// shapes.txt so missing route geometry does not drop otherwise usable trips.
function patchPackage() {
replaceOnce(
"dist/Container.js",
"static TMP = `/tmp/transxchange2gtfs_${process.pid}/`;",
converterTmpPatch,
);
replaceOnce(
"dist/Container.js",
'fs.existsSync("/tmp/Stops.csv")',
`fs.existsSync(${JSON.stringify(stopsCsv)})`,
);
replaceOnce(
"dist/Container.js",
'fs.createReadStream("/tmp/Stops.csv", "utf8")',
`fs.createReadStream(${JSON.stringify(stopsCsv)}, "utf8")`,
);
replaceOnce(
"dist/converter/GetStopData.js",
'fs.createWriteStream("/tmp/Stops.csv")',
`fs.createWriteStream(${JSON.stringify(stopsCsv)})`,
);
replaceOnce(
"dist/transxchange/TransXChangeJourneyStream.js",
"distanceSoFarM += routeLink.Distance;",

View file

@ -13,6 +13,7 @@ from pathlib import Path
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download, extract_zip
URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
@ -62,7 +63,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
zip_path = Path(cache_dir) / "uprn_lookup.zip"
extract_path = Path(cache_dir) / "uprn_extracted"

View file

@ -10,6 +10,8 @@ import pyarrow as pa
import pyarrow.csv as pa_csv
import pyarrow.parquet as pq
from pipeline.local_temp import local_tmp_dir
from ..utils import (
fuzzy_join_on_postcode,
normalize_address_key,
@ -192,7 +194,9 @@ def main():
)
args = parser.parse_args()
with tempfile.TemporaryDirectory(prefix="epc_certificates_") as tmpdir:
with tempfile.TemporaryDirectory(
prefix="epc_certificates_", dir=local_tmp_dir()
) as tmpdir:
_run(args.epc, args.price_paid, args.output, Path(tmpdir))

View file

@ -3,6 +3,8 @@ from pathlib import Path
import numpy as np
import polars as pl
from pipeline.local_temp import local_tmp_dir
from .memory import release_memory
@ -17,7 +19,9 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]
print("Loading UPRN lookup...")
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
with tempfile.NamedTemporaryFile(
suffix=".parquet", delete=False, dir=local_tmp_dir()
) as tmp:
tmp_path = Path(tmp.name)
(
pl.scan_parquet(uprn_path)

View file

@ -79,6 +79,39 @@ def test_transform_grocery_retail_points_keeps_fascia_icon_category():
]
def test_transform_grocery_retail_points_merges_cooperative_societies():
    """Regional co-operative retailers all surface under the single "Co-op" category.

    The three input rows use different society names as the retailer and vary
    the fascia (society name, "The Co-operative Food", and missing) to check
    that both ``category`` and ``icon_category`` come out as "Co-op".
    """
    columns = {
        "id": [101, 102, 103],
        "retailer": [
            "Central England Co-operative",
            "Lincolnshire Co-operative",
            "The Southern Co-operative",
        ],
        "fascia": [
            "Central England Co-operative",
            "The Co-operative Food",
            None,
        ],
        "store_name": [
            "Central Co-op Test",
            "Lincolnshire Co-op Test",
            "Southern Co-op Test",
        ],
        "long_wgs": [-0.141, -0.142, -0.143],
        "lat_wgs": [51.515, 51.516, 51.517],
    }

    transformed = transform_grocery_retail_points(
        pl.DataFrame(columns), min_chain_locations=1
    )
    observed = transformed.select("category", "icon_category").to_dicts()

    expected_row = {"category": "Co-op", "icon_category": "Co-op"}
    assert observed == [expected_row] * 3
def test_transform_grocery_retail_points_accepts_base_fascias():
raw = pl.DataFrame(
{

View file

@ -623,6 +623,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"shop/outpost",
"shop/pawnbroker",
"shop/photo",
"shop/photo_studio",
"shop/plant_hire",
"shop/printer_ink",
"shop/printing",
@ -843,6 +844,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"healthcare/physiotherapist",
"healthcare/podiatrist",
"healthcare/occupational_therapist",
],
),
(
@ -1171,7 +1173,6 @@ GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = {
"Heron": "Heron Foods",
"Marks and Spencer": "M&S",
"Sainsburys": "Sainsbury's",
"The Co-operative Group": "Co-op",
}
@ -1238,6 +1239,8 @@ def normalize_grocery_retailer(retailer: str | None) -> str:
if retailer is None:
return ""
retailer = retailer.strip()
if retailer in COOP_RETAILERS:
return "Co-op"
return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)

View file

@ -9,6 +9,8 @@ import polars as pl
from thefuzz import fuzz
from tqdm import tqdm
from pipeline.local_temp import local_tmp_dir
_NUMBER_RE = re.compile(r"\d+")
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
MIN_FUZZY_SCORE = 60
@ -57,7 +59,7 @@ def fuzzy_join_on_postcode(
have null right columns.
"""
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_", dir=local_tmp_dir())
left_path = Path(tmpdir) / "left.parquet"
right_path = Path(tmpdir) / "right.parquet"

View file

@ -1,2 +0,0 @@
*
!.gitignore

View file

@ -6,6 +6,8 @@ use polars::prelude::*;
use serde::Serialize;
use tracing::info;
use crate::consts::{NAN_U16, QUANT_SCALE};
use crate::data::{PropertyData, QuantRef};
use crate::utils::{normalize_postcode, GridIndex, InternedColumn};
const GRID_CELL_SIZE: f32 = 0.01;
@ -52,15 +54,22 @@ pub struct ActualListingData {
pub listing_status: InternedColumn,
pub listing_date_iso: Vec<Option<String>>,
pub features: Vec<Vec<String>>,
/// Row-major feature matrix aligned with PropertyData::feature_names.
///
/// Rows start from a best-effort address/postcode join to the historical property
/// dataset, then live listing fields such as asking price and property type are
/// overlaid where available. This lets the listings endpoint use the same filter
/// execution path as the property endpoints.
pub filter_feature_data: Vec<u16>,
pub grid: GridIndex,
}
impl ActualListingData {
pub fn load(parquet_path: &Path) -> Result<Self> {
super::run_polars_io(|| Self::load_inner(parquet_path))
pub fn load(parquet_path: &Path, property_data: &PropertyData) -> Result<Self> {
super::run_polars_io(|| Self::load_inner(parquet_path, Some(property_data)))
}
fn load_inner(parquet_path: &Path) -> Result<Self> {
fn load_inner(parquet_path: &Path, property_data: Option<&PropertyData>) -> Result<Self> {
info!("Loading actual listings from {:?}", parquet_path);
let pl_path = PlRefPath::try_from_path(parquet_path)
.context("Failed to normalize actual listings parquet path")?;
@ -99,6 +108,18 @@ impl ActualListingData {
let price_qualifier = InternedColumn::build(&opt_to_string(&price_qualifier_raw));
let listing_status = InternedColumn::build(&opt_to_string(&listing_status_raw));
let filter_feature_data = build_filter_feature_data(
property_data,
&postcode,
&address,
&property_type_raw,
&leasehold_freehold_raw,
&rooms_total,
&floor_area_sqm,
&asking_price,
&asking_price_per_sqm,
);
let grid = GridIndex::build(&lat, &lon, GRID_CELL_SIZE);
info!(rows = row_count, "Actual listings loaded");
@ -122,6 +143,7 @@ impl ActualListingData {
listing_status,
listing_date_iso,
features,
filter_feature_data,
grid,
})
}
@ -150,6 +172,201 @@ impl ActualListingData {
}
}
/// Builds the row-major quantized feature matrix used to filter live listings.
///
/// Returns an empty vector when `property_data` is `None`. Otherwise the result
/// has `postcode.len() * property_data.num_features` entries, all starting as
/// `NAN_U16`:
///
/// 1. Every listing with a non-blank address is looked up through
///    `PropertyData::search_addresses` using `"<address> <postcode>"`. When the
///    top hit carries the same postcode, that property's feature row is copied
///    into the listing's row.
/// 2. Listing-level values are then written over the matching feature columns.
///    Price columns are overlaid with `clear_missing = true`, so a listing
///    without an asking price does not keep a price inherited from the join.
///
/// All input slices are indexed by listing row.
/// NOTE(review): `address` is indexed directly by row, so it is assumed to be
/// at least as long as `postcode` — confirm at the call site.
// The inputs are parallel listing columns; bundling them would only move the
// argument list into a throwaway struct.
#[allow(clippy::too_many_arguments)]
fn build_filter_feature_data(
    property_data: Option<&PropertyData>,
    postcode: &[String],
    address: &[Option<String>],
    property_type: &[Option<String>],
    leasehold_freehold: &[Option<String>],
    rooms_total: &[Option<i32>],
    floor_area_sqm: &[Option<f32>],
    asking_price: &[Option<i64>],
    asking_price_per_sqm: &[Option<f32>],
) -> Vec<u16> {
    let property_data = match property_data {
        Some(data) => data,
        None => return Vec::new(),
    };

    let stride = property_data.num_features;
    let mut matrix = vec![NAN_U16; postcode.len() * stride];
    let mut joined_rows = 0usize;

    // Step 1: seed rows from the historical property dataset where the
    // address search lands on a property in the same postcode.
    for (row, listing_postcode) in postcode.iter().enumerate() {
        let street = match address[row].as_deref().map(str::trim) {
            Some(text) if !text.is_empty() => text,
            _ => continue,
        };

        let lookup = format!("{street} {listing_postcode}");
        let Some(&matched_row) = property_data.search_addresses(&lookup, 1).first() else {
            continue;
        };

        if property_data.postcode(matched_row) == listing_postcode {
            let to = row * stride;
            let from = matched_row * stride;
            matrix[to..to + stride]
                .copy_from_slice(&property_data.feature_data[from..from + stride]);
            joined_rows += 1;
        }
    }

    // Step 2: overlay values taken from the live listing itself.
    let quant = property_data.quant_ref();

    // Size-related columns keep the joined value when the listing has none.
    overlay_numeric_feature(
        &mut matrix,
        property_data,
        &quant,
        "Total floor area (sqm)",
        floor_area_sqm.iter().copied(),
        false,
    );
    overlay_numeric_feature(
        &mut matrix,
        property_data,
        &quant,
        "Number of bedrooms & living rooms",
        rooms_total.iter().map(|rooms| rooms.map(|count| count as f32)),
        false,
    );

    // Both price columns are driven by the asking price and cleared when absent.
    overlay_numeric_feature(
        &mut matrix,
        property_data,
        &quant,
        "Estimated current price",
        asking_price.iter().map(|price| price.map(|amount| amount as f32)),
        true,
    );
    overlay_numeric_feature(
        &mut matrix,
        property_data,
        &quant,
        "Last known price",
        asking_price.iter().map(|price| price.map(|amount| amount as f32)),
        true,
    );

    // Likewise for the two price-per-area columns.
    overlay_numeric_feature(
        &mut matrix,
        property_data,
        &quant,
        "Est. price per sqm",
        asking_price_per_sqm.iter().copied(),
        true,
    );
    overlay_numeric_feature(
        &mut matrix,
        property_data,
        &quant,
        "Price per sqm",
        asking_price_per_sqm.iter().copied(),
        true,
    );

    // Categorical columns keep the joined value when the listing's label is
    // missing or not part of the feature's known values.
    overlay_enum_feature(
        &mut matrix,
        property_data,
        "Property type",
        property_type.iter().map(Option::as_deref),
        false,
    );
    overlay_enum_feature(
        &mut matrix,
        property_data,
        "Leasehold/Freehold",
        leasehold_freehold.iter().map(Option::as_deref),
        false,
    );

    info!(
        rows = postcode.len(),
        joined_rows, "Actual listings joined to property feature matrix"
    );

    matrix
}
/// Returns the position of the first entry in `property_data.feature_names`
/// equal to `name`, or `None` when no feature has that name.
fn feature_index(property_data: &PropertyData, name: &str) -> Option<usize> {
    for (index, candidate) in property_data.feature_names.iter().enumerate() {
        if candidate == name {
            return Some(index);
        }
    }
    None
}
/// Writes one numeric listing column into the row-major `feature_data` matrix.
///
/// Does nothing when `name` is not a known feature or when its index is not
/// below `property_data.num_numeric`. Otherwise, for the n-th item of `values`:
///
/// * `Some(v)` stores the quantized encoding of `v` in row n;
/// * `None` stores `NAN_U16` when `clear_missing` is set, and leaves the
///   existing cell untouched otherwise.
fn overlay_numeric_feature<I>(
    feature_data: &mut [u16],
    property_data: &PropertyData,
    quant: &QuantRef<'_>,
    name: &str,
    values: I,
    clear_missing: bool,
) where
    I: IntoIterator<Item = Option<f32>>,
{
    let feat_idx = match feature_index(property_data, name) {
        Some(index) if index < property_data.num_numeric => index,
        _ => return,
    };

    let stride = property_data.num_features;
    for (row, maybe_value) in values.into_iter().enumerate() {
        let cell = row * stride + feat_idx;
        if let Some(raw) = maybe_value {
            feature_data[cell] = encode_numeric_value(quant, feat_idx, raw);
        } else if clear_missing {
            feature_data[cell] = NAN_U16;
        }
    }
}
/// Writes one categorical listing column into the row-major `feature_data` matrix.
///
/// Does nothing when `name` is not a known feature or has no entry in
/// `property_data.enum_values`. Otherwise each label is trimmed and matched
/// exactly against the feature's value list; the stored code is the label's
/// position in that list.
///
/// A missing, blank, or unrecognised label stores `NAN_U16` when
/// `clear_missing` is set and leaves the existing cell untouched otherwise.
fn overlay_enum_feature<'a, I>(
    feature_data: &mut [u16],
    property_data: &PropertyData,
    name: &str,
    values: I,
    clear_missing: bool,
) where
    I: IntoIterator<Item = Option<&'a str>>,
{
    let feat_idx = match feature_index(property_data, name) {
        Some(index) => index,
        None => return,
    };
    let labels = match property_data.enum_values.get(&feat_idx) {
        Some(labels) => labels,
        None => return,
    };

    let stride = property_data.num_features;
    for (row, raw_label) in values.into_iter().enumerate() {
        let cell = row * stride + feat_idx;

        // Treat an absent label the same as a blank one.
        let label = raw_label.map(str::trim).unwrap_or("");
        let code = if label.is_empty() {
            None
        } else {
            labels
                .iter()
                .position(|candidate| candidate == label)
                .map(|index| index as u16)
        };

        if let Some(code) = code {
            feature_data[cell] = code;
        } else if clear_missing {
            feature_data[cell] = NAN_U16;
        }
    }
}
/// Quantizes `value` for feature `feat_idx` into the `u16` storage encoding.
///
/// * Non-finite input (NaN or ±infinity) encodes as `NAN_U16`.
/// * A feature whose quantization range is zero or negative encodes as `0`.
/// * Otherwise the value is mapped linearly from
///   `[quant_min, quant_min + quant_range]` onto `[0, QUANT_SCALE]`, rounded
///   to the nearest step and clamped to that interval.
///
/// NOTE(review): if `QUANT_SCALE` converts to the same `u16` as `NAN_U16`, a
/// value clamped to the top of the range would be indistinguishable from a
/// missing one — confirm the constants rule this out.
fn encode_numeric_value(quant: &QuantRef<'_>, feat_idx: usize, value: f32) -> u16 {
    if value.is_nan() || value.is_infinite() {
        return NAN_U16;
    }

    let span = quant.quant_range[feat_idx];
    if span <= 0.0 {
        return 0;
    }

    let fraction = (value - quant.quant_min[feat_idx]) / span;
    let steps = (fraction * QUANT_SCALE).round();
    steps.clamp(0.0, QUANT_SCALE) as u16
}
fn opt_to_string(values: &[Option<String>]) -> Vec<String> {
values
.iter()
@ -311,7 +528,7 @@ mod tests {
return;
};
let data = ActualListingData::load(&path).expect("listings load");
let data = ActualListingData::load_inner(&path, None).expect("listings load");
assert!(!data.lat.is_empty());
assert_eq!(data.lat.len(), data.lon.len());
assert_eq!(data.lat.len(), data.postcode.len());

View file

@ -30,16 +30,6 @@ const GROCERY_DASHBOARD_CATEGORIES: &[&str] = &[
"Budgens",
"Centra",
"Co-op",
"Central England Co-operative",
"Chelmsford Star Co-operative Society",
"East of England Co-operative",
"Heart of England Co-operative",
"Lincolnshire Co-operative",
"Midcounties Co-operative",
"Scottish Midland Co-operative",
"Tamworth Co-operative Society",
"The Radstock Co-operative Society",
"The Southern Co-operative",
"COOK",
"Costco",
"Dunnes Stores",
@ -104,10 +94,35 @@ fn add_category_filter_index(
}
}
/// Maps a POI category name onto its canonical form.
///
/// Every known Co-operative society or brand variant collapses to `"Co-op"`;
/// any other category is returned unchanged. Matching is exact: the input is
/// neither trimmed nor case-folded here.
fn canonical_poi_category(category: &str) -> &str {
    /// Category names that are all reported under the single "Co-op" label.
    const CO_OP_ALIASES: &[&str] = &[
        "Allendale Co-operative Society",
        "Central England Co-operative",
        "Channel Islands Co-operative Society",
        "Chelmsford Star Co-operative Society",
        "Clydebank Co-operative",
        "Coniston Co-operative Society",
        "Co-op Food",
        "East of England Co-operative",
        "Heart of England Co-operative",
        "Langdale Co-operative Society",
        "Lincolnshire Co-operative",
        "Midcounties Co-operative",
        "Scottish Midland Co-operative",
        "Tamworth Co-operative Society",
        "The Co-operative Food",
        "The Co-operative Food PFS",
        "The Co-operative Group",
        "The Radstock Co-operative Society",
        "The Southern Co-operative",
    ];

    let is_co_op_alias = CO_OP_ALIASES.iter().any(|alias| *alias == category);
    if is_co_op_alias {
        "Co-op"
    } else {
        category
    }
}
pub fn resolve_poi_category_filter(category_values: &[String], categories: &str) -> FxHashSet<u16> {
let mut selected = FxHashSet::default();
for part in categories.split(',') {
let category = part.trim();
let category = canonical_poi_category(part.trim());
if category.is_empty() {
continue;
}
@ -200,12 +215,18 @@ impl POIData {
let id_raw: Vec<String> = extract_str_col(&df, "id")?;
let name = extract_str_col(&df, "name")?;
let category_raw = extract_str_col(&df, "category")?;
let category_raw: Vec<String> = extract_str_col(&df, "category")?
.into_iter()
.map(|category| canonical_poi_category(&category).to_string())
.collect();
let group_raw = extract_str_col(&df, "group")?;
let lat = extract_f32_col(&df, "lat")?;
let lng = extract_f32_col(&df, "lng")?;
let emoji_raw = extract_str_col(&df, "emoji")?;
let icon_category_raw = extract_str_col(&df, "icon_category")?;
let icon_category_raw: Vec<String> = extract_str_col(&df, "icon_category")?
.into_iter()
.map(|category| canonical_poi_category(&category).to_string())
.collect();
// Pack POI IDs into a contiguous buffer
let total_id_bytes: usize = id_raw.iter().map(|s| s.len()).sum();
@ -351,4 +372,19 @@ mod tests {
assert!(selected.is_empty());
}
#[test]
fn coop_category_aliases_resolve_to_single_category() {
    // Two different regional society names are requested; the test expects
    // them to resolve to the single "Co-op" entry stored at index 0.
    let category_values = vec!["Co-op".to_string(), "Tesco".to_string()];
    let requested = "Central England Co-operative,The Southern Co-operative";

    let resolved = resolve_poi_category_filter(&category_values, requested);

    assert!(resolved.contains(&0));
    assert_eq!(resolved.len(), 1);

    // The canonicalizer maps aliases and leaves unrelated brands alone.
    assert_eq!(canonical_poi_category("Lincolnshire Co-operative"), "Co-op");
    assert_eq!(canonical_poi_category("Tesco"), "Tesco");
}
}

View file

@ -1014,22 +1014,6 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
];
/// Names of the features that belong to the "Properties" and "Property prices"
/// groups of [`FEATURE_GROUPS`], in group order.
///
/// These describe an individual property (price, size, type, ...) rather than
/// its surroundings, so callers can use the list to skip filters that should
/// not exclude live listings on the map.
pub fn property_level_feature_names() -> Vec<&'static str> {
    const PROPERTY_GROUPS: &[&str] = &["Properties", "Property prices"];

    let mut names = Vec::new();
    for group in FEATURE_GROUPS.iter() {
        if !PROPERTY_GROUPS.contains(&group.name) {
            continue;
        }
        for feature in group.features.iter() {
            let name = match feature {
                Feature::Numeric(config) => config.name,
                Feature::Enum(config) => config.name,
            };
            names.push(name);
        }
    }
    names
}
/// Flat ordered list of all numeric feature names (follows group order).
pub fn all_numeric_feature_names() -> Vec<&'static str> {
FEATURE_GROUPS

View file

@ -541,7 +541,7 @@ async fn main() -> anyhow::Result<()> {
bail!("Actual listings parquet not found: {}", path.display());
}
info!("Loading actual listings from {}", path.display());
let listings = data::ActualListingData::load(path)?;
let listings = data::ActualListingData::load(path, &property_data)?;
trim_allocator("actual listings load");
info!(rows = listings.lat.len(), "Actual listings loaded");
Some(Arc::new(listings))

View file

@ -1,16 +1,20 @@
use std::sync::Arc;
use axum::extract::{Query, State};
use axum::response::Json;
use axum::response::{IntoResponse, Json, Response};
use axum::Extension;
use rustc_hash::FxHashSet;
use serde::{Deserialize, Serialize};
use tracing::info;
use crate::api_error::ApiError;
use crate::auth::OptionalUser;
use crate::consts::NAN_U16;
use crate::data::ActualListing;
use crate::features::property_level_feature_names;
use crate::licensing::{check_license_bounds, resolve_share_code};
use crate::parsing::{
parse_filters_with_poi, require_bounds, row_passes_filters, row_passes_poi_filters,
ParsedEnumFilter, ParsedFilter,
};
use crate::state::{AppState, SharedState};
@ -25,6 +29,8 @@ pub struct ActualListingsParams {
travel: Option<String>,
/// Number of results to skip. Defaults to 0.
offset: Option<usize>,
/// Share-link code; grants bbox-scoped access for unlicensed users.
share: Option<String>,
}
#[derive(Serialize)]
@ -35,10 +41,24 @@ pub struct ActualListingsResponse {
pub truncated: bool,
}
/// Feature names whose filters are evaluated against each listing's own row
/// (via `row_passes_listing_filters`) instead of being folded into the
/// postcode-level filter set. Names absent from the feature index are ignored
/// when these are resolved to indices.
const LISTING_LEVEL_FILTER_FEATURES: &[&str] = &[
"Property type",
"Leasehold/Freehold",
"Total floor area (sqm)",
"Number of bedrooms & living rooms",
"Estimated current price",
"Last known price",
"Est. price per sqm",
"Price per sqm",
];
/// Listing-level numeric features for which a listing with an unknown
/// (`NAN_U16`) value still passes the filter rather than being excluded.
const KEEP_UNKNOWN_LISTING_FILTER_FEATURES: &[&str] = &["Total floor area (sqm)"];
pub async fn get_actual_listings(
State(shared): State<Arc<SharedState>>,
Extension(user): Extension<OptionalUser>,
Query(params): Query<ActualListingsParams>,
) -> Result<Json<ActualListingsResponse>, ApiError> {
) -> Result<Json<ActualListingsResponse>, Response> {
let state = shared.load_state();
let offset = params.offset.unwrap_or(0);
let Some(actual_listings) = state.actual_listings.clone() else {
@ -49,11 +69,15 @@ pub async fn get_actual_listings(
truncated: false,
}));
};
let (south, west, north, east) = require_bounds(params.bounds).map_err(ApiError::from)?;
let (south, west, north, east) =
require_bounds(params.bounds).map_err(IntoResponse::into_response)?;
let share_bounds = resolve_share_code(&state, params.share.as_deref()).await;
check_license_bounds(&user.0, (south, west, north, east), share_bounds)?;
let quant = state.data.quant_ref();
let poi_quant = state.data.poi_metrics.quant_ref();
let (mut parsed_filters, mut parsed_enum_filters, parsed_poi_filters) = parse_filters_with_poi(
let (parsed_filters, parsed_enum_filters, parsed_poi_filters) = parse_filters_with_poi(
params.filters.as_deref(),
&state.feature_name_to_index,
&state.data.enum_values,
@ -61,40 +85,38 @@ pub async fn get_actual_listings(
&state.data.poi_metrics.name_to_index,
&poi_quant,
)
.map_err(ApiError::BadRequest)?;
.map_err(|err| ApiError::BadRequest(err).into_response())?;
// Drop property-level filters (price, sqm, build year, beds, type, etc.) so they
// don't hide live listings — those are individual-property concerns the user can
// judge from the pin itself. We only keep area/postcode-level filters here.
let property_level_idxs: FxHashSet<usize> = property_level_feature_names()
.into_iter()
.filter_map(|name| state.feature_name_to_index.get(name).copied())
.collect();
parsed_filters.retain(|f| !property_level_idxs.contains(&f.feat_idx));
parsed_enum_filters.retain(|f| !property_level_idxs.contains(&f.feat_idx));
let travel_entries = parse_optional_travel(params.travel.as_deref())
.map_err(|err| ApiError::BadRequest(err).into_response())?;
let travel_entries =
parse_optional_travel(params.travel.as_deref()).map_err(ApiError::BadRequest)?;
let listing_level_feature_idxs = listing_level_filter_feature_idxs(&state);
let keep_unknown_listing_filter_idxs = keep_unknown_listing_filter_feature_idxs(&state);
let (listing_filters, postcode_filters) =
split_numeric_filters(parsed_filters, &listing_level_feature_idxs);
let (listing_enum_filters, postcode_enum_filters) =
split_enum_filters(parsed_enum_filters, &listing_level_feature_idxs);
let has_area_filters = !parsed_filters.is_empty()
|| !parsed_enum_filters.is_empty()
let has_postcode_filters = !postcode_filters.is_empty()
|| !postcode_enum_filters.is_empty()
|| !parsed_poi_filters.is_empty()
|| !travel_entries.is_empty();
let has_listing_filters = !listing_filters.is_empty() || !listing_enum_filters.is_empty();
let state_clone = state.clone();
let response =
tokio::task::spawn_blocking(move || -> Result<ActualListingsResponse, String> {
let t0 = std::time::Instant::now();
let passing_postcodes = if has_area_filters {
let passing_postcodes = if has_postcode_filters {
Some(compute_passing_postcodes(
&state_clone,
south,
west,
north,
east,
&parsed_filters,
&parsed_enum_filters,
&postcode_filters,
&postcode_enum_filters,
&parsed_poi_filters,
&travel_entries,
)?)
@ -116,6 +138,18 @@ pub async fn get_actual_listings(
return None;
}
}
if has_listing_filters
&& !row_passes_listing_filters(
row,
&listing_filters,
&listing_enum_filters,
&actual_listings.filter_feature_data,
state_clone.data.num_features,
&keep_unknown_listing_filter_idxs,
)
{
return None;
}
Some(row)
})
.collect();
@ -142,7 +176,8 @@ pub async fn get_actual_listings(
total = total_matching,
total_in_bounds,
offset,
filtered = passing_postcodes.is_some(),
postcode_filtered = passing_postcodes.is_some(),
listing_filtered = has_listing_filters,
ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0),
"GET /api/actual-listings"
);
@ -155,12 +190,82 @@ pub async fn get_actual_listings(
})
})
.await
.map_err(|error| ApiError::Internal(error.to_string()))?
.map_err(ApiError::Internal)?;
.map_err(|error| ApiError::Internal(error.to_string()).into_response())?
.map_err(|err| ApiError::Internal(err).into_response())?;
Ok(Json(response))
}
/// Resolves `LISTING_LEVEL_FILTER_FEATURES` to the set of feature indices
/// known to `state`; names missing from the index are dropped.
fn listing_level_filter_feature_idxs(state: &AppState) -> FxHashSet<usize> {
feature_idxs(state, LISTING_LEVEL_FILTER_FEATURES)
}
/// Resolves `KEEP_UNKNOWN_LISTING_FILTER_FEATURES` to the set of feature
/// indices known to `state`; names missing from the index are dropped.
fn keep_unknown_listing_filter_feature_idxs(state: &AppState) -> FxHashSet<usize> {
feature_idxs(state, KEEP_UNKNOWN_LISTING_FILTER_FEATURES)
}
fn feature_idxs(state: &AppState, names: &[&str]) -> FxHashSet<usize> {
names
.iter()
.filter_map(|name| state.feature_name_to_index.get(*name).copied())
.collect()
}
/// Partitions numeric filters into `(listing_filters, postcode_filters)`.
///
/// A filter goes into the first vector when its `feat_idx` is contained in
/// `listing_level_feature_idxs`, otherwise into the second. The relative
/// order of the input is preserved within each vector.
fn split_numeric_filters(
    filters: Vec<ParsedFilter>,
    listing_level_feature_idxs: &FxHashSet<usize>,
) -> (Vec<ParsedFilter>, Vec<ParsedFilter>) {
    filters
        .into_iter()
        .partition(|filter| listing_level_feature_idxs.contains(&filter.feat_idx))
}
/// Partitions enum filters into `(listing_filters, postcode_filters)`.
///
/// A filter goes into the first vector when its `feat_idx` is contained in
/// `listing_level_feature_idxs`, otherwise into the second. The relative
/// order of the input is preserved within each vector.
fn split_enum_filters(
    filters: Vec<ParsedEnumFilter>,
    listing_level_feature_idxs: &FxHashSet<usize>,
) -> (Vec<ParsedEnumFilter>, Vec<ParsedEnumFilter>) {
    filters
        .into_iter()
        .partition(|filter| listing_level_feature_idxs.contains(&filter.feat_idx))
}
/// Reports whether listing `row` satisfies every numeric and enum filter.
///
/// `feature_data` is read as a flat matrix with `num_features` values per
/// row. Numeric filters are checked first, then enum filters; evaluation
/// stops at the first failing filter.
///
/// * Numeric filter: the stored value must lie in `min_u16..=max_u16`. A
///   `NAN_U16` (unknown) value passes only if the filter's feature index is
///   in `keep_unknown_filter_idxs`.
/// * Enum filter: the stored value must be known and contained in the
///   filter's `allowed` set.
fn row_passes_listing_filters(
    row: usize,
    filters: &[ParsedFilter],
    enum_filters: &[ParsedEnumFilter],
    feature_data: &[u16],
    num_features: usize,
    keep_unknown_filter_idxs: &FxHashSet<usize>,
) -> bool {
    let row_start = row * num_features;

    let numeric_ok = |filter: &ParsedFilter| -> bool {
        let stored = feature_data[row_start + filter.feat_idx];
        if stored == NAN_U16 {
            return keep_unknown_filter_idxs.contains(&filter.feat_idx);
        }
        (filter.min_u16..=filter.max_u16).contains(&stored)
    };

    let enum_ok = |filter: &ParsedEnumFilter| -> bool {
        let stored = feature_data[row_start + filter.feat_idx];
        stored != NAN_U16 && filter.allowed.contains(&stored)
    };

    filters.iter().all(numeric_ok) && enum_filters.iter().all(enum_ok)
}
#[allow(clippy::too_many_arguments)]
fn compute_passing_postcodes(
state: &AppState,
@ -224,3 +329,111 @@ fn compute_passing_postcodes(
Ok(passing)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Numeric filter on `feat_idx` accepting stored values in `min_u16..=max_u16`.
    fn ranged_filter(feat_idx: usize, min_u16: u16, max_u16: u16) -> ParsedFilter {
        ParsedFilter {
            feat_idx,
            min_u16,
            max_u16,
        }
    }

    /// Numeric filter with a fixed `0..=100` range; only the index matters.
    fn numeric_filter(feat_idx: usize) -> ParsedFilter {
        ranged_filter(feat_idx, 0, 100)
    }

    /// Enum filter on `feat_idx` that allows only the encoded value `0`.
    fn enum_filter(feat_idx: usize) -> ParsedEnumFilter {
        ParsedEnumFilter {
            feat_idx,
            allowed: [0u16].into_iter().collect(),
        }
    }

    fn numeric_idxs(filters: &[ParsedFilter]) -> Vec<usize> {
        filters.iter().map(|filter| filter.feat_idx).collect()
    }

    fn enum_idxs(filters: &[ParsedEnumFilter]) -> Vec<usize> {
        filters.iter().map(|filter| filter.feat_idx).collect()
    }

    #[test]
    fn splits_actual_listing_filters_by_listing_native_features() {
        let listing_level_feature_idxs: FxHashSet<usize> = [1usize, 3].into_iter().collect();

        let (listing_filters, postcode_filters) = split_numeric_filters(
            vec![numeric_filter(0), numeric_filter(1), numeric_filter(3)],
            &listing_level_feature_idxs,
        );
        assert_eq!(numeric_idxs(&listing_filters), vec![1, 3]);
        assert_eq!(numeric_idxs(&postcode_filters), vec![0]);

        let (listing_enum_filters, postcode_enum_filters) = split_enum_filters(
            vec![enum_filter(2), enum_filter(3)],
            &listing_level_feature_idxs,
        );
        assert_eq!(enum_idxs(&listing_enum_filters), vec![3]);
        assert_eq!(enum_idxs(&postcode_enum_filters), vec![2]);
    }

    #[test]
    fn listing_floor_area_filter_keeps_unknown_values() {
        let keep_unknown_filter_idxs: FxHashSet<usize> = [0usize].into_iter().collect();

        // Single-feature, single-row matrix holding `stored`, checked against
        // a floor-area style filter of 10..=20 on feature 0.
        let passes = |stored: u16| {
            row_passes_listing_filters(
                0,
                &[ranged_filter(0, 10, 20)],
                &[],
                &[stored],
                1,
                &keep_unknown_filter_idxs,
            )
        };

        assert!(passes(NAN_U16));
        assert!(!passes(9));
        assert!(passes(15));
    }
}