All good
This commit is contained in:
parent
6ea544a0f6
commit
6cc7288126
45 changed files with 929 additions and 1043 deletions
|
|
@ -39,8 +39,10 @@ jobs:
|
||||||
host="127.0.0.1:13000"
|
host="127.0.0.1:13000"
|
||||||
fi
|
fi
|
||||||
repo=$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]')
|
repo=$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]')
|
||||||
|
owner="${repo%%/*}"
|
||||||
{
|
{
|
||||||
echo "host=${host}"
|
echo "host=${host}"
|
||||||
|
echo "owner=${owner}"
|
||||||
echo "image=${host}/${repo}"
|
echo "image=${host}/${repo}"
|
||||||
echo "screenshot_image=${host}/${repo}-screenshot"
|
echo "screenshot_image=${host}/${repo}-screenshot"
|
||||||
} >> "$GITHUB_OUTPUT"
|
} >> "$GITHUB_OUTPUT"
|
||||||
|
|
@ -49,8 +51,8 @@ jobs:
|
||||||
uses: https://github.com/docker/login-action@v3
|
uses: https://github.com/docker/login-action@v3
|
||||||
with:
|
with:
|
||||||
registry: ${{ steps.registry.outputs.host }}
|
registry: ${{ steps.registry.outputs.host }}
|
||||||
username: ${{ gitea.actor }}
|
username: ${{ steps.registry.outputs.owner }}
|
||||||
password: ${{ secrets.GITEA_TOKEN }}
|
password: ${{ secrets.FORGEJO_PACKAGE_TOKEN }}
|
||||||
|
|
||||||
- name: Extract metadata (main)
|
- name: Extract metadata (main)
|
||||||
id: meta
|
id: meta
|
||||||
|
|
|
||||||
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -5,6 +5,7 @@
|
||||||
**/dist
|
**/dist
|
||||||
server-rs/target
|
server-rs/target
|
||||||
.task
|
.task
|
||||||
|
.tmp/
|
||||||
frontend/public/assets/*
|
frontend/public/assets/*
|
||||||
!frontend/public/assets/fonts/
|
!frontend/public/assets/fonts/
|
||||||
!frontend/public/assets/fonts/**
|
!frontend/public/assets/fonts/**
|
||||||
|
|
|
||||||
|
|
@ -29,8 +29,7 @@ services:
|
||||||
- .:/app
|
- .:/app
|
||||||
- cargo-home:/usr/local/cargo
|
- cargo-home:/usr/local/cargo
|
||||||
- cargo-target:/app/server-rs/target
|
- cargo-target:/app/server-rs/target
|
||||||
- ./property-data:/app/data:ro
|
- ./property-data2:/app/data:ro
|
||||||
- ./property-data/travel-times:/app/data/travel-times:ro
|
|
||||||
- ./finder/data:/app/finder-data:ro
|
- ./finder/data:/app/finder-data:ro
|
||||||
environment:
|
environment:
|
||||||
POCKETBASE_URL: http://pocketbase:8090
|
POCKETBASE_URL: http://pocketbase:8090
|
||||||
|
|
@ -51,7 +50,7 @@ services:
|
||||||
BUGSINK_ENVIRONMENT: ${BUGSINK_ENVIRONMENT:-development}
|
BUGSINK_ENVIRONMENT: ${BUGSINK_ENVIRONMENT:-development}
|
||||||
BUGSINK_RELEASE: ${BUGSINK_RELEASE:-}
|
BUGSINK_RELEASE: ${BUGSINK_RELEASE:-}
|
||||||
BUGSINK_SEND_DEFAULT_PII: ${BUGSINK_SEND_DEFAULT_PII:-false}
|
BUGSINK_SEND_DEFAULT_PII: ${BUGSINK_SEND_DEFAULT_PII:-false}
|
||||||
ACTUAL_LISTINGS_PATH: /app/finder-data/online_listings_buy_filtered.parquet
|
ACTUAL_LISTINGS_PATH: /app/finder-data/online_listings_buy.parquet
|
||||||
depends_on:
|
depends_on:
|
||||||
screenshot:
|
screenshot:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
|
|
||||||
|
|
@ -20,11 +20,6 @@ TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
||||||
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
|
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
|
||||||
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
||||||
|
|
||||||
# home.co.uk
|
|
||||||
HOMECOUK_BASE = "https://home.co.uk"
|
|
||||||
HOMECOUK_API_BASE = f"{HOMECOUK_BASE}/api"
|
|
||||||
HOMECOUK_PER_PAGE = 30 # max supported by the API
|
|
||||||
|
|
||||||
# Zoopla
|
# Zoopla
|
||||||
ZOOPLA_BASE = "https://www.zoopla.co.uk"
|
ZOOPLA_BASE = "https://www.zoopla.co.uk"
|
||||||
|
|
||||||
|
|
@ -108,13 +103,13 @@ PROPERTY_TYPE_MAP = {
|
||||||
"House Boat": "Other",
|
"House Boat": "Other",
|
||||||
"Barn": "Other",
|
"Barn": "Other",
|
||||||
"Serviced Apartments": "Other",
|
"Serviced Apartments": "Other",
|
||||||
# Space-separated variants (from home.co.uk underscore/hyphen normalization)
|
# Space-separated variants from legacy provider normalization.
|
||||||
"Semi Detached": "Semi-Detached",
|
"Semi Detached": "Semi-Detached",
|
||||||
"Semi Detached Bungalow": "Semi-Detached",
|
"Semi Detached Bungalow": "Semi-Detached",
|
||||||
"End Of Terrace": "Terraced",
|
"End Of Terrace": "Terraced",
|
||||||
"End Terrace": "Terraced",
|
"End Terrace": "Terraced",
|
||||||
"Block Of Apartments": "Other",
|
"Block Of Apartments": "Other",
|
||||||
# Lowercase variants (from home.co.uk / Rightmove APIs)
|
# Lowercase variants from listing APIs.
|
||||||
"house": "Detached",
|
"house": "Detached",
|
||||||
"bungalow": "Other",
|
"bungalow": "Other",
|
||||||
"townhouse": "Terraced",
|
"townhouse": "Terraced",
|
||||||
|
|
|
||||||
|
|
@ -1,461 +0,0 @@
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import random
|
|
||||||
import re
|
|
||||||
import time
|
|
||||||
from urllib.parse import unquote
|
|
||||||
|
|
||||||
from curl_cffi.requests import Session
|
|
||||||
from curl_cffi.requests.errors import RequestsError
|
|
||||||
|
|
||||||
from constants import (
|
|
||||||
DELAY_BETWEEN_PAGES,
|
|
||||||
HOMECOUK_API_BASE,
|
|
||||||
HOMECOUK_BASE,
|
|
||||||
HOMECOUK_PER_PAGE,
|
|
||||||
MAX_BEDROOMS,
|
|
||||||
PROPERTY_TYPE_MAP,
|
|
||||||
RETRY_BASE_DELAY,
|
|
||||||
)
|
|
||||||
from spatial import PostcodeSpatialIndex
|
|
||||||
from transform import (
|
|
||||||
normalize_postcode,
|
|
||||||
normalize_sub_type,
|
|
||||||
parse_int_value,
|
|
||||||
validate_floor_area,
|
|
||||||
)
|
|
||||||
|
|
||||||
log = logging.getLogger("homecouk")
|
|
||||||
|
|
||||||
|
|
||||||
class CookiesExpiredError(Exception):
    """Signal that home.co.uk answered HTTP 403, so the session cookies must be refreshed."""
|
|
||||||
|
|
||||||
|
|
||||||
class PaginationError(Exception):
    """Signal that paging through home.co.uk search results could not be finished."""
|
|
||||||
|
|
||||||
|
|
||||||
# URL path segment used for sale listings (the only channel this module queries)
|
|
||||||
HOMECOUK_URL_SEGMENT = "for-sale"
|
|
||||||
|
|
||||||
|
|
||||||
def load_cookies() -> tuple[dict[str, str], str] | None:
|
|
||||||
"""Get home.co.uk cookies + user-agent.
|
|
||||||
|
|
||||||
Environment cookies are optional. When they are not present, bootstrap a
|
|
||||||
regular local session by visiting home.co.uk with curl_cffi's Chrome
|
|
||||||
impersonation and reusing the cookies set by the site.
|
|
||||||
"""
|
|
||||||
user_agent = os.environ.get(
|
|
||||||
"HOMECOUK_USER_AGENT",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
|
||||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
||||||
"Chrome/145.0.0.0 Safari/537.36",
|
|
||||||
)
|
|
||||||
|
|
||||||
env_cookies = {
|
|
||||||
name: value
|
|
||||||
for name, value in {
|
|
||||||
"cf_clearance": os.environ.get("HOMECOUK_CF_CLEARANCE", ""),
|
|
||||||
"homecouk_session": os.environ.get("HOMECOUK_SESSION", ""),
|
|
||||||
"XSRF-TOKEN": os.environ.get("HOMECOUK_XSRF_TOKEN", ""),
|
|
||||||
}.items()
|
|
||||||
if value
|
|
||||||
}
|
|
||||||
if env_cookies.get("homecouk_session"):
|
|
||||||
return env_cookies, user_agent
|
|
||||||
|
|
||||||
session = Session(impersonate="chrome")
|
|
||||||
session.headers.update(
|
|
||||||
{
|
|
||||||
"User-Agent": user_agent,
|
|
||||||
"Accept": (
|
|
||||||
"text/html,application/xhtml+xml,application/xml;q=0.9,"
|
|
||||||
"*/*;q=0.8"
|
|
||||||
),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
for url in (HOMECOUK_BASE, f"{HOMECOUK_BASE}/for-sale/br1/"):
|
|
||||||
try:
|
|
||||||
response = session.get(url, timeout=30)
|
|
||||||
except RequestsError as exc:
|
|
||||||
log.warning("home.co.uk cookie bootstrap failed for %s: %s", url, exc)
|
|
||||||
continue
|
|
||||||
if response.status_code == 403:
|
|
||||||
raise CookiesExpiredError("home.co.uk returned HTTP 403 during bootstrap")
|
|
||||||
if response.status_code >= 400:
|
|
||||||
log.warning(
|
|
||||||
"home.co.uk cookie bootstrap got HTTP %d from %s",
|
|
||||||
response.status_code,
|
|
||||||
url,
|
|
||||||
)
|
|
||||||
|
|
||||||
cookies = session.cookies.get_dict()
|
|
||||||
if cookies.get("homecouk_session") and cookies.get("XSRF-TOKEN"):
|
|
||||||
log.info("home.co.uk local session bootstrapped")
|
|
||||||
return cookies, user_agent
|
|
||||||
|
|
||||||
log.warning("home.co.uk did not provide session cookies during bootstrap")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def make_client(cookies: dict[str, str], user_agent: str) -> Session:
    """Build the curl_cffi session used for home.co.uk API requests.

    The session impersonates Chrome at the TLS level so that cookies taken
    from a browser keep working, and is preloaded with the given cookies.
    """
    client = Session(impersonate="chrome")
    default_headers = {
        "User-Agent": user_agent,
        "Accept": "application/json, text/plain, */*",
        "x-requested-with": "XMLHttpRequest",
    }
    client.headers.update(default_headers)

    # Laravel CSRF protection: the XSRF-TOKEN cookie has to be echoed back,
    # URL-decoded, in the X-XSRF-TOKEN header, otherwise every request is
    # refused with 419/403.
    xsrf_cookie = cookies.get("XSRF-TOKEN")
    if xsrf_cookie:
        client.headers["X-XSRF-TOKEN"] = unquote(xsrf_cookie)

    for cookie_name in cookies:
        client.cookies.set(cookie_name, cookies[cookie_name], domain="home.co.uk")
    return client
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_page(
    client: Session, url: str, params: dict, max_retries: int = 3
) -> dict | None:
    """GET ``url`` and return the decoded JSON body, retrying transient failures.

    HTTP 429/500/502/503/504 and ``RequestsError`` are retried with
    exponential back-off plus jitter. No back-off is slept after the final
    attempt, since nothing would follow it.

    Args:
        client: session exposing ``get(url, params=..., timeout=...)``.
        url: endpoint to request.
        params: query parameters passed through to ``client.get``.
        max_retries: total number of attempts.

    Returns:
        The decoded JSON on HTTP 200; ``None`` for a non-JSON 200 body, a
        non-retryable status, or once every attempt has failed.

    Raises:
        CookiesExpiredError: the server answered HTTP 403.
    """
    retryable_statuses = (429, 500, 502, 503, 504)

    for attempt in range(max_retries):
        # CookiesExpiredError is not a RequestsError, so it propagates out of
        # this try block without needing an explicit re-raise clause.
        try:
            resp = client.get(url, params=params, timeout=30)
            status = resp.status_code
            if status == 200:
                try:
                    return resp.json()
                except json.JSONDecodeError:
                    log.error(
                        "Non-JSON response from %s (got %s)",
                        url,
                        resp.headers.get("content-type", "?"),
                    )
                    return None
            if status == 403:
                raise CookiesExpiredError("HTTP 403 — cookies likely expired")
            if status not in retryable_statuses:
                log.error("HTTP %d from %s (non-retryable)", status, url)
                return None
            failure = f"HTTP {status}"
        except RequestsError as exc:
            failure = type(exc).__name__

        if attempt + 1 >= max_retries:
            # Last attempt: report it, but do not waste a back-off delay.
            log.warning(
                "%s from %s, attempt %d/%d failed",
                failure,
                url,
                attempt + 1,
                max_retries,
            )
            break

        delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
        log.warning(
            "%s from %s, retry %d/%d in %.1fs",
            failure,
            url,
            attempt + 1,
            max_retries,
            delay,
        )
        time.sleep(delay)

    log.error("All %d retries exhausted for %s", max_retries, url)
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def _coerce_positive_int(value) -> int | None:
    """Parse ``value`` with ``parse_int_value``; return it only when it is above zero."""
    number = parse_int_value(value)
    rejected = number is None or number <= 0
    return None if rejected else number
|
|
||||||
|
|
||||||
|
|
||||||
def _property_identity(prop: dict, page: int, index: int) -> str:
|
|
||||||
for key in ("listing_id", "property_id", "id"):
|
|
||||||
value = prop.get(key)
|
|
||||||
if value:
|
|
||||||
return f"{key}:{value}"
|
|
||||||
return (
|
|
||||||
f"page:{page}:index:{index}:"
|
|
||||||
f"{prop.get('display_address') or prop.get('address') or ''}:"
|
|
||||||
f"{prop.get('price') or prop.get('latest_price') or ''}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_floor_area(description: str | None) -> float | None:
|
|
||||||
"""Try to extract floor area from description text like '789 sq.ft.' or '73 sq.m.'."""
|
|
||||||
if not description:
|
|
||||||
return None
|
|
||||||
m = re.search(
|
|
||||||
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))",
|
|
||||||
description,
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
if m:
|
|
||||||
sqft = float(m.group(1).replace(",", ""))
|
|
||||||
return validate_floor_area(round(sqft * 0.092903, 1))
|
|
||||||
m = re.search(
|
|
||||||
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))",
|
|
||||||
description,
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
if m:
|
|
||||||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def parse_tenure(prop: dict) -> str | None:
|
|
||||||
"""Extract tenure from home.co.uk property data.
|
|
||||||
|
|
||||||
Checks multiple sources in priority order:
|
|
||||||
1. Dedicated 'tenure' or 'tenure_type' field in the API response
|
|
||||||
2. Free-text search in the description for 'freehold' / 'leasehold'
|
|
||||||
3. Free-text search in features lists
|
|
||||||
|
|
||||||
home.co.uk aggregates listings from estate agents, so tenure is often
|
|
||||||
embedded in the description text rather than a structured field.
|
|
||||||
"""
|
|
||||||
# 1. Check dedicated tenure fields (in case the API adds them)
|
|
||||||
for key in ("tenure", "tenure_type", "tenureType"):
|
|
||||||
val = prop.get(key)
|
|
||||||
if val and isinstance(val, str):
|
|
||||||
lower = val.lower().strip()
|
|
||||||
if "leasehold" in lower:
|
|
||||||
return "Leasehold"
|
|
||||||
if "freehold" in lower:
|
|
||||||
return "Freehold"
|
|
||||||
|
|
||||||
# 2. Check description text — estate agents often include tenure here
|
|
||||||
description = prop.get("description") or ""
|
|
||||||
if description:
|
|
||||||
lower_desc = description.lower()
|
|
||||||
if re.search(r"\bleasehold\b", lower_desc):
|
|
||||||
return "Leasehold"
|
|
||||||
if re.search(r"\bfreehold\b", lower_desc):
|
|
||||||
# Matches "Freehold" and "Share of Freehold" (both = freehold ownership)
|
|
||||||
return "Freehold"
|
|
||||||
|
|
||||||
# 3. Check features / key_features lists if present
|
|
||||||
for key in ("features", "key_features", "keyFeatures"):
|
|
||||||
features = prop.get(key)
|
|
||||||
if features and isinstance(features, list):
|
|
||||||
for feat in features:
|
|
||||||
if not isinstance(feat, str):
|
|
||||||
continue
|
|
||||||
lower_feat = feat.lower()
|
|
||||||
if "leasehold" in lower_feat:
|
|
||||||
return "Leasehold"
|
|
||||||
if "freehold" in lower_feat:
|
|
||||||
return "Freehold"
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def map_property_type(raw_type: str | None) -> str:
|
|
||||||
"""Map home.co.uk property type to canonical type."""
|
|
||||||
if not raw_type:
|
|
||||||
return "Other"
|
|
||||||
canonical = PROPERTY_TYPE_MAP.get(raw_type)
|
|
||||||
if canonical:
|
|
||||||
return canonical
|
|
||||||
# Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc.
|
|
||||||
# Try common patterns
|
|
||||||
lower = raw_type.lower()
|
|
||||||
excluded_flat_like = (
|
|
||||||
"block of apartment",
|
|
||||||
"house of multiple occupation",
|
|
||||||
"private halls",
|
|
||||||
"retirement",
|
|
||||||
"serviced apartment",
|
|
||||||
)
|
|
||||||
if any(term in lower for term in excluded_flat_like):
|
|
||||||
return "Other"
|
|
||||||
if (
|
|
||||||
"flat" in lower
|
|
||||||
or "apartment" in lower
|
|
||||||
or "maisonette" in lower
|
|
||||||
or "studio" in lower
|
|
||||||
):
|
|
||||||
return "Flats/Maisonettes"
|
|
||||||
if "detached" in lower and "semi" not in lower:
|
|
||||||
return "Detached"
|
|
||||||
if "semi" in lower:
|
|
||||||
return "Semi-Detached"
|
|
||||||
if "terrace" in lower or "mews" in lower:
|
|
||||||
return "Terraced"
|
|
||||||
log.debug("Unknown property type: %r — mapping to Other", raw_type)
|
|
||||||
return "Other"
|
|
||||||
|
|
||||||
|
|
||||||
def transform_property(
    prop: dict,
    pc_index: PostcodeSpatialIndex,
) -> dict | None:
    """Convert one raw home.co.uk property record into the output schema.

    Returns ``None`` when the record has no coordinates, lies outside the
    accepted latitude/longitude box, has no positive price, or cannot be
    given a postcode.
    """
    lat, lng = prop.get("latitude"), prop.get("longitude")
    if lat is None or lng is None:
        return None

    # Bounding box the scraper treats as England.
    inside_box = 49 <= lat <= 56 and -7 <= lng <= 2
    if not inside_box:
        log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
        return None

    price = parse_int_value(prop.get("price")) or parse_int_value(
        prop.get("latest_price")
    )
    if not price or price <= 0:
        return None

    # Prefer the postcode from the record; otherwise ask the spatial index.
    postcode = prop.get("postcode") or pc_index.nearest(lat, lng)
    if not postcode:
        log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
        return None

    raw_beds = parse_int_value(prop.get("bedrooms")) or 0
    raw_baths = parse_int_value(prop.get("bathrooms")) or 0
    listing_id = prop.get("listing_id") or prop.get("property_id") or ""

    # Counts outside 0..MAX_BEDROOMS are replaced by 0.
    bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
    if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
        log.warning(
            "home.co.uk %s: implausible beds=%d baths=%d (capped to 0)",
            listing_id or "?",
            raw_beds,
            raw_baths,
        )

    listing_type = prop.get("listing_property_type") or prop.get("property_type") or ""
    address = prop.get("display_address") or prop.get("address") or ""

    # The price qualifier is derived from the reduction flags.
    if prop.get("is_reduced"):
        pct = prop.get("reduction_percent", 0)
        price_qualifier = f"Reduced by {pct}%" if pct else "Reduced"
    else:
        price_qualifier = ""

    return {
        "id": f"hk_{listing_id}",  # prefixed so it cannot collide with Rightmove IDs
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": normalize_postcode(postcode),
        "Address per Property Register": address,
        "Leasehold/Freehold": parse_tenure(prop),
        "Property type": map_property_type(listing_type),
        "Property sub-type": normalize_sub_type(listing_type),
        "price": price,
        "price_frequency": "",
        "Price qualifier": price_qualifier,
        "Total floor area (sqm)": parse_floor_area(prop.get("description")),
        "Listing URL": f"{HOMECOUK_BASE}/property/{listing_id}",
        "Listing features": [],  # home.co.uk does not supply these
        "first_visible_date": prop.get("added_date") or "",
    }
|
|
||||||
|
|
||||||
|
|
||||||
def search_outcode(
    client: Session,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Collect transformed sale listings for one outcode, page by page.

    Args:
        client: session used for the API requests; its ``referer`` header is
            rewritten before every page request.
        outcode: postcode outcode to search (lower-cased in the URLs).
        pc_index: spatial index handed to ``transform_property``.
        max_properties: stop as soon as this many listings were collected.

    Returns:
        The listings that ``transform_property`` accepted.

    Raises:
        PaginationError: a page failed to load, came back empty before the
            advertised end, or repeated only already-seen results.
    """
    slug = outcode.lower()
    api_url = f"{HOMECOUK_API_BASE}/{HOMECOUK_URL_SEGMENT}/{slug}/"

    collected = []
    seen_ids: set[str] = set()
    last_page: int | None = None
    total_results: int | None = None
    page = 1

    while True:
        # The referer mirrors the public page URL for the page being requested.
        client.headers["referer"] = (
            f"https://home.co.uk/{HOMECOUK_URL_SEGMENT}/{slug}/"
            f"?page={page}&sort=date_desc&per_page={HOMECOUK_PER_PAGE}"
        )
        query = {
            "page": str(page),
            "sort": "date_desc",
            "per_page": str(HOMECOUK_PER_PAGE),
        }

        data = fetch_page(client, api_url, query)
        if not data:
            raise PaginationError(f"home.co.uk {outcode} page {page} failed to load")

        # Pagination totals are taken from the first page that reports them.
        pagination = data.get("pagination", {}) or {}
        if last_page is None:
            last_page = _coerce_positive_int(pagination.get("last_page"))
        if total_results is None:
            total_results = _coerce_positive_int(pagination.get("total"))

        listings = data.get("properties", [])
        if not listings:
            ended_early = total_results and page <= (last_page or page)
            if ended_early:
                raise PaginationError(
                    f"home.co.uk {outcode} page {page} returned no properties "
                    f"before the advertised end"
                )
            break

        # A page made up solely of known listings means paging is stuck.
        identities = {
            _property_identity(item, page, position)
            for position, item in enumerate(listings)
        }
        if identities and identities <= seen_ids:
            raise PaginationError(
                f"home.co.uk {outcode} page {page} repeated previously seen results"
            )
        seen_ids |= identities

        for item in listings:
            try:
                record = transform_property(item, pc_index)
            except Exception as exc:
                log.warning(
                    "home.co.uk %s property %s failed to transform: %s",
                    outcode,
                    item.get("listing_id") or item.get("property_id") or "?",
                    exc,
                )
                continue
            if not record:
                continue
            collected.append(record)
            if max_properties is not None and len(collected) >= max_properties:
                return collected

        # With a known last page only the page number decides; otherwise stop
        # once the advertised total was seen or a short page arrived.
        if last_page is not None:
            finished = page >= last_page
        else:
            reached_total = (
                total_results is not None and len(seen_ids) >= total_results
            )
            finished = reached_total or len(listings) < HOMECOUK_PER_PAGE
        if finished:
            break

        page += 1
        time.sleep(DELAY_BETWEEN_PAGES)

    return collected
|
|
||||||
|
|
@ -1,63 +0,0 @@
|
||||||
"""Shared target filters for manual buy-listing scrapes."""
|
|
||||||
|
|
||||||
import math
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
BUY_MAX_PRICE = 1_000_000
|
|
||||||
BUY_MIN_BEDROOMS = 2
|
|
||||||
BUY_MAX_BEDROOMS = 5
|
|
||||||
BUY_ALLOWED_BATHROOMS = frozenset({2, 3})
|
|
||||||
BUY_MIN_FLOOR_AREA_SQM = 90.0
|
|
||||||
BUY_MAX_FLOOR_AREA_SQM = 170.0
|
|
||||||
BUY_PROPERTY_TYPES = frozenset({"Flats/Maisonettes"})
|
|
||||||
|
|
||||||
BUY_MIN_FLOOR_AREA_SQFT = round(BUY_MIN_FLOOR_AREA_SQM / 0.092903)
|
|
||||||
BUY_MAX_FLOOR_AREA_SQFT = round(BUY_MAX_FLOOR_AREA_SQM / 0.092903)
|
|
||||||
|
|
||||||
|
|
||||||
def _number(value: Any) -> float | None:
|
|
||||||
if value is None:
|
|
||||||
return None
|
|
||||||
try:
|
|
||||||
number = float(value)
|
|
||||||
except (TypeError, ValueError):
|
|
||||||
return None
|
|
||||||
if not math.isfinite(number):
|
|
||||||
return None
|
|
||||||
return number
|
|
||||||
|
|
||||||
|
|
||||||
def _int(value: Any) -> int | None:
    """Convert ``value`` to an int, but only if it is a finite whole number."""
    as_float = _number(value)
    if as_float is not None and as_float.is_integer():
        return int(as_float)
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def matches_strict_buy_listing_filter(prop: dict) -> bool:
    """Tell whether ``prop`` passes every strict buy-listing criterion.

    The price is read from ``price`` when that key exists, otherwise from
    ``Asking price``. It must be positive and below ``BUY_MAX_PRICE``.
    Bedrooms, property type, bathrooms and floor area are then checked
    against the module's ``BUY_*`` limits. A missing or unparsable value
    fails the filter.
    """
    price_field = "price" if "price" in prop else "Asking price"
    price = _number(prop.get(price_field))
    if price is None or not (0 < price < BUY_MAX_PRICE):
        return False

    bedrooms = _int(prop.get("Bedrooms"))
    if bedrooms is None or not (BUY_MIN_BEDROOMS <= bedrooms <= BUY_MAX_BEDROOMS):
        return False

    if prop.get("Property type") not in BUY_PROPERTY_TYPES:
        return False

    if _int(prop.get("Bathrooms")) not in BUY_ALLOWED_BATHROOMS:
        return False

    floor_area = _number(prop.get("Total floor area (sqm)"))
    return (
        floor_area is not None
        and BUY_MIN_FLOOR_AREA_SQM <= floor_area <= BUY_MAX_FLOOR_AREA_SQM
    )
|
|
||||||
|
|
@ -5,10 +5,10 @@ import tempfile
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from constants import DATA_DIR
|
from constants import DATA_DIR, REPO_DIR
|
||||||
|
|
||||||
|
|
||||||
SOURCE_CHOICES = ("rightmove", "homecouk", "zoopla", "all")
|
SOURCE_CHOICES = ("rightmove", "zoopla", "all")
|
||||||
TEST_MAX_PROPERTIES_PER_SOURCE = 100
|
TEST_MAX_PROPERTIES_PER_SOURCE = 100
|
||||||
TEST_OUTCODES = (
|
TEST_OUTCODES = (
|
||||||
"E1",
|
"E1",
|
||||||
|
|
@ -28,14 +28,16 @@ log = logging.getLogger("finder")
|
||||||
|
|
||||||
def configure_standalone_runtime() -> None:
|
def configure_standalone_runtime() -> None:
|
||||||
"""Keep browser/cache/temp files on the project volume for local runs."""
|
"""Keep browser/cache/temp files on the project volume for local runs."""
|
||||||
runtime_dir = DATA_DIR / ".runtime"
|
runtime_dir = REPO_DIR / ".tmp" / "finder"
|
||||||
cache_dir = runtime_dir / "cache"
|
cache_dir = runtime_dir / "cache"
|
||||||
temp_dir = runtime_dir / "tmp"
|
temp_dir = runtime_dir / "tmp"
|
||||||
cache_dir.mkdir(parents=True, exist_ok=True)
|
cache_dir.mkdir(parents=True, exist_ok=True)
|
||||||
temp_dir.mkdir(parents=True, exist_ok=True)
|
temp_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
os.environ.setdefault("XDG_CACHE_HOME", str(cache_dir))
|
os.environ["XDG_CACHE_HOME"] = str(cache_dir)
|
||||||
os.environ.setdefault("TMPDIR", str(temp_dir))
|
os.environ["TMPDIR"] = str(temp_dir)
|
||||||
|
os.environ["TEMP"] = str(temp_dir)
|
||||||
|
os.environ["TMP"] = str(temp_dir)
|
||||||
tempfile.tempdir = str(temp_dir)
|
tempfile.tempdir = str(temp_dir)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -47,7 +49,7 @@ def parse_args() -> argparse.Namespace:
|
||||||
"--source",
|
"--source",
|
||||||
choices=SOURCE_CHOICES,
|
choices=SOURCE_CHOICES,
|
||||||
default="all",
|
default="all",
|
||||||
help="Portal to scrape. 'all' runs Rightmove, home.co.uk, and Zoopla.",
|
help="Portal to scrape. 'all' runs Rightmove and Zoopla.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output-dir",
|
"--output-dir",
|
||||||
|
|
@ -89,7 +91,7 @@ def configure_logging() -> None:
|
||||||
|
|
||||||
def selected_sources(source: str) -> list[str]:
|
def selected_sources(source: str) -> list[str]:
|
||||||
if source == "all":
|
if source == "all":
|
||||||
return ["rightmove", "homecouk", "zoopla"]
|
return ["rightmove", "zoopla"]
|
||||||
return [source]
|
return [source]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ version = "0.1.0"
|
||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"httpx",
|
"httpx",
|
||||||
"curl_cffi",
|
|
||||||
"polars",
|
"polars",
|
||||||
"fake-useragent>=2.2.0",
|
"fake-useragent>=2.2.0",
|
||||||
"playwright>=1.58.0",
|
"playwright>=1.58.0",
|
||||||
|
|
|
||||||
|
|
@ -10,15 +10,6 @@ from constants import (
|
||||||
TYPEAHEAD_URL,
|
TYPEAHEAD_URL,
|
||||||
)
|
)
|
||||||
from http_client import fetch_with_retry
|
from http_client import fetch_with_retry
|
||||||
from listing_filters import (
|
|
||||||
BUY_ALLOWED_BATHROOMS,
|
|
||||||
BUY_MAX_BEDROOMS,
|
|
||||||
BUY_MAX_FLOOR_AREA_SQFT,
|
|
||||||
BUY_MAX_PRICE,
|
|
||||||
BUY_MIN_BEDROOMS,
|
|
||||||
BUY_MIN_FLOOR_AREA_SQFT,
|
|
||||||
matches_strict_buy_listing_filter,
|
|
||||||
)
|
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
from transform import transform_property
|
from transform import transform_property
|
||||||
|
|
||||||
|
|
@ -31,24 +22,6 @@ outcode_cache: dict[str, str] = {}
|
||||||
# Requesting index >= 1008 returns HTTP 400.
|
# Requesting index >= 1008 returns HTTP 400.
|
||||||
_MAX_INDEX = 1008
|
_MAX_INDEX = 1008
|
||||||
|
|
||||||
_BASE_BUY_SEARCH_PARAMS = {
|
|
||||||
"propertyTypes": "flat",
|
|
||||||
"minBedrooms": str(BUY_MIN_BEDROOMS),
|
|
||||||
"maxBedrooms": str(BUY_MAX_BEDROOMS),
|
|
||||||
"minBathrooms": str(min(BUY_ALLOWED_BATHROOMS)),
|
|
||||||
"maxBathrooms": str(max(BUY_ALLOWED_BATHROOMS)),
|
|
||||||
"minSize": str(BUY_MIN_FLOOR_AREA_SQFT),
|
|
||||||
"maxSize": str(BUY_MAX_FLOOR_AREA_SQFT),
|
|
||||||
"maxPrice": str(BUY_MAX_PRICE - 1),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _buy_search_params(extra_params: dict | None = None) -> dict:
    """Return a fresh copy of the base buy-search parameters, overlaid with ``extra_params``."""
    merged = dict(_BASE_BUY_SEARCH_PARAMS)
    merged.update(extra_params or {})
    return merged
|
|
||||||
|
|
||||||
|
|
||||||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
||||||
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
|
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
|
||||||
|
|
@ -77,7 +50,6 @@ def _paginate(
|
||||||
outcode: str,
|
outcode: str,
|
||||||
channel_cfg: dict,
|
channel_cfg: dict,
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
extra_params: dict | None = None,
|
|
||||||
max_properties: int | None = None,
|
max_properties: int | None = None,
|
||||||
) -> tuple[list[dict], int]:
|
) -> tuple[list[dict], int]:
|
||||||
"""Paginate through search results. Returns (properties, result_count)."""
|
"""Paginate through search results. Returns (properties, result_count)."""
|
||||||
|
|
@ -94,9 +66,6 @@ def _paginate(
|
||||||
"channel": channel_cfg["channel"],
|
"channel": channel_cfg["channel"],
|
||||||
"transactionType": channel_cfg["transactionType"],
|
"transactionType": channel_cfg["transactionType"],
|
||||||
}
|
}
|
||||||
if extra_params:
|
|
||||||
params.update(extra_params)
|
|
||||||
|
|
||||||
data = fetch_with_retry(client, SEARCH_URL, params)
|
data = fetch_with_retry(client, SEARCH_URL, params)
|
||||||
if not data:
|
if not data:
|
||||||
log.warning(
|
log.warning(
|
||||||
|
|
@ -123,7 +92,7 @@ def _paginate(
|
||||||
exc,
|
exc,
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
if transformed and matches_strict_buy_listing_filter(transformed):
|
if transformed:
|
||||||
properties.append(transformed)
|
properties.append(transformed)
|
||||||
if max_properties is not None and len(properties) >= max_properties:
|
if max_properties is not None and len(properties) >= max_properties:
|
||||||
return properties, result_count
|
return properties, result_count
|
||||||
|
|
@ -137,7 +106,7 @@ def _paginate(
|
||||||
break
|
break
|
||||||
if index >= _MAX_INDEX:
|
if index >= _MAX_INDEX:
|
||||||
log.warning(
|
log.warning(
|
||||||
"%s/%s: %d filtered results exceed Rightmove's %d-result page cap",
|
"%s/%s: %d results exceed Rightmove's %d-result page cap",
|
||||||
outcode,
|
outcode,
|
||||||
channel_cfg["channel"],
|
channel_cfg["channel"],
|
||||||
result_count,
|
result_count,
|
||||||
|
|
@ -158,18 +127,13 @@ def search_outcode(
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
max_properties: int | None = None,
|
max_properties: int | None = None,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Paginate through search results for one outcode+channel. Returns transformed properties.
|
"""Paginate through unfiltered sale results for one outcode+channel."""
|
||||||
|
|
||||||
Search requests set the supported Rightmove filters directly: flats,
|
|
||||||
2-5 bedrooms, 2-3 bathrooms, 969-1830 sq ft, and asking price below £1m.
|
|
||||||
"""
|
|
||||||
properties, _ = _paginate(
|
properties, _ = _paginate(
|
||||||
client,
|
client,
|
||||||
outcode_id,
|
outcode_id,
|
||||||
outcode,
|
outcode,
|
||||||
channel_cfg,
|
channel_cfg,
|
||||||
pc_index,
|
pc_index,
|
||||||
extra_params=_buy_search_params(),
|
|
||||||
max_properties=max_properties,
|
max_properties=max_properties,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,12 +14,7 @@ from constants import (
|
||||||
LONDON_OUTCODE_PREFIXES,
|
LONDON_OUTCODE_PREFIXES,
|
||||||
)
|
)
|
||||||
|
|
||||||
from homecouk import CookiesExpiredError
|
|
||||||
from homecouk import load_cookies as load_homecouk_cookies
|
|
||||||
from homecouk import make_client as make_homecouk_client
|
|
||||||
from homecouk import search_outcode as homecouk_search_outcode
|
|
||||||
from http_client import make_client
|
from http_client import make_client
|
||||||
from listing_filters import matches_strict_buy_listing_filter
|
|
||||||
from rightmove import resolve_outcode_id
|
from rightmove import resolve_outcode_id
|
||||||
from rightmove import search_outcode as rightmove_search_outcode
|
from rightmove import search_outcode as rightmove_search_outcode
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
|
|
@ -30,7 +25,7 @@ from zoopla import search_outcode as zoopla_search_outcode
|
||||||
|
|
||||||
log = logging.getLogger("rightmove")
|
log = logging.getLogger("rightmove")
|
||||||
|
|
||||||
SOURCE_ORDER = ("rightmove", "homecouk", "zoopla")
|
SOURCE_ORDER = ("rightmove", "zoopla")
|
||||||
SALE_CHANNEL = CHANNELS[0]
|
SALE_CHANNEL = CHANNELS[0]
|
||||||
LONDON_AREAS = sorted({prefix.upper() for prefix in LONDON_OUTCODE_PREFIXES})
|
LONDON_AREAS = sorted({prefix.upper() for prefix in LONDON_OUTCODE_PREFIXES})
|
||||||
OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z0-9]?)")
|
OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z0-9]?)")
|
||||||
|
|
@ -260,16 +255,7 @@ def _store_properties(
|
||||||
dropped_outside_area,
|
dropped_outside_area,
|
||||||
)
|
)
|
||||||
|
|
||||||
eligible = [prop for prop in londonish if matches_strict_buy_listing_filter(prop)]
|
selected = londonish if remaining is None else londonish[:remaining]
|
||||||
dropped_non_matching = len(londonish) - len(eligible)
|
|
||||||
if dropped_non_matching:
|
|
||||||
log.debug(
|
|
||||||
"%s dropped %d properties outside the strict buy-listing filters",
|
|
||||||
source,
|
|
||||||
dropped_non_matching,
|
|
||||||
)
|
|
||||||
|
|
||||||
selected = eligible if remaining is None else eligible[:remaining]
|
|
||||||
results[source].extend(selected)
|
results[source].extend(selected)
|
||||||
return len(selected)
|
return len(selected)
|
||||||
|
|
||||||
|
|
@ -290,6 +276,8 @@ def _launch_zoopla_with_retries(attempts: int = 3):
|
||||||
for attempt in range(1, attempts + 1):
|
for attempt in range(1, attempts + 1):
|
||||||
try:
|
try:
|
||||||
return launch_zoopla_browser()
|
return launch_zoopla_browser()
|
||||||
|
except TurnstileError:
|
||||||
|
raise
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
last_error = exc
|
last_error = exc
|
||||||
log.warning(
|
log.warning(
|
||||||
|
|
@ -304,13 +292,6 @@ def _launch_zoopla_with_retries(attempts: int = 3):
|
||||||
raise last_error
|
raise last_error
|
||||||
|
|
||||||
|
|
||||||
def _new_homecouk_client():
|
|
||||||
cookie_data = load_homecouk_cookies()
|
|
||||||
if not cookie_data:
|
|
||||||
return None
|
|
||||||
return make_homecouk_client(*cookie_data)
|
|
||||||
|
|
||||||
|
|
||||||
def _scrape_rightmove(
|
def _scrape_rightmove(
|
||||||
outcodes: list[str],
|
outcodes: list[str],
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
|
|
@ -368,74 +349,6 @@ def _scrape_rightmove(
|
||||||
client.close()
|
client.close()
|
||||||
|
|
||||||
|
|
||||||
def _scrape_homecouk(
|
|
||||||
outcodes: list[str],
|
|
||||||
pc_index: PostcodeSpatialIndex,
|
|
||||||
results: dict[str, list[dict]],
|
|
||||||
errors: list[str],
|
|
||||||
max_properties_per_source: int | None,
|
|
||||||
) -> None:
|
|
||||||
client = _new_homecouk_client()
|
|
||||||
if client is None:
|
|
||||||
log.warning("home.co.uk skipped: could not bootstrap a local session")
|
|
||||||
return
|
|
||||||
|
|
||||||
try:
|
|
||||||
for outcode in outcodes:
|
|
||||||
if _source_remaining(results, "homecouk", max_properties_per_source) == 0:
|
|
||||||
log.info("home.co.uk cap reached")
|
|
||||||
return
|
|
||||||
|
|
||||||
for attempt in range(2):
|
|
||||||
try:
|
|
||||||
# home.co.uk cannot express the full filter set at source.
|
|
||||||
# Fetch the outcode page set first; _store_properties applies
|
|
||||||
# the strict filter and source cap after transformation.
|
|
||||||
props = homecouk_search_outcode(
|
|
||||||
client,
|
|
||||||
outcode,
|
|
||||||
pc_index,
|
|
||||||
max_properties=None,
|
|
||||||
)
|
|
||||||
added = _store_properties(
|
|
||||||
results,
|
|
||||||
"homecouk",
|
|
||||||
props,
|
|
||||||
max_properties_per_source,
|
|
||||||
)
|
|
||||||
log.info("home.co.uk %s: +%d", outcode, added)
|
|
||||||
break
|
|
||||||
except CookiesExpiredError as exc:
|
|
||||||
if attempt == 1:
|
|
||||||
_record_error(errors, "homecouk", outcode, exc)
|
|
||||||
break
|
|
||||||
|
|
||||||
log.warning(
|
|
||||||
"home.co.uk cookies expired at %s; refreshing local session",
|
|
||||||
outcode,
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
client.close()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
client = _new_homecouk_client()
|
|
||||||
if client is None:
|
|
||||||
_record_error(
|
|
||||||
errors,
|
|
||||||
"homecouk",
|
|
||||||
outcode,
|
|
||||||
RuntimeError("could not refresh local session"),
|
|
||||||
)
|
|
||||||
return
|
|
||||||
except Exception as exc:
|
|
||||||
_record_error(errors, "homecouk", outcode, exc)
|
|
||||||
break
|
|
||||||
|
|
||||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
|
||||||
finally:
|
|
||||||
client.close()
|
|
||||||
|
|
||||||
|
|
||||||
def _scrape_zoopla(
|
def _scrape_zoopla(
|
||||||
outcodes: list[str],
|
outcodes: list[str],
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
|
|
@ -459,9 +372,8 @@ def _scrape_zoopla(
|
||||||
|
|
||||||
for attempt in range(2):
|
for attempt in range(2):
|
||||||
try:
|
try:
|
||||||
# Zoopla source-side filters are unverified here. Fetch the
|
# Fetch the outcode page set first; _store_properties applies
|
||||||
# outcode page set first; _store_properties applies the
|
# the London-ish postcode filter and source cap after transformation.
|
||||||
# strict filter and source cap after transformation.
|
|
||||||
props, _ = zoopla_search_outcode(
|
props, _ = zoopla_search_outcode(
|
||||||
page,
|
page,
|
||||||
outcode,
|
outcode,
|
||||||
|
|
@ -539,15 +451,6 @@ def run_scrape(
|
||||||
max_properties_per_source,
|
max_properties_per_source,
|
||||||
)
|
)
|
||||||
|
|
||||||
if "homecouk" in selected_sources:
|
|
||||||
_scrape_homecouk(
|
|
||||||
selected_outcodes,
|
|
||||||
pc_index,
|
|
||||||
results,
|
|
||||||
errors,
|
|
||||||
max_properties_per_source,
|
|
||||||
)
|
|
||||||
|
|
||||||
if "zoopla" in selected_sources:
|
if "zoopla" in selected_sources:
|
||||||
if pc_coords is None:
|
if pc_coords is None:
|
||||||
pc_coords = build_postcode_coords()
|
pc_coords = build_postcode_coords()
|
||||||
|
|
@ -567,20 +470,10 @@ def run_scrape(
|
||||||
else:
|
else:
|
||||||
if output_path.exists():
|
if output_path.exists():
|
||||||
output_path.unlink()
|
output_path.unlink()
|
||||||
log.warning("No strict properties to write to %s", output_path)
|
log.warning("No London-ish properties to write to %s", output_path)
|
||||||
|
|
||||||
filtered = [prop for prop in merged if matches_strict_buy_listing_filter(prop)]
|
|
||||||
filtered_output_path = output_base / "online_listings_buy_filtered.parquet"
|
|
||||||
if filtered:
|
|
||||||
write_parquet(filtered, filtered_output_path)
|
|
||||||
else:
|
|
||||||
if filtered_output_path.exists():
|
|
||||||
filtered_output_path.unlink()
|
|
||||||
log.warning("No strict-filtered properties to write to %s", filtered_output_path)
|
|
||||||
|
|
||||||
counts = {
|
counts = {
|
||||||
"total": len(merged),
|
"total": len(merged),
|
||||||
"filtered_total": len(filtered),
|
|
||||||
"deduped": deduped,
|
"deduped": deduped,
|
||||||
"sources": source_counts,
|
"sources": source_counts,
|
||||||
}
|
}
|
||||||
|
|
@ -588,9 +481,8 @@ def run_scrape(
|
||||||
f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
|
f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
|
||||||
)
|
)
|
||||||
log.info(
|
log.info(
|
||||||
"Sale scrape complete: %d unique, %d strict-filtered (%s deduped:%d)",
|
"Sale scrape complete: %d unique (%s deduped:%d)",
|
||||||
len(merged),
|
len(merged),
|
||||||
len(filtered),
|
|
||||||
source_summary,
|
source_summary,
|
||||||
deduped,
|
deduped,
|
||||||
)
|
)
|
||||||
|
|
@ -603,7 +495,6 @@ def run_scrape(
|
||||||
},
|
},
|
||||||
"counts": counts,
|
"counts": counts,
|
||||||
"path": str(output_path),
|
"path": str(output_path),
|
||||||
"filtered_path": str(filtered_output_path),
|
|
||||||
"errors": errors,
|
"errors": errors,
|
||||||
"elapsed_seconds": round(time.time() - started_at, 3),
|
"elapsed_seconds": round(time.time() - started_at, 3),
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -76,7 +76,7 @@ def normalize_sub_type(sub_type: str | None) -> str:
|
||||||
"""Normalize property sub-type for consistent storage.
|
"""Normalize property sub-type for consistent storage.
|
||||||
|
|
||||||
Fixes delimiter inconsistencies (underscores/hyphens → spaces) from
|
Fixes delimiter inconsistencies (underscores/hyphens → spaces) from
|
||||||
home.co.uk and truncates Zoopla description fragments that were
|
legacy listing data and truncates Zoopla description fragments that were
|
||||||
accidentally captured as sub-types.
|
accidentally captured as sub-types.
|
||||||
"""
|
"""
|
||||||
if not sub_type:
|
if not sub_type:
|
||||||
|
|
@ -200,31 +200,13 @@ def transform_property(
|
||||||
|
|
||||||
price_obj = prop.get("price", {})
|
price_obj = prop.get("price", {})
|
||||||
amount = parse_int_value(price_obj.get("amount"))
|
amount = parse_int_value(price_obj.get("amount"))
|
||||||
if not amount:
|
price = amount or 0
|
||||||
return None
|
|
||||||
price = amount
|
|
||||||
if price <= 0:
|
|
||||||
return None
|
|
||||||
|
|
||||||
display_prices = price_obj.get("displayPrices", [])
|
display_prices = price_obj.get("displayPrices", [])
|
||||||
price_qualifier = (
|
price_qualifier = (
|
||||||
display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
|
display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
|
||||||
)
|
)
|
||||||
|
|
||||||
# POA / Auction listings have unreliable prices — treat as no price
|
|
||||||
pq_lower = price_qualifier.lower()
|
|
||||||
non_comparable_price_terms = (
|
|
||||||
"poa",
|
|
||||||
"auction",
|
|
||||||
"shared ownership",
|
|
||||||
"shared equity",
|
|
||||||
"part buy",
|
|
||||||
"part rent",
|
|
||||||
"from",
|
|
||||||
)
|
|
||||||
if any(term in pq_lower for term in non_comparable_price_terms):
|
|
||||||
return None
|
|
||||||
|
|
||||||
sub_type = prop.get("propertySubType", "")
|
sub_type = prop.get("propertySubType", "")
|
||||||
raw_beds = parse_int_value(prop.get("bedrooms")) or 0
|
raw_beds = parse_int_value(prop.get("bedrooms")) or 0
|
||||||
raw_baths = parse_int_value(prop.get("bathrooms")) or 0
|
raw_baths = parse_int_value(prop.get("bathrooms")) or 0
|
||||||
|
|
|
||||||
91
finder/uv.lock
generated
91
finder/uv.lock
generated
|
|
@ -72,63 +72,6 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" },
|
{ url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "cffi"
|
|
||||||
version = "2.0.0"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
dependencies = [
|
|
||||||
{ name = "pycparser", marker = "implementation_name != 'PyPy'" },
|
|
||||||
]
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, upload-time = "2025-09-08T23:22:57.188Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, upload-time = "2025-09-08T23:23:44.61Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "charset-normalizer"
|
name = "charset-normalizer"
|
||||||
version = "3.4.6"
|
version = "3.4.6"
|
||||||
|
|
@ -223,29 +166,6 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "curl-cffi"
|
|
||||||
version = "0.14.0"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
dependencies = [
|
|
||||||
{ name = "certifi" },
|
|
||||||
{ name = "cffi" },
|
|
||||||
]
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/9b/c9/0067d9a25ed4592b022d4558157fcdb6e123516083700786d38091688767/curl_cffi-0.14.0.tar.gz", hash = "sha256:5ffbc82e59f05008ec08ea432f0e535418823cda44178ee518906a54f27a5f0f", size = 162633, upload-time = "2025-12-16T03:25:07.931Z" }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/aa/f0/0f21e9688eaac85e705537b3a87a5588d0cefb2f09d83e83e0e8be93aa99/curl_cffi-0.14.0-cp39-abi3-macosx_14_0_arm64.whl", hash = "sha256:e35e89c6a69872f9749d6d5fda642ed4fc159619329e99d577d0104c9aad5893", size = 3087277, upload-time = "2025-12-16T03:24:49.607Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/ba/a3/0419bd48fce5b145cb6a2344c6ac17efa588f5b0061f212c88e0723da026/curl_cffi-0.14.0-cp39-abi3-macosx_15_0_x86_64.whl", hash = "sha256:5945478cd28ad7dfb5c54473bcfb6743ee1d66554d57951fdf8fc0e7d8cf4e45", size = 5804650, upload-time = "2025-12-16T03:24:51.518Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/e2/07/a238dd062b7841b8caa2fa8a359eb997147ff3161288f0dd46654d898b4d/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c42e8fa3c667db9ccd2e696ee47adcd3cd5b0838d7282f3fc45f6c0ef3cfdfa7", size = 8231918, upload-time = "2025-12-16T03:24:52.862Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/7c/d2/ce907c9b37b5caf76ac08db40cc4ce3d9f94c5500db68a195af3513eacbc/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:060fe2c99c41d3cb7f894de318ddf4b0301b08dca70453d769bd4e74b36b8483", size = 8654624, upload-time = "2025-12-16T03:24:54.579Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/f2/ae/6256995b18c75e6ef76b30753a5109e786813aa79088b27c8eabb1ef85c9/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b158c41a25388690dd0d40b5bc38d1e0f512135f17fdb8029868cbc1993d2e5b", size = 8010654, upload-time = "2025-12-16T03:24:56.507Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/fb/10/ff64249e516b103cb762e0a9dca3ee0f04cf25e2a1d5d9838e0f1273d071/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_i686.whl", hash = "sha256:1439fbef3500fb723333c826adf0efb0e2e5065a703fb5eccce637a2250db34a", size = 7781969, upload-time = "2025-12-16T03:24:57.885Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/51/76/d6f7bb76c2d12811aa7ff16f5e17b678abdd1b357b9a8ac56310ceccabd5/curl_cffi-0.14.0-cp39-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e7176f2c2d22b542e3cf261072a81deb018cfa7688930f95dddef215caddb469", size = 7969133, upload-time = "2025-12-16T03:24:59.261Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/23/7c/cca39c0ed4e1772613d3cba13091c0e9d3b89365e84b9bf9838259a3cd8f/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:03f21ade2d72978c2bb8670e9b6de5260e2755092b02d94b70b906813662998d", size = 9080167, upload-time = "2025-12-16T03:25:00.946Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/75/03/a942d7119d3e8911094d157598ae0169b1c6ca1bd3f27d7991b279bcc45b/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:58ebf02de64ee5c95613209ddacb014c2d2f86298d7080c0a1c12ed876ee0690", size = 9520464, upload-time = "2025-12-16T03:25:02.922Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/a2/77/78900e9b0833066d2274bda75cba426fdb4cef7fbf6a4f6a6ca447607bec/curl_cffi-0.14.0-cp39-abi3-win_amd64.whl", hash = "sha256:6e503f9a103f6ae7acfb3890c843b53ec030785a22ae7682a22cc43afb94123e", size = 1677416, upload-time = "2025-12-16T03:25:04.902Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/5c/7c/d2ba86b0b3e1e2830bd94163d047de122c69a8df03c5c7c36326c456ad82/curl_cffi-0.14.0-cp39-abi3-win_arm64.whl", hash = "sha256:2eed50a969201605c863c4c31269dfc3e0da52916086ac54553cfa353022425c", size = 1425067, upload-time = "2025-12-16T03:25:06.454Z" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cython"
|
name = "cython"
|
||||||
version = "3.2.4"
|
version = "3.2.4"
|
||||||
|
|
@ -274,7 +194,6 @@ version = "0.1.0"
|
||||||
source = { virtual = "." }
|
source = { virtual = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "camoufox" },
|
{ name = "camoufox" },
|
||||||
{ name = "curl-cffi" },
|
|
||||||
{ name = "fake-useragent" },
|
{ name = "fake-useragent" },
|
||||||
{ name = "httpx" },
|
{ name = "httpx" },
|
||||||
{ name = "playwright" },
|
{ name = "playwright" },
|
||||||
|
|
@ -284,7 +203,6 @@ dependencies = [
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "camoufox", specifier = ">=0.4.11" },
|
{ name = "camoufox", specifier = ">=0.4.11" },
|
||||||
{ name = "curl-cffi" },
|
|
||||||
{ name = "fake-useragent", specifier = ">=2.2.0" },
|
{ name = "fake-useragent", specifier = ">=2.2.0" },
|
||||||
{ name = "httpx" },
|
{ name = "httpx" },
|
||||||
{ name = "playwright", specifier = ">=1.58.0" },
|
{ name = "playwright", specifier = ">=1.58.0" },
|
||||||
|
|
@ -639,15 +557,6 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/b3/eb/936f5eeae196e8c8aaabe5f7d98891be8a5bbc741d50ce5c60f55575ad29/polars_runtime_32-1.39.0-cp310-abi3-win_arm64.whl", hash = "sha256:d69abde5f148566860bbe910010847bd7791e72f7c8063a4d2c462246a33a72a", size = 41885761, upload-time = "2026-03-12T14:23:16.773Z" },
|
{ url = "https://files.pythonhosted.org/packages/b3/eb/936f5eeae196e8c8aaabe5f7d98891be8a5bbc741d50ce5c60f55575ad29/polars_runtime_32-1.39.0-cp310-abi3-win_arm64.whl", hash = "sha256:d69abde5f148566860bbe910010847bd7791e72f7c8063a4d2c462246a33a72a", size = 41885761, upload-time = "2026-03-12T14:23:16.773Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "pycparser"
|
|
||||||
version = "3.0"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyee"
|
name = "pyee"
|
||||||
version = "13.0.1"
|
version = "13.0.1"
|
||||||
|
|
|
||||||
209
finder/zoopla.py
209
finder/zoopla.py
|
|
@ -1,8 +1,8 @@
|
||||||
"""Zoopla (zoopla.co.uk) scraper — sale properties.
|
"""Zoopla (zoopla.co.uk) scraper — sale properties.
|
||||||
|
|
||||||
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
|
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
|
||||||
blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
|
blocks non-browser HTTP clients and even Playwright with stealth patches. Only
|
||||||
patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
|
Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
|
||||||
|
|
||||||
Zoopla uses Next.js App Router with React Server Components (RSC). Search
|
Zoopla uses Next.js App Router with React Server Components (RSC). Search
|
||||||
result data is server-rendered in an RSC stream, not available via
|
result data is server-rendered in an RSC stream, not available via
|
||||||
|
|
@ -19,11 +19,20 @@ Architecture:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
import time
|
import time
|
||||||
|
from pathlib import Path
|
||||||
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
||||||
|
|
||||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
from constants import (
|
||||||
|
DATA_DIR,
|
||||||
|
DELAY_BETWEEN_PAGES,
|
||||||
|
MAX_BEDROOMS,
|
||||||
|
PROPERTY_TYPE_MAP,
|
||||||
|
ZOOPLA_BASE,
|
||||||
|
)
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
from transform import normalize_sub_type, parse_int_value, validate_floor_area
|
from transform import normalize_sub_type, parse_int_value, validate_floor_area
|
||||||
|
|
||||||
|
|
@ -255,11 +264,120 @@ _DISMISS_COOKIES_JS = """() => {
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
_FALSE_ENV_VALUES = {"0", "false", "no", "off"}
|
||||||
|
_TRUE_ENV_VALUES = {"1", "true", "yes", "on"}
|
||||||
|
|
||||||
|
|
||||||
|
def _env_bool_or_virtual(name: str, default: bool | str) -> bool | str:
|
||||||
|
raw = os.environ.get(name)
|
||||||
|
if raw is None:
|
||||||
|
return default
|
||||||
|
|
||||||
|
value = raw.strip().lower()
|
||||||
|
if value == "virtual":
|
||||||
|
return "virtual"
|
||||||
|
if value in _TRUE_ENV_VALUES:
|
||||||
|
return True
|
||||||
|
if value in _FALSE_ENV_VALUES:
|
||||||
|
return False
|
||||||
|
raise ValueError(
|
||||||
|
f"{name} must be one of 1/0, true/false, yes/no, on/off, or virtual"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _visible_display_available() -> bool:
|
||||||
|
if sys.platform.startswith("linux"):
|
||||||
|
return bool(os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY"))
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _zoopla_headless_mode() -> bool | str:
|
||||||
|
# Prefer a visible browser by default so Cloudflare can be completed by the
|
||||||
|
# person running the scrape. In display-less Linux shells, keep startup
|
||||||
|
# headless and fail fast with an actionable error if a challenge appears.
|
||||||
|
default: bool | str = not _visible_display_available()
|
||||||
|
return _env_bool_or_virtual("ZOOPLA_HEADLESS", default)
|
||||||
|
|
||||||
|
|
||||||
|
def _zoopla_profile_dir() -> Path:
|
||||||
|
raw = os.environ.get("ZOOPLA_PROFILE_DIR")
|
||||||
|
if raw:
|
||||||
|
return Path(raw).expanduser().resolve()
|
||||||
|
return (DATA_DIR / ".runtime" / "zoopla-profile").expanduser().resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def _challenge_timeout_seconds() -> int:
|
||||||
|
raw = os.environ.get("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS")
|
||||||
|
if raw is None:
|
||||||
|
return 300
|
||||||
|
try:
|
||||||
|
timeout = int(raw)
|
||||||
|
except ValueError as exc:
|
||||||
|
raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be an integer") from exc
|
||||||
|
if timeout < 1:
|
||||||
|
raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be greater than zero")
|
||||||
|
return timeout
|
||||||
|
|
||||||
|
|
||||||
|
def _is_turnstile_challenge(page) -> bool:
|
||||||
|
try:
|
||||||
|
if "just a moment" in page.title().lower():
|
||||||
|
return True
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
return bool(
|
||||||
|
page.query_selector(
|
||||||
|
'iframe[src*="challenges.cloudflare.com"], '
|
||||||
|
'input[name="cf-turnstile-response"]'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _wait_for_turnstile(page, headless_mode: bool | str) -> None:
|
||||||
|
if not _is_turnstile_challenge(page):
|
||||||
|
return
|
||||||
|
|
||||||
|
profile_dir = _zoopla_profile_dir()
|
||||||
|
if headless_mode is True or headless_mode == "virtual":
|
||||||
|
raise TurnstileError(
|
||||||
|
"Cloudflare Turnstile requires a visible browser session. "
|
||||||
|
"Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0; "
|
||||||
|
f"the solved session will be saved in {profile_dir}."
|
||||||
|
)
|
||||||
|
|
||||||
|
timeout = _challenge_timeout_seconds()
|
||||||
|
log.warning(
|
||||||
|
"Cloudflare Turnstile challenge shown. Complete it in the Zoopla browser "
|
||||||
|
"window; waiting up to %ds. Profile: %s",
|
||||||
|
timeout,
|
||||||
|
profile_dir,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
page.bring_to_front()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
deadline = time.monotonic() + timeout
|
||||||
|
while time.monotonic() < deadline:
|
||||||
|
time.sleep(3)
|
||||||
|
if not _is_turnstile_challenge(page):
|
||||||
|
log.info("Cloudflare challenge resolved")
|
||||||
|
return
|
||||||
|
|
||||||
|
raise TurnstileError(
|
||||||
|
f"Cloudflare Turnstile was not completed after {timeout}s"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def launch_browser():
|
def launch_browser():
|
||||||
"""Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile,
|
"""Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile,
|
||||||
and dismiss cookie consent. Returns (browser, page) tuple.
|
and dismiss cookie consent. Returns (browser, page) tuple.
|
||||||
|
|
||||||
Raises TurnstileError if Cloudflare cannot be passed within two minutes.
|
Raises TurnstileError if Cloudflare cannot be completed.
|
||||||
Caller must close browser when done."""
|
Caller must close browser when done."""
|
||||||
from camoufox.pkgman import camoufox_path
|
from camoufox.pkgman import camoufox_path
|
||||||
|
|
||||||
|
|
@ -269,61 +387,50 @@ def launch_browser():
|
||||||
|
|
||||||
from camoufox.sync_api import Camoufox
|
from camoufox.sync_api import Camoufox
|
||||||
|
|
||||||
log.info("Launching Camoufox browser for Zoopla...")
|
headless_mode = _zoopla_headless_mode()
|
||||||
camoufox = Camoufox(headless=True)
|
profile_dir = _zoopla_profile_dir()
|
||||||
|
profile_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
log.info(
|
||||||
|
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
|
||||||
|
headless_mode,
|
||||||
|
profile_dir,
|
||||||
|
)
|
||||||
|
camoufox = Camoufox(
|
||||||
|
headless=headless_mode,
|
||||||
|
persistent_context=True,
|
||||||
|
user_data_dir=str(profile_dir),
|
||||||
|
locale=["en-GB", "en"],
|
||||||
|
enable_cache=True,
|
||||||
|
)
|
||||||
raw_browser = camoufox.__enter__()
|
raw_browser = camoufox.__enter__()
|
||||||
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
|
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
|
||||||
page = browser.new_page()
|
page = raw_browser.pages[0] if raw_browser.pages else raw_browser.new_page()
|
||||||
|
|
||||||
log.info("Navigating to Zoopla homepage...")
|
try:
|
||||||
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
|
log.info("Navigating to Zoopla homepage...")
|
||||||
|
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
|
||||||
|
_wait_for_turnstile(page, headless_mode)
|
||||||
|
|
||||||
# Wait for Cloudflare Turnstile to resolve.
|
log.info("Zoopla browser ready — title: %s", page.title())
|
||||||
# Try clicking the Turnstile checkbox if present (helps in some cases).
|
time.sleep(2)
|
||||||
for i in range(40):
|
|
||||||
if "Just a moment" not in page.title():
|
|
||||||
break
|
|
||||||
# Attempt to click the Turnstile checkbox in the challenge iframe
|
|
||||||
for frame in page.frames:
|
|
||||||
if "challenges.cloudflare.com" in frame.url:
|
|
||||||
try:
|
|
||||||
iframe_el = page.query_selector('iframe[src*="challenges.cloudflare"]')
|
|
||||||
if iframe_el:
|
|
||||||
box = iframe_el.bounding_box()
|
|
||||||
if box:
|
|
||||||
page.mouse.click(box["x"] + 30, box["y"] + box["height"] / 2)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
break
|
|
||||||
time.sleep(3)
|
|
||||||
else:
|
|
||||||
page.close()
|
|
||||||
browser.close()
|
|
||||||
raise TurnstileError("Cloudflare Turnstile did not resolve after 120s")
|
|
||||||
|
|
||||||
log.info("Cloudflare passed — title: %s", page.title())
|
# Dismiss cookie consent
|
||||||
time.sleep(2)
|
page.evaluate(_DISMISS_COOKIES_JS)
|
||||||
|
time.sleep(1)
|
||||||
# Dismiss cookie consent
|
except Exception:
|
||||||
page.evaluate(_DISMISS_COOKIES_JS)
|
try:
|
||||||
time.sleep(1)
|
page.close()
|
||||||
|
finally:
|
||||||
|
browser.close()
|
||||||
|
raise
|
||||||
|
|
||||||
return browser, page
|
return browser, page
|
||||||
|
|
||||||
|
|
||||||
def _ensure_not_challenged(page) -> None:
|
def _ensure_not_challenged(page) -> None:
|
||||||
"""Check if current page is a Cloudflare challenge and wait/raise."""
|
"""Check if current page is a Cloudflare challenge and wait/raise."""
|
||||||
if "Just a moment" not in page.title():
|
_wait_for_turnstile(page, _zoopla_headless_mode())
|
||||||
return
|
|
||||||
|
|
||||||
log.warning("Cloudflare challenge detected mid-session, waiting...")
|
|
||||||
for i in range(40):
|
|
||||||
time.sleep(3)
|
|
||||||
if "Just a moment" not in page.title():
|
|
||||||
log.info("Cloudflare challenge resolved")
|
|
||||||
return
|
|
||||||
|
|
||||||
raise TurnstileError("Cloudflare re-challenge did not resolve after 120s")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -704,9 +811,7 @@ def transform_property(
|
||||||
|
|
||||||
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
||||||
from postcodes extracted from the address text."""
|
from postcodes extracted from the address text."""
|
||||||
price = parse_int_value(raw.get("price"))
|
price = parse_int_value(raw.get("price")) or 0
|
||||||
if not price or price <= 0:
|
|
||||||
return None
|
|
||||||
|
|
||||||
address = raw.get("address", "")
|
address = raw.get("address", "")
|
||||||
|
|
||||||
|
|
@ -856,7 +961,7 @@ def search_outcode(
|
||||||
sample = raw_listings[0] if raw_listings else {}
|
sample = raw_listings[0] if raw_listings else {}
|
||||||
log.debug(
|
log.debug(
|
||||||
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
|
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
|
||||||
"(no price/postcode/coords). Sample raw: price=%s address=%r",
|
"(no postcode/coords). Sample raw: price=%s address=%r",
|
||||||
outcode, "BUY", len(raw_listings), dropped,
|
outcode, "BUY", len(raw_listings), dropped,
|
||||||
sample.get("price"), sample.get("address", ""),
|
sample.get("price"), sample.get("address", ""),
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,34 @@ const ROUTE_COLORS: Record<string, { color: string; darkText?: boolean }> = {
|
||||||
};
|
};
|
||||||
|
|
||||||
const NON_TUBE_NAMES = new Set(['DLR', 'London Overground', 'Elizabeth line']);
|
const NON_TUBE_NAMES = new Set(['DLR', 'London Overground', 'Elizabeth line']);
|
||||||
|
const GOOGLE_MAPS_DEPARTURE_TIME_ZONE = 'Europe/London';
|
||||||
|
const londonDateFormatter = new Intl.DateTimeFormat('en-GB', {
|
||||||
|
timeZone: GOOGLE_MAPS_DEPARTURE_TIME_ZONE,
|
||||||
|
year: 'numeric',
|
||||||
|
month: '2-digit',
|
||||||
|
day: '2-digit',
|
||||||
|
});
|
||||||
|
const londonDateTimeFormatter = new Intl.DateTimeFormat('en-GB', {
|
||||||
|
timeZone: GOOGLE_MAPS_DEPARTURE_TIME_ZONE,
|
||||||
|
year: 'numeric',
|
||||||
|
month: '2-digit',
|
||||||
|
day: '2-digit',
|
||||||
|
hour: '2-digit',
|
||||||
|
minute: '2-digit',
|
||||||
|
second: '2-digit',
|
||||||
|
hour12: false,
|
||||||
|
hourCycle: 'h23',
|
||||||
|
});
|
||||||
|
|
||||||
|
function dateTimeParts(formatter: Intl.DateTimeFormat, date: Date): Record<string, number> {
|
||||||
|
const parts: Record<string, number> = {};
|
||||||
|
formatter.formatToParts(date).forEach((part) => {
|
||||||
|
if (part.type !== 'literal') {
|
||||||
|
parts[part.type] = Number(part.value);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return parts;
|
||||||
|
}
|
||||||
|
|
||||||
/** Strip trailing parenthesized GTFS route IDs and NaPTAN stop codes (e.g. "(6757261)", "(9400ZZLUCGT1)") */
|
/** Strip trailing parenthesized GTFS route IDs and NaPTAN stop codes (e.g. "(6757261)", "(9400ZZLUCGT1)") */
|
||||||
function stripId(label: string): string {
|
function stripId(label: string): string {
|
||||||
|
|
@ -87,15 +115,48 @@ function getRouteDisplay(mode: string): { label: string; color: string; darkText
|
||||||
return { label: clean, color: '#6b7280', darkText: false };
|
return { label: clean, color: '#6b7280', darkText: false };
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns a Unix timestamp for the next Monday at 07:30 local time. */
|
function londonOffsetMs(utcMs: number): number {
|
||||||
|
const parts = dateTimeParts(londonDateTimeFormatter, new Date(utcMs));
|
||||||
|
const londonAsUtcMs = Date.UTC(
|
||||||
|
parts.year,
|
||||||
|
parts.month - 1,
|
||||||
|
parts.day,
|
||||||
|
parts.hour,
|
||||||
|
parts.minute,
|
||||||
|
parts.second
|
||||||
|
);
|
||||||
|
return londonAsUtcMs - utcMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
function londonTimeToUtcMs(
|
||||||
|
year: number,
|
||||||
|
month: number,
|
||||||
|
day: number,
|
||||||
|
hour: number,
|
||||||
|
minute: number
|
||||||
|
): number {
|
||||||
|
const localAsUtcMs = Date.UTC(year, month - 1, day, hour, minute, 0, 0);
|
||||||
|
const offsetMs = londonOffsetMs(localAsUtcMs);
|
||||||
|
const utcMs = localAsUtcMs - offsetMs;
|
||||||
|
const correctedOffsetMs = londonOffsetMs(utcMs);
|
||||||
|
return correctedOffsetMs === offsetMs ? utcMs : localAsUtcMs - correctedOffsetMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns a Unix timestamp for the next Monday at 07:30 Europe/London time. */
|
||||||
function nextMondayAt730(): number {
|
function nextMondayAt730(): number {
|
||||||
const now = new Date();
|
const now = new Date();
|
||||||
const day = now.getDay(); // 0=Sun … 6=Sat
|
const today = dateTimeParts(londonDateFormatter, now);
|
||||||
|
const day = new Date(Date.UTC(today.year, today.month - 1, today.day)).getUTCDay();
|
||||||
const daysUntil = day === 0 ? 1 : day === 1 ? 7 : 8 - day;
|
const daysUntil = day === 0 ? 1 : day === 1 ? 7 : 8 - day;
|
||||||
const monday = new Date(now);
|
const monday = new Date(Date.UTC(today.year, today.month - 1, today.day + daysUntil));
|
||||||
monday.setDate(now.getDate() + daysUntil);
|
const utcMs = londonTimeToUtcMs(
|
||||||
monday.setHours(7, 30, 0, 0);
|
monday.getUTCFullYear(),
|
||||||
return Math.floor(monday.getTime() / 1000);
|
monday.getUTCMonth() + 1,
|
||||||
|
monday.getUTCDate(),
|
||||||
|
7,
|
||||||
|
30
|
||||||
|
);
|
||||||
|
return Math.floor(utcMs / 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
function googleMapsDestination(
|
function googleMapsDestination(
|
||||||
|
|
|
||||||
|
|
@ -419,6 +419,7 @@ export default function MapPage({
|
||||||
const { listings: actualListings } = useActualListings(mapData.bounds, {
|
const { listings: actualListings } = useActualListings(mapData.bounds, {
|
||||||
filterParam: actualListingsFilterParam,
|
filterParam: actualListingsFilterParam,
|
||||||
travelParam: actualListingsTravelParam,
|
travelParam: actualListingsTravelParam,
|
||||||
|
shareCode,
|
||||||
});
|
});
|
||||||
const [isAreaGroupExpanded, toggleAreaGroup] = useCollapsibleGroups(true);
|
const [isAreaGroupExpanded, toggleAreaGroup] = useCollapsibleGroups(true);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,11 +7,12 @@ const DEBOUNCE_MS = 200;
|
||||||
interface UseActualListingsOptions {
|
interface UseActualListingsOptions {
|
||||||
filterParam?: string;
|
filterParam?: string;
|
||||||
travelParam?: string;
|
travelParam?: string;
|
||||||
|
shareCode?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function useActualListings(
|
export function useActualListings(
|
||||||
bounds: Bounds | null,
|
bounds: Bounds | null,
|
||||||
{ filterParam = '', travelParam = '' }: UseActualListingsOptions = {}
|
{ filterParam = '', travelParam = '', shareCode = '' }: UseActualListingsOptions = {}
|
||||||
) {
|
) {
|
||||||
const [listings, setListings] = useState<ActualListing[]>([]);
|
const [listings, setListings] = useState<ActualListing[]>([]);
|
||||||
const debounceRef = useRef<ReturnType<typeof setTimeout> | null>(null);
|
const debounceRef = useRef<ReturnType<typeof setTimeout> | null>(null);
|
||||||
|
|
@ -38,11 +39,15 @@ export function useActualListings(
|
||||||
const params = new URLSearchParams({ bounds: boundsStr });
|
const params = new URLSearchParams({ bounds: boundsStr });
|
||||||
if (filterParam) params.set('filters', filterParam);
|
if (filterParam) params.set('filters', filterParam);
|
||||||
if (travelParam) params.set('travel', travelParam);
|
if (travelParam) params.set('travel', travelParam);
|
||||||
|
if (shareCode) params.set('share', shareCode);
|
||||||
const res = await fetch(
|
const res = await fetch(
|
||||||
apiUrl('actual-listings', params),
|
apiUrl('actual-listings', params),
|
||||||
authHeaders({ signal: abortControllerRef.current.signal })
|
authHeaders({ signal: abortControllerRef.current.signal })
|
||||||
);
|
);
|
||||||
if (!res.ok) throw new Error(`Actual listings fetch failed: HTTP ${res.status}`);
|
if (!res.ok) {
|
||||||
|
if (requestIdRef.current === requestId) setListings([]);
|
||||||
|
throw new Error(`Actual listings fetch failed: HTTP ${res.status}`);
|
||||||
|
}
|
||||||
const json: ActualListingsResponse = await res.json();
|
const json: ActualListingsResponse = await res.json();
|
||||||
if (requestIdRef.current !== requestId) return;
|
if (requestIdRef.current !== requestId) return;
|
||||||
setListings(json.listings || []);
|
setListings(json.listings || []);
|
||||||
|
|
@ -57,7 +62,7 @@ export function useActualListings(
|
||||||
};
|
};
|
||||||
// listings intentionally excluded — it's internal state, not an input.
|
// listings intentionally excluded — it's internal state, not an input.
|
||||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||||
}, [bounds, filterParam, travelParam]);
|
}, [bounds, filterParam, travelParam, shareCode]);
|
||||||
|
|
||||||
return { listings };
|
return { listings };
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -109,9 +109,6 @@ export function useDeckLayers({
|
||||||
listings: actualListings,
|
listings: actualListings,
|
||||||
zoom,
|
zoom,
|
||||||
isDark,
|
isDark,
|
||||||
hexagonData: data,
|
|
||||||
postcodeData,
|
|
||||||
usePostcodeView,
|
|
||||||
});
|
});
|
||||||
|
|
||||||
// --- Refs for deck.gl accessors ---
|
// --- Refs for deck.gl accessors ---
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,8 @@
|
||||||
import { useCallback, useMemo, useRef, useState } from 'react';
|
import { useCallback, useMemo, useRef, useState } from 'react';
|
||||||
import type { Layer, PickingInfo } from '@deck.gl/core';
|
import type { Layer, PickingInfo } from '@deck.gl/core';
|
||||||
import { ScatterplotLayer, TextLayer } from '@deck.gl/layers';
|
import { ScatterplotLayer, TextLayer } from '@deck.gl/layers';
|
||||||
import { getResolution, latLngToCell } from 'h3-js';
|
|
||||||
|
|
||||||
import type { ActualListing, HexagonData, PostcodeFeature } from '../types';
|
import type { ActualListing } from '../types';
|
||||||
import { trackEvent } from '../lib/analytics';
|
import { trackEvent } from '../lib/analytics';
|
||||||
|
|
||||||
const PRICE_LABEL_MIN_ZOOM = 14;
|
const PRICE_LABEL_MIN_ZOOM = 14;
|
||||||
|
|
@ -19,14 +18,6 @@ interface UseListingLayersProps {
|
||||||
listings: ActualListing[];
|
listings: ActualListing[];
|
||||||
zoom: number;
|
zoom: number;
|
||||||
isDark: boolean;
|
isDark: boolean;
|
||||||
hexagonData: HexagonData[];
|
|
||||||
postcodeData: PostcodeFeature[];
|
|
||||||
usePostcodeView: boolean;
|
|
||||||
}
|
|
||||||
|
|
||||||
function normalizePostcode(value: string | undefined | null): string {
|
|
||||||
if (!value) return '';
|
|
||||||
return value.replace(/\s+/g, '').toUpperCase();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function formatShortPrice(price: number): string {
|
function formatShortPrice(price: number): string {
|
||||||
|
|
@ -35,57 +26,9 @@ function formatShortPrice(price: number): string {
|
||||||
return `£${price}`;
|
return `£${price}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function useListingLayers({
|
export function useListingLayers({ listings, zoom, isDark }: UseListingLayersProps) {
|
||||||
listings,
|
|
||||||
zoom,
|
|
||||||
isDark,
|
|
||||||
hexagonData,
|
|
||||||
postcodeData,
|
|
||||||
usePostcodeView,
|
|
||||||
}: UseListingLayersProps) {
|
|
||||||
const [popupInfo, setPopupInfo] = useState<ListingPopupInfo | null>(null);
|
const [popupInfo, setPopupInfo] = useState<ListingPopupInfo | null>(null);
|
||||||
|
|
||||||
// Split into two memos so the inactive view's data changes don't invalidate
|
|
||||||
// the active filtered list. (e.g. in postcode view, hexagonData updates must
|
|
||||||
// not retrigger filtering / downstream layer rebuilds.)
|
|
||||||
const postcodeFilteredListings = useMemo(() => {
|
|
||||||
if (!usePostcodeView || listings.length === 0) return null;
|
|
||||||
const allowed = new Set<string>();
|
|
||||||
for (const feature of postcodeData) {
|
|
||||||
if (feature.properties.count > 0) {
|
|
||||||
allowed.add(normalizePostcode(feature.properties.postcode));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (allowed.size === 0) return [];
|
|
||||||
return listings.filter((listing) => allowed.has(normalizePostcode(listing.postcode)));
|
|
||||||
}, [listings, postcodeData, usePostcodeView]);
|
|
||||||
|
|
||||||
const hexFilteredListings = useMemo(() => {
|
|
||||||
if (usePostcodeView || listings.length === 0) return null;
|
|
||||||
const allowed = new Set<string>();
|
|
||||||
let cellResolution: number | null = null;
|
|
||||||
for (const cell of hexagonData) {
|
|
||||||
if (cell.count > 0) {
|
|
||||||
allowed.add(cell.h3);
|
|
||||||
if (cellResolution == null) cellResolution = getResolution(cell.h3);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (allowed.size === 0 || cellResolution == null) return [];
|
|
||||||
const resolutionForLookup = cellResolution;
|
|
||||||
return listings.filter((listing) => {
|
|
||||||
try {
|
|
||||||
return allowed.has(latLngToCell(listing.lat, listing.lon, resolutionForLookup));
|
|
||||||
} catch {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}, [listings, hexagonData, usePostcodeView]);
|
|
||||||
|
|
||||||
const visibleListings = useMemo(() => {
|
|
||||||
if (listings.length === 0) return listings;
|
|
||||||
return (usePostcodeView ? postcodeFilteredListings : hexFilteredListings) ?? [];
|
|
||||||
}, [listings, usePostcodeView, postcodeFilteredListings, hexFilteredListings]);
|
|
||||||
|
|
||||||
const handleHover = useCallback((info: PickingInfo<ActualListing>) => {
|
const handleHover = useCallback((info: PickingInfo<ActualListing>) => {
|
||||||
if (info.object && info.x !== undefined && info.y !== undefined) {
|
if (info.object && info.x !== undefined && info.y !== undefined) {
|
||||||
setPopupInfo({ x: info.x, y: info.y, listing: info.object });
|
setPopupInfo({ x: info.x, y: info.y, listing: info.object });
|
||||||
|
|
@ -119,21 +62,21 @@ export function useListingLayers({
|
||||||
() =>
|
() =>
|
||||||
new ScatterplotLayer<ActualListing>({
|
new ScatterplotLayer<ActualListing>({
|
||||||
id: 'actual-listing-shadow',
|
id: 'actual-listing-shadow',
|
||||||
data: visibleListings,
|
data: listings,
|
||||||
getPosition: (d) => [d.lon, d.lat],
|
getPosition: (d) => [d.lon, d.lat],
|
||||||
getRadius: 8,
|
getRadius: 8,
|
||||||
radiusUnits: 'pixels',
|
radiusUnits: 'pixels',
|
||||||
getFillColor: isDark ? [0, 0, 0, 80] : [0, 0, 0, 40],
|
getFillColor: isDark ? [0, 0, 0, 80] : [0, 0, 0, 40],
|
||||||
pickable: false,
|
pickable: false,
|
||||||
}),
|
}),
|
||||||
[visibleListings, isDark]
|
[listings, isDark]
|
||||||
);
|
);
|
||||||
|
|
||||||
const pinLayer = useMemo(
|
const pinLayer = useMemo(
|
||||||
() =>
|
() =>
|
||||||
new ScatterplotLayer<ActualListing>({
|
new ScatterplotLayer<ActualListing>({
|
||||||
id: 'actual-listing-pin',
|
id: 'actual-listing-pin',
|
||||||
data: visibleListings,
|
data: listings,
|
||||||
getPosition: (d) => [d.lon, d.lat],
|
getPosition: (d) => [d.lon, d.lat],
|
||||||
getRadius: 7,
|
getRadius: 7,
|
||||||
radiusUnits: 'pixels',
|
radiusUnits: 'pixels',
|
||||||
|
|
@ -148,12 +91,12 @@ export function useListingLayers({
|
||||||
onHover: stableHover,
|
onHover: stableHover,
|
||||||
onClick: stableClick,
|
onClick: stableClick,
|
||||||
}),
|
}),
|
||||||
[visibleListings, stableHover, stableClick]
|
[listings, stableHover, stableClick]
|
||||||
);
|
);
|
||||||
|
|
||||||
const priceLabelLayer = useMemo(() => {
|
const priceLabelLayer = useMemo(() => {
|
||||||
if (zoom < PRICE_LABEL_MIN_ZOOM) return null;
|
if (zoom < PRICE_LABEL_MIN_ZOOM) return null;
|
||||||
const labeled = visibleListings.filter((l) => l.asking_price && l.asking_price > 0);
|
const labeled = listings.filter((l) => l.asking_price && l.asking_price > 0);
|
||||||
return new TextLayer<ActualListing>({
|
return new TextLayer<ActualListing>({
|
||||||
id: 'actual-listing-price',
|
id: 'actual-listing-price',
|
||||||
data: labeled,
|
data: labeled,
|
||||||
|
|
@ -174,11 +117,11 @@ export function useListingLayers({
|
||||||
sizeMaxPixels: 14,
|
sizeMaxPixels: 14,
|
||||||
pickable: false,
|
pickable: false,
|
||||||
});
|
});
|
||||||
}, [visibleListings, zoom, isDark]);
|
}, [listings, zoom, isDark]);
|
||||||
|
|
||||||
const detailLabelLayer = useMemo(() => {
|
const detailLabelLayer = useMemo(() => {
|
||||||
if (zoom < ADDRESS_LABEL_MIN_ZOOM) return null;
|
if (zoom < ADDRESS_LABEL_MIN_ZOOM) return null;
|
||||||
const labeled = visibleListings.filter((l) => l.address || l.bedrooms != null);
|
const labeled = listings.filter((l) => l.address || l.bedrooms != null);
|
||||||
return new TextLayer<ActualListing>({
|
return new TextLayer<ActualListing>({
|
||||||
id: 'actual-listing-detail',
|
id: 'actual-listing-detail',
|
||||||
data: labeled,
|
data: labeled,
|
||||||
|
|
@ -205,7 +148,7 @@ export function useListingLayers({
|
||||||
sizeMaxPixels: 12,
|
sizeMaxPixels: 12,
|
||||||
pickable: false,
|
pickable: false,
|
||||||
});
|
});
|
||||||
}, [visibleListings, zoom, isDark]);
|
}, [listings, zoom, isDark]);
|
||||||
|
|
||||||
const listingLayers = useMemo(() => {
|
const listingLayers = useMemo(() => {
|
||||||
const layers: Layer[] = [pinShadowLayer, pinLayer];
|
const layers: Layer[] = [pinShadowLayer, pinLayer];
|
||||||
|
|
|
||||||
|
|
@ -132,7 +132,6 @@ export const POI_GROUP_COLORS: Record<string, [number, number, number]> = {
|
||||||
export const POI_CATEGORY_LOGOS: Record<string, string> = {
|
export const POI_CATEGORY_LOGOS: Record<string, string> = {
|
||||||
Airport: '/assets/twemoji/2708.png',
|
Airport: '/assets/twemoji/2708.png',
|
||||||
Aldi: '/assets/poi-icons/logos/aldi.svg',
|
Aldi: '/assets/poi-icons/logos/aldi.svg',
|
||||||
'Allendale Co-operative Society': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
Amazon: '/assets/poi-icons/brands_2024/amazon_fresh.svg',
|
Amazon: '/assets/poi-icons/brands_2024/amazon_fresh.svg',
|
||||||
Asda: '/assets/poi-icons/logos/asda.svg',
|
Asda: '/assets/poi-icons/logos/asda.svg',
|
||||||
'Asda Express': '/assets/poi-icons/logos/asda.svg',
|
'Asda Express': '/assets/poi-icons/logos/asda.svg',
|
||||||
|
|
@ -148,26 +147,18 @@ export const POI_CATEGORY_LOGOS: Record<string, string> = {
|
||||||
'Bus stop': '/assets/twemoji/1f68f.png',
|
'Bus stop': '/assets/twemoji/1f68f.png',
|
||||||
'Butcher & Fishmonger': '/assets/twemoji/1f969.png',
|
'Butcher & Fishmonger': '/assets/twemoji/1f969.png',
|
||||||
Centra: '/assets/poi-icons/logos/centra.svg',
|
Centra: '/assets/poi-icons/logos/centra.svg',
|
||||||
'Central England Co-operative': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
'Chelmsford Star Co-operative Society': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
'Clydebank Co-operative': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
'Co-op': '/assets/poi-icons/logos/coop.svg',
|
'Co-op': '/assets/poi-icons/logos/coop.svg',
|
||||||
'Coniston Co-operative Society': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
COOK: '/assets/poi-icons/brands_2024/cook.svg',
|
COOK: '/assets/poi-icons/brands_2024/cook.svg',
|
||||||
'Convenience Store': '/assets/twemoji/1f3ea.png',
|
'Convenience Store': '/assets/twemoji/1f3ea.png',
|
||||||
Costco: '/assets/poi-icons/logos/costco.svg',
|
Costco: '/assets/poi-icons/logos/costco.svg',
|
||||||
'Deli & Specialty': '/assets/twemoji/1f9c6.png',
|
'Deli & Specialty': '/assets/twemoji/1f9c6.png',
|
||||||
'Dunnes Stores': '/assets/poi-icons/brands_2024/dunnes_stores.svg',
|
'Dunnes Stores': '/assets/poi-icons/brands_2024/dunnes_stores.svg',
|
||||||
'East of England Co-operative': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
Farmfoods: '/assets/poi-icons/brands_2023/supermarkets/farmfoods.svg',
|
Farmfoods: '/assets/poi-icons/brands_2023/supermarkets/farmfoods.svg',
|
||||||
Ferry: '/assets/twemoji/26f4.png',
|
Ferry: '/assets/twemoji/26f4.png',
|
||||||
Greengrocer: '/assets/twemoji/1f96c.png',
|
Greengrocer: '/assets/twemoji/1f96c.png',
|
||||||
'Heart of England Co-operative': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
'Heron Foods': '/assets/poi-icons/brands_2023/supermarkets/heron_foods.svg',
|
'Heron Foods': '/assets/poi-icons/brands_2023/supermarkets/heron_foods.svg',
|
||||||
Iceland: '/assets/poi-icons/brands_2024/iceland.svg',
|
Iceland: '/assets/poi-icons/brands_2024/iceland.svg',
|
||||||
Lidl: '/assets/poi-icons/logos/lidl.svg',
|
Lidl: '/assets/poi-icons/logos/lidl.svg',
|
||||||
'Langdale Co-operative Society': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
'Lincolnshire Co-operative': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
Makro: '/assets/poi-icons/brands_2024/makro.svg',
|
Makro: '/assets/poi-icons/brands_2024/makro.svg',
|
||||||
'M&S': '/assets/poi-icons/brands_2024/mns.svg',
|
'M&S': '/assets/poi-icons/brands_2024/mns.svg',
|
||||||
'M&S Clothing': '/assets/poi-icons/brands_2024/mns.svg',
|
'M&S Clothing': '/assets/poi-icons/brands_2024/mns.svg',
|
||||||
|
|
@ -175,7 +166,6 @@ export const POI_CATEGORY_LOGOS: Record<string, string> = {
|
||||||
'M&S Hospital': '/assets/poi-icons/brands_2024/mns.svg',
|
'M&S Hospital': '/assets/poi-icons/brands_2024/mns.svg',
|
||||||
'M&S MSA': '/assets/poi-icons/brands_2024/mns.svg',
|
'M&S MSA': '/assets/poi-icons/brands_2024/mns.svg',
|
||||||
'M&S Outlet': '/assets/poi-icons/brands_2024/mns.svg',
|
'M&S Outlet': '/assets/poi-icons/brands_2024/mns.svg',
|
||||||
'Midcounties Co-operative': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
Morrisons: '/assets/poi-icons/logos/morrisons.svg',
|
Morrisons: '/assets/poi-icons/logos/morrisons.svg',
|
||||||
'Morrisons Daily': '/assets/poi-icons/brands_2024/morrisons_daily.svg',
|
'Morrisons Daily': '/assets/poi-icons/brands_2024/morrisons_daily.svg',
|
||||||
'Off-Licence': '/assets/twemoji/1f377.png',
|
'Off-Licence': '/assets/twemoji/1f377.png',
|
||||||
|
|
@ -183,16 +173,12 @@ export const POI_CATEGORY_LOGOS: Record<string, string> = {
|
||||||
'Rail station': '/assets/twemoji/1f686.png',
|
'Rail station': '/assets/twemoji/1f686.png',
|
||||||
"Sainsbury's": '/assets/poi-icons/logos/sainsburys.svg',
|
"Sainsbury's": '/assets/poi-icons/logos/sainsburys.svg',
|
||||||
"Sainsbury's Local": '/assets/poi-icons/brands_2024/sainsburys_local.svg',
|
"Sainsbury's Local": '/assets/poi-icons/brands_2024/sainsburys_local.svg',
|
||||||
'Scottish Midland Co-operative': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
Spar: '/assets/poi-icons/logos/spar.svg',
|
Spar: '/assets/poi-icons/logos/spar.svg',
|
||||||
Supermarket: '/assets/twemoji/1f6d2.png',
|
Supermarket: '/assets/twemoji/1f6d2.png',
|
||||||
'Tamworth Co-operative Society': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
Tesco: '/assets/poi-icons/logos/tesco.svg',
|
Tesco: '/assets/poi-icons/logos/tesco.svg',
|
||||||
'Tesco Express': '/assets/poi-icons/logos/tesco_express.svg',
|
'Tesco Express': '/assets/poi-icons/logos/tesco_express.svg',
|
||||||
'Tesco Extra': '/assets/poi-icons/logos/tesco_extra.svg',
|
'Tesco Extra': '/assets/poi-icons/logos/tesco_extra.svg',
|
||||||
'Taxi rank': '/assets/twemoji/1f695.png',
|
'Taxi rank': '/assets/twemoji/1f695.png',
|
||||||
'The Radstock Co-operative Society': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
'The Southern Co-operative': '/assets/poi-icons/logos/coop.svg',
|
|
||||||
'The Food Warehouse': '/assets/poi-icons/logos/the_food_warehouse.png',
|
'The Food Warehouse': '/assets/poi-icons/logos/the_food_warehouse.png',
|
||||||
'Tube station': '/assets/poi-icons/public_transport/london_tube.svg',
|
'Tube station': '/assets/poi-icons/public_transport/london_tube.svg',
|
||||||
Waitrose: '/assets/poi-icons/logos/waitrose.svg',
|
Waitrose: '/assets/poi-icons/logos/waitrose.svg',
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,3 @@
|
||||||
|
from .local_temp import configure_tempfile_defaults
|
||||||
|
|
||||||
|
configure_tempfile_defaults()
|
||||||
|
|
@ -3,6 +3,7 @@ import tempfile
|
||||||
import polars as pl
|
import polars as pl
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.utils import download, extract_zip
|
from pipeline.utils import download, extract_zip
|
||||||
|
|
||||||
URL = "https://www.arcgis.com/sharing/rest/content/items/36b718ad00de49afb9ad364f8b815b9e/data"
|
URL = "https://www.arcgis.com/sharing/rest/content/items/36b718ad00de49afb9ad364f8b815b9e/data"
|
||||||
|
|
@ -40,7 +41,7 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
|
||||||
download_path = Path(cache_dir) / "arcgis_data.zip"
|
download_path = Path(cache_dir) / "arcgis_data.zip"
|
||||||
extract_path = Path(cache_dir) / "arcgis_extracted"
|
extract_path = Path(cache_dir) / "arcgis_extracted"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ from pathlib import Path
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.utils import download, extract_zip
|
from pipeline.utils import download, extract_zip
|
||||||
|
|
||||||
# Ofcom Connected Nations 2025 - Fixed broadband performance (output area & local authority level)
|
# Ofcom Connected Nations 2025 - Fixed broadband performance (output area & local authority level)
|
||||||
|
|
@ -84,7 +85,7 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
|
||||||
cache = Path(cache_dir)
|
cache = Path(cache_dir)
|
||||||
zip_path = cache / "broadband_performance.zip"
|
zip_path = cache / "broadband_performance.zip"
|
||||||
extract_dir = cache / "extracted"
|
extract_dir = cache / "extracted"
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ import tempfile
|
||||||
import polars as pl
|
import polars as pl
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.utils import download
|
from pipeline.utils import download
|
||||||
|
|
||||||
URL = "https://assets.publishing.service.gov.uk/media/691ded34513046b952c500bd/File_5_IoD2025_Scores_for_the_Indices_of_Deprivation.xlsx"
|
URL = "https://assets.publishing.service.gov.uk/media/691ded34513046b952c500bd/File_5_IoD2025_Scores_for_the_Indices_of_Deprivation.xlsx"
|
||||||
|
|
@ -33,7 +34,7 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
|
||||||
xlsx_path = Path(cache_dir) / "IoD2025_Scores.xlsx"
|
xlsx_path = Path(cache_dir) / "IoD2025_Scores.xlsx"
|
||||||
download(URL, xlsx_path, timeout=60)
|
download(URL, xlsx_path, timeout=60)
|
||||||
convert_to_parquet(xlsx_path, args.output)
|
convert_to_parquet(xlsx_path, args.output)
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ from zipfile import ZipFile
|
||||||
|
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.utils.download import download
|
from pipeline.utils.download import download
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -70,7 +71,9 @@ def download_geolytix_retail_points(output_path: Path) -> None:
|
||||||
"""Download the GEOLYTIX ZIP, extract the latest CSV, and write parquet."""
|
"""Download the GEOLYTIX ZIP, extract the latest CSV, and write parquet."""
|
||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
with TemporaryDirectory(prefix="geolytix_retail_points_") as tmp:
|
with TemporaryDirectory(
|
||||||
|
prefix="geolytix_retail_points_", dir=local_tmp_dir()
|
||||||
|
) as tmp:
|
||||||
zip_path = Path(tmp) / "geolytix_retail_points.zip"
|
zip_path = Path(tmp) / "geolytix_retail_points.zip"
|
||||||
download(GEOLYTIX_RETAIL_POINTS_URL, zip_path, timeout=300)
|
download(GEOLYTIX_RETAIL_POINTS_URL, zip_path, timeout=300)
|
||||||
df = read_latest_csv(zip_path)
|
df = read_latest_csv(zip_path)
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,8 @@ from pyproj import Transformer
|
||||||
from rasterio.transform import rowcol
|
from rasterio.transform import rowcol
|
||||||
from scipy.ndimage import maximum_filter
|
from scipy.ndimage import maximum_filter
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
|
|
||||||
# Noise sources:
|
# Noise sources:
|
||||||
# (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
|
# (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
|
||||||
# Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500
|
# Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500
|
||||||
|
|
@ -437,7 +439,7 @@ def main() -> None:
|
||||||
|
|
||||||
result = postcodes.select("postcode")
|
result = postcodes.select("postcode")
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
|
||||||
for (
|
for (
|
||||||
label,
|
label,
|
||||||
col_name,
|
col_name,
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ import tempfile
|
||||||
import polars as pl
|
import polars as pl
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.utils import download
|
from pipeline.utils import download
|
||||||
|
|
||||||
# Management information - state-funded schools - latest inspections (as at 28 Feb 2026)
|
# Management information - state-funded schools - latest inspections (as at 28 Feb 2026)
|
||||||
|
|
@ -36,7 +37,7 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
|
||||||
csv_path = Path(cache_dir) / "ofsted_latest_inspections.csv"
|
csv_path = Path(cache_dir) / "ofsted_latest_inspections.csv"
|
||||||
download(URL, csv_path, timeout=60)
|
download(URL, csv_path, timeout=60)
|
||||||
convert_to_parquet(csv_path, args.output)
|
convert_to_parquet(csv_path, args.output)
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,7 @@ from pyproj import Transformer
|
||||||
from shapely.errors import GEOSException
|
from shapely.errors import GEOSException
|
||||||
from shapely.geometry import shape as to_shapely
|
from shapely.geometry import shape as to_shapely
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.utils.download import download, extract_zip
|
from pipeline.utils.download import download, extract_zip
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -171,7 +172,7 @@ def _read_site_centroids(
|
||||||
def download_greenspace(output: Path) -> None:
|
def download_greenspace(output: Path) -> None:
|
||||||
output.parent.mkdir(parents=True, exist_ok=True)
|
output.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
|
||||||
zip_path = Path(cache_dir) / "greenspace.zip"
|
zip_path = Path(cache_dir) / "greenspace.zip"
|
||||||
extract_dir = Path(cache_dir) / "extracted"
|
extract_dir = Path(cache_dir) / "extracted"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ from shapely.geometry import Point
|
||||||
from shapely.wkb import loads as load_wkb
|
from shapely.wkb import loads as load_wkb
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.utils.england_geometry import (
|
from pipeline.utils.england_geometry import (
|
||||||
ENGLAND_BBOX_EAST,
|
ENGLAND_BBOX_EAST,
|
||||||
ENGLAND_BBOX_NORTH,
|
ENGLAND_BBOX_NORTH,
|
||||||
|
|
@ -184,7 +185,7 @@ def main() -> None:
|
||||||
|
|
||||||
england_polygon = load_england_polygon(args.boundary)
|
england_polygon = load_england_polygon(args.boundary)
|
||||||
|
|
||||||
tmp_dir = Path(mkdtemp(prefix="pois_"))
|
tmp_dir = Path(mkdtemp(prefix="pois_", dir=local_tmp_dir()))
|
||||||
with tqdm(
|
with tqdm(
|
||||||
unit=" elements",
|
unit=" elements",
|
||||||
unit_scale=True,
|
unit_scale=True,
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ import tarfile
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.utils import download
|
from pipeline.utils import download
|
||||||
|
|
||||||
URL = "https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2"
|
URL = "https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2"
|
||||||
|
|
@ -37,7 +38,7 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
|
||||||
cache = Path(cache_dir)
|
cache = Path(cache_dir)
|
||||||
archive_path = cache / "gb-postcodes-v5.tar.bz2"
|
archive_path = cache / "gb-postcodes-v5.tar.bz2"
|
||||||
extract_dir = cache / "extracted"
|
extract_dir = cache / "extracted"
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ import tempfile
|
||||||
import polars as pl
|
import polars as pl
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.utils import download
|
from pipeline.utils import download
|
||||||
|
|
||||||
URL = "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv"
|
URL = "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv"
|
||||||
|
|
@ -55,7 +56,7 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
|
||||||
csv_path = Path(cache_dir) / "price-paid-complete.csv"
|
csv_path = Path(cache_dir) / "price-paid-complete.csv"
|
||||||
|
|
||||||
download(URL, csv_path)
|
download(URL, csv_path)
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ from pathlib import Path
|
||||||
|
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.utils import download
|
from pipeline.utils import download
|
||||||
|
|
||||||
URL = "https://www.ons.gov.uk/file?uri=/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics/25march2026/priceindexofprivaterentsukmonthlypricestatistics.xlsx"
|
URL = "https://www.ons.gov.uk/file?uri=/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics/25march2026/priceindexofprivaterentsukmonthlypricestatistics.xlsx"
|
||||||
|
|
@ -114,7 +115,7 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
|
||||||
xlsx_path = Path(cache_dir) / "pipr_monthly.xlsx"
|
xlsx_path = Path(cache_dir) / "pipr_monthly.xlsx"
|
||||||
download(URL, xlsx_path, timeout=120)
|
download(URL, xlsx_path, timeout=120)
|
||||||
convert_to_parquet(xlsx_path, args.output)
|
convert_to_parquet(xlsx_path, args.output)
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,8 @@ from pathlib import Path
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
|
|
||||||
ENGLAND_PBF_URL = (
|
ENGLAND_PBF_URL = (
|
||||||
"https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf"
|
"https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf"
|
||||||
)
|
)
|
||||||
|
|
@ -164,7 +166,10 @@ def clean_gtfs(src: Path, dst: Path) -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
tmp = tempfile.NamedTemporaryFile(
|
tmp = tempfile.NamedTemporaryFile(
|
||||||
mode="wb", delete=False, suffix=".txt"
|
mode="wb",
|
||||||
|
delete=False,
|
||||||
|
suffix=".txt",
|
||||||
|
dir=local_tmp_dir(),
|
||||||
)
|
)
|
||||||
tmp.write(header)
|
tmp.write(header)
|
||||||
|
|
||||||
|
|
@ -388,7 +393,10 @@ def convert_high_freq_to_frequency_based(
|
||||||
trip_id_idx = cols.index("trip_id")
|
trip_id_idx = cols.index("trip_id")
|
||||||
|
|
||||||
tmp = tempfile.NamedTemporaryFile(
|
tmp = tempfile.NamedTemporaryFile(
|
||||||
mode="wb", delete=False, suffix=".txt"
|
mode="wb",
|
||||||
|
delete=False,
|
||||||
|
suffix=".txt",
|
||||||
|
dir=local_tmp_dir(),
|
||||||
)
|
)
|
||||||
tmp.write(header)
|
tmp.write(header)
|
||||||
for line in f:
|
for line in f:
|
||||||
|
|
@ -408,7 +416,10 @@ def convert_high_freq_to_frequency_based(
|
||||||
trip_id_idx = cols.index("trip_id")
|
trip_id_idx = cols.index("trip_id")
|
||||||
|
|
||||||
tmp = tempfile.NamedTemporaryFile(
|
tmp = tempfile.NamedTemporaryFile(
|
||||||
mode="wb", delete=False, suffix=".txt"
|
mode="wb",
|
||||||
|
delete=False,
|
||||||
|
suffix=".txt",
|
||||||
|
dir=local_tmp_dir(),
|
||||||
)
|
)
|
||||||
tmp.write(header)
|
tmp.write(header)
|
||||||
for line in f:
|
for line in f:
|
||||||
|
|
@ -451,8 +462,8 @@ def download_tfl_transxchange(raw_dir: Path) -> Path:
|
||||||
|
|
||||||
|
|
||||||
def download_naptan() -> None:
|
def download_naptan() -> None:
|
||||||
"""Download NaPTAN stops to /tmp/Stops.csv (needed by transxchange2gtfs)."""
|
"""Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
|
||||||
dest = Path("/tmp/Stops.csv")
|
dest = local_tmp_dir() / "Stops.csv"
|
||||||
if dest.exists():
|
if dest.exists():
|
||||||
print(f"NaPTAN Stops.csv already exists: {dest}")
|
print(f"NaPTAN Stops.csv already exists: {dest}")
|
||||||
return
|
return
|
||||||
|
|
@ -661,7 +672,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
tmp = tempfile.NamedTemporaryFile(
|
tmp = tempfile.NamedTemporaryFile(
|
||||||
mode="wb", delete=False, suffix=".txt"
|
mode="wb",
|
||||||
|
delete=False,
|
||||||
|
suffix=".txt",
|
||||||
|
dir=local_tmp_dir(),
|
||||||
)
|
)
|
||||||
tmp.write(header)
|
tmp.write(header)
|
||||||
|
|
||||||
|
|
@ -718,7 +732,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
|
||||||
lon_idx = cols.index("stop_lon")
|
lon_idx = cols.index("stop_lon")
|
||||||
|
|
||||||
tmp = tempfile.NamedTemporaryFile(
|
tmp = tempfile.NamedTemporaryFile(
|
||||||
mode="wb", delete=False, suffix=".txt"
|
mode="wb",
|
||||||
|
delete=False,
|
||||||
|
suffix=".txt",
|
||||||
|
dir=local_tmp_dir(),
|
||||||
)
|
)
|
||||||
tmp.write(header)
|
tmp.write(header)
|
||||||
|
|
||||||
|
|
@ -749,7 +766,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
|
||||||
rt_idx = cols.index("route_type")
|
rt_idx = cols.index("route_type")
|
||||||
|
|
||||||
tmp = tempfile.NamedTemporaryFile(
|
tmp = tempfile.NamedTemporaryFile(
|
||||||
mode="wb", delete=False, suffix=".txt"
|
mode="wb",
|
||||||
|
delete=False,
|
||||||
|
suffix=".txt",
|
||||||
|
dir=local_tmp_dir(),
|
||||||
)
|
)
|
||||||
tmp.write(header)
|
tmp.write(header)
|
||||||
|
|
||||||
|
|
@ -774,7 +794,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
|
||||||
trip_id_idx = cols.index("trip_id")
|
trip_id_idx = cols.index("trip_id")
|
||||||
|
|
||||||
tmp = tempfile.NamedTemporaryFile(
|
tmp = tempfile.NamedTemporaryFile(
|
||||||
mode="wb", delete=False, suffix=".txt"
|
mode="wb",
|
||||||
|
delete=False,
|
||||||
|
suffix=".txt",
|
||||||
|
dir=local_tmp_dir(),
|
||||||
)
|
)
|
||||||
tmp.write(header)
|
tmp.write(header)
|
||||||
|
|
||||||
|
|
@ -797,7 +820,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
|
||||||
end_idx = cols.index("end_date")
|
end_idx = cols.index("end_date")
|
||||||
|
|
||||||
tmp = tempfile.NamedTemporaryFile(
|
tmp = tempfile.NamedTemporaryFile(
|
||||||
mode="wb", delete=False, suffix=".txt"
|
mode="wb",
|
||||||
|
delete=False,
|
||||||
|
suffix=".txt",
|
||||||
|
dir=local_tmp_dir(),
|
||||||
)
|
)
|
||||||
tmp.write(header)
|
tmp.write(header)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,16 @@ if (!pkgDirArg || converterArgs.length < 2) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const pkgDir = path.resolve(pkgDirArg);
|
const pkgDir = path.resolve(pkgDirArg);
|
||||||
|
const defaultTmpDir = path.resolve(__dirname, "..", "..", ".tmp");
|
||||||
|
const localTmpDir =
|
||||||
|
process.env.TMPDIR || process.env.TEMP || process.env.TMP || defaultTmpDir;
|
||||||
|
const stopsCsv = path.join(localTmpDir, "Stops.csv");
|
||||||
|
const converterTmpPrefix = path.join(localTmpDir, "transxchange2gtfs_");
|
||||||
|
const converterTmpPatch =
|
||||||
|
`static TMP = ${JSON.stringify(converterTmpPrefix)}` +
|
||||||
|
` + process.pid + ${JSON.stringify(path.sep)};`;
|
||||||
|
|
||||||
|
fs.mkdirSync(localTmpDir, { recursive: true });
|
||||||
|
|
||||||
function replaceOnce(relativePath, before, after) {
|
function replaceOnce(relativePath, before, after) {
|
||||||
const file = path.join(pkgDir, relativePath);
|
const file = path.join(pkgDir, relativePath);
|
||||||
|
|
@ -37,6 +47,26 @@ function replaceOnce(relativePath, before, after) {
|
||||||
// GTFS shapes are optional for R5 routing. Clear shape references and omit
|
// GTFS shapes are optional for R5 routing. Clear shape references and omit
|
||||||
// shapes.txt so missing route geometry does not drop otherwise usable trips.
|
// shapes.txt so missing route geometry does not drop otherwise usable trips.
|
||||||
function patchPackage() {
|
function patchPackage() {
|
||||||
|
replaceOnce(
|
||||||
|
"dist/Container.js",
|
||||||
|
"static TMP = `/tmp/transxchange2gtfs_${process.pid}/`;",
|
||||||
|
converterTmpPatch,
|
||||||
|
);
|
||||||
|
replaceOnce(
|
||||||
|
"dist/Container.js",
|
||||||
|
'fs.existsSync("/tmp/Stops.csv")',
|
||||||
|
`fs.existsSync(${JSON.stringify(stopsCsv)})`,
|
||||||
|
);
|
||||||
|
replaceOnce(
|
||||||
|
"dist/Container.js",
|
||||||
|
'fs.createReadStream("/tmp/Stops.csv", "utf8")',
|
||||||
|
`fs.createReadStream(${JSON.stringify(stopsCsv)}, "utf8")`,
|
||||||
|
);
|
||||||
|
replaceOnce(
|
||||||
|
"dist/converter/GetStopData.js",
|
||||||
|
'fs.createWriteStream("/tmp/Stops.csv")',
|
||||||
|
`fs.createWriteStream(${JSON.stringify(stopsCsv)})`,
|
||||||
|
);
|
||||||
replaceOnce(
|
replaceOnce(
|
||||||
"dist/transxchange/TransXChangeJourneyStream.js",
|
"dist/transxchange/TransXChangeJourneyStream.js",
|
||||||
"distanceSoFarM += routeLink.Distance;",
|
"distanceSoFarM += routeLink.Distance;",
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ from pathlib import Path
|
||||||
|
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
from pipeline.utils import download, extract_zip
|
from pipeline.utils import download, extract_zip
|
||||||
|
|
||||||
URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
|
URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
|
||||||
|
|
@ -62,7 +63,7 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
|
||||||
zip_path = Path(cache_dir) / "uprn_lookup.zip"
|
zip_path = Path(cache_dir) / "uprn_lookup.zip"
|
||||||
extract_path = Path(cache_dir) / "uprn_extracted"
|
extract_path = Path(cache_dir) / "uprn_extracted"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,8 @@ import pyarrow as pa
|
||||||
import pyarrow.csv as pa_csv
|
import pyarrow.csv as pa_csv
|
||||||
import pyarrow.parquet as pq
|
import pyarrow.parquet as pq
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
|
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
fuzzy_join_on_postcode,
|
fuzzy_join_on_postcode,
|
||||||
normalize_address_key,
|
normalize_address_key,
|
||||||
|
|
@ -192,7 +194,9 @@ def main():
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory(prefix="epc_certificates_") as tmpdir:
|
with tempfile.TemporaryDirectory(
|
||||||
|
prefix="epc_certificates_", dir=local_tmp_dir()
|
||||||
|
) as tmpdir:
|
||||||
_run(args.epc, args.price_paid, args.output, Path(tmpdir))
|
_run(args.epc, args.price_paid, args.output, Path(tmpdir))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@ from pathlib import Path
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
|
|
||||||
from .memory import release_memory
|
from .memory import release_memory
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -17,7 +19,9 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]
|
||||||
print("Loading UPRN lookup...")
|
print("Loading UPRN lookup...")
|
||||||
|
|
||||||
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
|
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
|
||||||
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
|
with tempfile.NamedTemporaryFile(
|
||||||
|
suffix=".parquet", delete=False, dir=local_tmp_dir()
|
||||||
|
) as tmp:
|
||||||
tmp_path = Path(tmp.name)
|
tmp_path = Path(tmp.name)
|
||||||
(
|
(
|
||||||
pl.scan_parquet(uprn_path)
|
pl.scan_parquet(uprn_path)
|
||||||
|
|
|
||||||
|
|
@ -79,6 +79,39 @@ def test_transform_grocery_retail_points_keeps_fascia_icon_category():
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_grocery_retail_points_merges_cooperative_societies():
|
||||||
|
raw = pl.DataFrame(
|
||||||
|
{
|
||||||
|
"id": [101, 102, 103],
|
||||||
|
"retailer": [
|
||||||
|
"Central England Co-operative",
|
||||||
|
"Lincolnshire Co-operative",
|
||||||
|
"The Southern Co-operative",
|
||||||
|
],
|
||||||
|
"fascia": [
|
||||||
|
"Central England Co-operative",
|
||||||
|
"The Co-operative Food",
|
||||||
|
None,
|
||||||
|
],
|
||||||
|
"store_name": [
|
||||||
|
"Central Co-op Test",
|
||||||
|
"Lincolnshire Co-op Test",
|
||||||
|
"Southern Co-op Test",
|
||||||
|
],
|
||||||
|
"long_wgs": [-0.141, -0.142, -0.143],
|
||||||
|
"lat_wgs": [51.515, 51.516, 51.517],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
pois = transform_grocery_retail_points(raw, min_chain_locations=1)
|
||||||
|
|
||||||
|
assert pois.select("category", "icon_category").to_dicts() == [
|
||||||
|
{"category": "Co-op", "icon_category": "Co-op"},
|
||||||
|
{"category": "Co-op", "icon_category": "Co-op"},
|
||||||
|
{"category": "Co-op", "icon_category": "Co-op"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_transform_grocery_retail_points_accepts_base_fascias():
|
def test_transform_grocery_retail_points_accepts_base_fascias():
|
||||||
raw = pl.DataFrame(
|
raw = pl.DataFrame(
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -623,6 +623,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
||||||
"shop/outpost",
|
"shop/outpost",
|
||||||
"shop/pawnbroker",
|
"shop/pawnbroker",
|
||||||
"shop/photo",
|
"shop/photo",
|
||||||
|
"shop/photo_studio",
|
||||||
"shop/plant_hire",
|
"shop/plant_hire",
|
||||||
"shop/printer_ink",
|
"shop/printer_ink",
|
||||||
"shop/printing",
|
"shop/printing",
|
||||||
|
|
@ -843,6 +844,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
||||||
[
|
[
|
||||||
"healthcare/physiotherapist",
|
"healthcare/physiotherapist",
|
||||||
"healthcare/podiatrist",
|
"healthcare/podiatrist",
|
||||||
|
"healthcare/occupational_therapist",
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
|
|
@ -1171,7 +1173,6 @@ GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = {
|
||||||
"Heron": "Heron Foods",
|
"Heron": "Heron Foods",
|
||||||
"Marks and Spencer": "M&S",
|
"Marks and Spencer": "M&S",
|
||||||
"Sainsburys": "Sainsbury's",
|
"Sainsburys": "Sainsbury's",
|
||||||
"The Co-operative Group": "Co-op",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1238,6 +1239,8 @@ def normalize_grocery_retailer(retailer: str | None) -> str:
|
||||||
if retailer is None:
|
if retailer is None:
|
||||||
return ""
|
return ""
|
||||||
retailer = retailer.strip()
|
retailer = retailer.strip()
|
||||||
|
if retailer in COOP_RETAILERS:
|
||||||
|
return "Co-op"
|
||||||
return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)
|
return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,8 @@ import polars as pl
|
||||||
from thefuzz import fuzz
|
from thefuzz import fuzz
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from pipeline.local_temp import local_tmp_dir
|
||||||
|
|
||||||
_NUMBER_RE = re.compile(r"\d+")
|
_NUMBER_RE = re.compile(r"\d+")
|
||||||
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
|
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
|
||||||
MIN_FUZZY_SCORE = 60
|
MIN_FUZZY_SCORE = 60
|
||||||
|
|
@ -57,7 +59,7 @@ def fuzzy_join_on_postcode(
|
||||||
have null right columns.
|
have null right columns.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
|
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_", dir=local_tmp_dir())
|
||||||
left_path = Path(tmpdir) / "left.parquet"
|
left_path = Path(tmpdir) / "left.parquet"
|
||||||
right_path = Path(tmpdir) / "right.parquet"
|
right_path = Path(tmpdir) / "right.parquet"
|
||||||
|
|
||||||
|
|
|
||||||
2
property-data/.gitignore
vendored
2
property-data/.gitignore
vendored
|
|
@ -1,2 +0,0 @@
|
||||||
*
|
|
||||||
!.gitignore
|
|
||||||
|
|
@ -6,6 +6,8 @@ use polars::prelude::*;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
|
use crate::consts::{NAN_U16, QUANT_SCALE};
|
||||||
|
use crate::data::{PropertyData, QuantRef};
|
||||||
use crate::utils::{normalize_postcode, GridIndex, InternedColumn};
|
use crate::utils::{normalize_postcode, GridIndex, InternedColumn};
|
||||||
|
|
||||||
const GRID_CELL_SIZE: f32 = 0.01;
|
const GRID_CELL_SIZE: f32 = 0.01;
|
||||||
|
|
@ -52,15 +54,22 @@ pub struct ActualListingData {
|
||||||
pub listing_status: InternedColumn,
|
pub listing_status: InternedColumn,
|
||||||
pub listing_date_iso: Vec<Option<String>>,
|
pub listing_date_iso: Vec<Option<String>>,
|
||||||
pub features: Vec<Vec<String>>,
|
pub features: Vec<Vec<String>>,
|
||||||
|
/// Row-major feature matrix aligned with PropertyData::feature_names.
|
||||||
|
///
|
||||||
|
/// Rows start from a best-effort address/postcode join to the historical property
|
||||||
|
/// dataset, then live listing fields such as asking price and property type are
|
||||||
|
/// overlaid where available. This lets the listings endpoint use the same filter
|
||||||
|
/// execution path as the property endpoints.
|
||||||
|
pub filter_feature_data: Vec<u16>,
|
||||||
pub grid: GridIndex,
|
pub grid: GridIndex,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ActualListingData {
|
impl ActualListingData {
|
||||||
pub fn load(parquet_path: &Path) -> Result<Self> {
|
pub fn load(parquet_path: &Path, property_data: &PropertyData) -> Result<Self> {
|
||||||
super::run_polars_io(|| Self::load_inner(parquet_path))
|
super::run_polars_io(|| Self::load_inner(parquet_path, Some(property_data)))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn load_inner(parquet_path: &Path) -> Result<Self> {
|
fn load_inner(parquet_path: &Path, property_data: Option<&PropertyData>) -> Result<Self> {
|
||||||
info!("Loading actual listings from {:?}", parquet_path);
|
info!("Loading actual listings from {:?}", parquet_path);
|
||||||
let pl_path = PlRefPath::try_from_path(parquet_path)
|
let pl_path = PlRefPath::try_from_path(parquet_path)
|
||||||
.context("Failed to normalize actual listings parquet path")?;
|
.context("Failed to normalize actual listings parquet path")?;
|
||||||
|
|
@ -99,6 +108,18 @@ impl ActualListingData {
|
||||||
let price_qualifier = InternedColumn::build(&opt_to_string(&price_qualifier_raw));
|
let price_qualifier = InternedColumn::build(&opt_to_string(&price_qualifier_raw));
|
||||||
let listing_status = InternedColumn::build(&opt_to_string(&listing_status_raw));
|
let listing_status = InternedColumn::build(&opt_to_string(&listing_status_raw));
|
||||||
|
|
||||||
|
let filter_feature_data = build_filter_feature_data(
|
||||||
|
property_data,
|
||||||
|
&postcode,
|
||||||
|
&address,
|
||||||
|
&property_type_raw,
|
||||||
|
&leasehold_freehold_raw,
|
||||||
|
&rooms_total,
|
||||||
|
&floor_area_sqm,
|
||||||
|
&asking_price,
|
||||||
|
&asking_price_per_sqm,
|
||||||
|
);
|
||||||
|
|
||||||
let grid = GridIndex::build(&lat, &lon, GRID_CELL_SIZE);
|
let grid = GridIndex::build(&lat, &lon, GRID_CELL_SIZE);
|
||||||
|
|
||||||
info!(rows = row_count, "Actual listings loaded");
|
info!(rows = row_count, "Actual listings loaded");
|
||||||
|
|
@ -122,6 +143,7 @@ impl ActualListingData {
|
||||||
listing_status,
|
listing_status,
|
||||||
listing_date_iso,
|
listing_date_iso,
|
||||||
features,
|
features,
|
||||||
|
filter_feature_data,
|
||||||
grid,
|
grid,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
@ -150,6 +172,201 @@ impl ActualListingData {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
fn build_filter_feature_data(
|
||||||
|
property_data: Option<&PropertyData>,
|
||||||
|
postcode: &[String],
|
||||||
|
address: &[Option<String>],
|
||||||
|
property_type: &[Option<String>],
|
||||||
|
leasehold_freehold: &[Option<String>],
|
||||||
|
rooms_total: &[Option<i32>],
|
||||||
|
floor_area_sqm: &[Option<f32>],
|
||||||
|
asking_price: &[Option<i64>],
|
||||||
|
asking_price_per_sqm: &[Option<f32>],
|
||||||
|
) -> Vec<u16> {
|
||||||
|
let Some(property_data) = property_data else {
|
||||||
|
return Vec::new();
|
||||||
|
};
|
||||||
|
|
||||||
|
let num_features = property_data.num_features;
|
||||||
|
let mut feature_data = vec![NAN_U16; postcode.len() * num_features];
|
||||||
|
let mut joined_rows = 0usize;
|
||||||
|
|
||||||
|
for (row, postcode_value) in postcode.iter().enumerate() {
|
||||||
|
let Some(address_value) = address[row]
|
||||||
|
.as_deref()
|
||||||
|
.map(str::trim)
|
||||||
|
.filter(|v| !v.is_empty())
|
||||||
|
else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
let query = format!("{address_value} {postcode_value}");
|
||||||
|
let Some(&property_row) = property_data.search_addresses(&query, 1).first() else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
if property_data.postcode(property_row) != postcode_value {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let dst = row * num_features;
|
||||||
|
let src = property_row * num_features;
|
||||||
|
feature_data[dst..dst + num_features]
|
||||||
|
.copy_from_slice(&property_data.feature_data[src..src + num_features]);
|
||||||
|
joined_rows += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
let quant = property_data.quant_ref();
|
||||||
|
overlay_numeric_feature(
|
||||||
|
&mut feature_data,
|
||||||
|
property_data,
|
||||||
|
&quant,
|
||||||
|
"Total floor area (sqm)",
|
||||||
|
floor_area_sqm.iter().copied(),
|
||||||
|
false,
|
||||||
|
);
|
||||||
|
overlay_numeric_feature(
|
||||||
|
&mut feature_data,
|
||||||
|
property_data,
|
||||||
|
&quant,
|
||||||
|
"Number of bedrooms & living rooms",
|
||||||
|
rooms_total.iter().map(|value| value.map(|v| v as f32)),
|
||||||
|
false,
|
||||||
|
);
|
||||||
|
overlay_numeric_feature(
|
||||||
|
&mut feature_data,
|
||||||
|
property_data,
|
||||||
|
&quant,
|
||||||
|
"Estimated current price",
|
||||||
|
asking_price.iter().map(|value| value.map(|v| v as f32)),
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
overlay_numeric_feature(
|
||||||
|
&mut feature_data,
|
||||||
|
property_data,
|
||||||
|
&quant,
|
||||||
|
"Last known price",
|
||||||
|
asking_price.iter().map(|value| value.map(|v| v as f32)),
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
overlay_numeric_feature(
|
||||||
|
&mut feature_data,
|
||||||
|
property_data,
|
||||||
|
&quant,
|
||||||
|
"Est. price per sqm",
|
||||||
|
asking_price_per_sqm.iter().copied(),
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
overlay_numeric_feature(
|
||||||
|
&mut feature_data,
|
||||||
|
property_data,
|
||||||
|
&quant,
|
||||||
|
"Price per sqm",
|
||||||
|
asking_price_per_sqm.iter().copied(),
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
overlay_enum_feature(
|
||||||
|
&mut feature_data,
|
||||||
|
property_data,
|
||||||
|
"Property type",
|
||||||
|
property_type.iter().map(Option::as_deref),
|
||||||
|
false,
|
||||||
|
);
|
||||||
|
overlay_enum_feature(
|
||||||
|
&mut feature_data,
|
||||||
|
property_data,
|
||||||
|
"Leasehold/Freehold",
|
||||||
|
leasehold_freehold.iter().map(Option::as_deref),
|
||||||
|
false,
|
||||||
|
);
|
||||||
|
|
||||||
|
info!(
|
||||||
|
rows = postcode.len(),
|
||||||
|
joined_rows, "Actual listings joined to property feature matrix"
|
||||||
|
);
|
||||||
|
|
||||||
|
feature_data
|
||||||
|
}
|
||||||
|
|
||||||
|
fn feature_index(property_data: &PropertyData, name: &str) -> Option<usize> {
|
||||||
|
property_data
|
||||||
|
.feature_names
|
||||||
|
.iter()
|
||||||
|
.position(|candidate| candidate == name)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn overlay_numeric_feature<I>(
|
||||||
|
feature_data: &mut [u16],
|
||||||
|
property_data: &PropertyData,
|
||||||
|
quant: &QuantRef<'_>,
|
||||||
|
name: &str,
|
||||||
|
values: I,
|
||||||
|
clear_missing: bool,
|
||||||
|
) where
|
||||||
|
I: IntoIterator<Item = Option<f32>>,
|
||||||
|
{
|
||||||
|
let Some(feat_idx) = feature_index(property_data, name) else {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
if feat_idx >= property_data.num_numeric {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let num_features = property_data.num_features;
|
||||||
|
for (row, value) in values.into_iter().enumerate() {
|
||||||
|
let dst = row * num_features + feat_idx;
|
||||||
|
match value {
|
||||||
|
Some(value) => feature_data[dst] = encode_numeric_value(quant, feat_idx, value),
|
||||||
|
None if clear_missing => feature_data[dst] = NAN_U16,
|
||||||
|
None => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn overlay_enum_feature<'a, I>(
|
||||||
|
feature_data: &mut [u16],
|
||||||
|
property_data: &PropertyData,
|
||||||
|
name: &str,
|
||||||
|
values: I,
|
||||||
|
clear_missing: bool,
|
||||||
|
) where
|
||||||
|
I: IntoIterator<Item = Option<&'a str>>,
|
||||||
|
{
|
||||||
|
let Some(feat_idx) = feature_index(property_data, name) else {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
let Some(enum_values) = property_data.enum_values.get(&feat_idx) else {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
|
let num_features = property_data.num_features;
|
||||||
|
for (row, value) in values.into_iter().enumerate() {
|
||||||
|
let dst = row * num_features + feat_idx;
|
||||||
|
let encoded = value
|
||||||
|
.map(str::trim)
|
||||||
|
.filter(|text| !text.is_empty())
|
||||||
|
.and_then(|text| enum_values.iter().position(|candidate| candidate == text))
|
||||||
|
.map(|position| position as u16);
|
||||||
|
match encoded {
|
||||||
|
Some(value) => feature_data[dst] = value,
|
||||||
|
None if clear_missing => feature_data[dst] = NAN_U16,
|
||||||
|
None => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn encode_numeric_value(quant: &QuantRef<'_>, feat_idx: usize, value: f32) -> u16 {
|
||||||
|
if !value.is_finite() {
|
||||||
|
return NAN_U16;
|
||||||
|
}
|
||||||
|
let range = quant.quant_range[feat_idx];
|
||||||
|
if range <= 0.0 {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
let normalized = (value - quant.quant_min[feat_idx]) / range;
|
||||||
|
(normalized * QUANT_SCALE).round().clamp(0.0, QUANT_SCALE) as u16
|
||||||
|
}
|
||||||
|
|
||||||
fn opt_to_string(values: &[Option<String>]) -> Vec<String> {
|
fn opt_to_string(values: &[Option<String>]) -> Vec<String> {
|
||||||
values
|
values
|
||||||
.iter()
|
.iter()
|
||||||
|
|
@ -311,7 +528,7 @@ mod tests {
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
|
||||||
let data = ActualListingData::load(&path).expect("listings load");
|
let data = ActualListingData::load_inner(&path, None).expect("listings load");
|
||||||
assert!(!data.lat.is_empty());
|
assert!(!data.lat.is_empty());
|
||||||
assert_eq!(data.lat.len(), data.lon.len());
|
assert_eq!(data.lat.len(), data.lon.len());
|
||||||
assert_eq!(data.lat.len(), data.postcode.len());
|
assert_eq!(data.lat.len(), data.postcode.len());
|
||||||
|
|
|
||||||
|
|
@ -30,16 +30,6 @@ const GROCERY_DASHBOARD_CATEGORIES: &[&str] = &[
|
||||||
"Budgens",
|
"Budgens",
|
||||||
"Centra",
|
"Centra",
|
||||||
"Co-op",
|
"Co-op",
|
||||||
"Central England Co-operative",
|
|
||||||
"Chelmsford Star Co-operative Society",
|
|
||||||
"East of England Co-operative",
|
|
||||||
"Heart of England Co-operative",
|
|
||||||
"Lincolnshire Co-operative",
|
|
||||||
"Midcounties Co-operative",
|
|
||||||
"Scottish Midland Co-operative",
|
|
||||||
"Tamworth Co-operative Society",
|
|
||||||
"The Radstock Co-operative Society",
|
|
||||||
"The Southern Co-operative",
|
|
||||||
"COOK",
|
"COOK",
|
||||||
"Costco",
|
"Costco",
|
||||||
"Dunnes Stores",
|
"Dunnes Stores",
|
||||||
|
|
@ -104,10 +94,35 @@ fn add_category_filter_index(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Maps a POI category onto its canonical dashboard category.
///
/// Every co-operative society / fascia name in the alias table collapses to
/// `"Co-op"`; any other category is returned unchanged. Matching is exact (no
/// trimming or case folding is done here).
fn canonical_poi_category(category: &str) -> &str {
    /// Names that are all reported under the single "Co-op" category.
    const COOP_ALIASES: &[&str] = &[
        "Allendale Co-operative Society",
        "Central England Co-operative",
        "Channel Islands Co-operative Society",
        "Chelmsford Star Co-operative Society",
        "Clydebank Co-operative",
        "Coniston Co-operative Society",
        "Co-op Food",
        "East of England Co-operative",
        "Heart of England Co-operative",
        "Langdale Co-operative Society",
        "Lincolnshire Co-operative",
        "Midcounties Co-operative",
        "Scottish Midland Co-operative",
        "Tamworth Co-operative Society",
        "The Co-operative Food",
        "The Co-operative Food PFS",
        "The Co-operative Group",
        "The Radstock Co-operative Society",
        "The Southern Co-operative",
    ];

    if COOP_ALIASES.contains(&category) {
        "Co-op"
    } else {
        category
    }
}
|
||||||
|
|
||||||
pub fn resolve_poi_category_filter(category_values: &[String], categories: &str) -> FxHashSet<u16> {
|
pub fn resolve_poi_category_filter(category_values: &[String], categories: &str) -> FxHashSet<u16> {
|
||||||
let mut selected = FxHashSet::default();
|
let mut selected = FxHashSet::default();
|
||||||
for part in categories.split(',') {
|
for part in categories.split(',') {
|
||||||
let category = part.trim();
|
let category = canonical_poi_category(part.trim());
|
||||||
if category.is_empty() {
|
if category.is_empty() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
@ -200,12 +215,18 @@ impl POIData {
|
||||||
|
|
||||||
let id_raw: Vec<String> = extract_str_col(&df, "id")?;
|
let id_raw: Vec<String> = extract_str_col(&df, "id")?;
|
||||||
let name = extract_str_col(&df, "name")?;
|
let name = extract_str_col(&df, "name")?;
|
||||||
let category_raw = extract_str_col(&df, "category")?;
|
let category_raw: Vec<String> = extract_str_col(&df, "category")?
|
||||||
|
.into_iter()
|
||||||
|
.map(|category| canonical_poi_category(&category).to_string())
|
||||||
|
.collect();
|
||||||
let group_raw = extract_str_col(&df, "group")?;
|
let group_raw = extract_str_col(&df, "group")?;
|
||||||
let lat = extract_f32_col(&df, "lat")?;
|
let lat = extract_f32_col(&df, "lat")?;
|
||||||
let lng = extract_f32_col(&df, "lng")?;
|
let lng = extract_f32_col(&df, "lng")?;
|
||||||
let emoji_raw = extract_str_col(&df, "emoji")?;
|
let emoji_raw = extract_str_col(&df, "emoji")?;
|
||||||
let icon_category_raw = extract_str_col(&df, "icon_category")?;
|
let icon_category_raw: Vec<String> = extract_str_col(&df, "icon_category")?
|
||||||
|
.into_iter()
|
||||||
|
.map(|category| canonical_poi_category(&category).to_string())
|
||||||
|
.collect();
|
||||||
|
|
||||||
// Pack POI IDs into a contiguous buffer
|
// Pack POI IDs into a contiguous buffer
|
||||||
let total_id_bytes: usize = id_raw.iter().map(|s| s.len()).sum();
|
let total_id_bytes: usize = id_raw.iter().map(|s| s.len()).sum();
|
||||||
|
|
@ -351,4 +372,19 @@ mod tests {
|
||||||
|
|
||||||
assert!(selected.is_empty());
|
assert!(selected.is_empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Two different co-operative aliases in the filter string must both resolve to
/// the one "Co-op" category (index 0 in `values`), and unrelated names must pass
/// through canonicalisation unchanged.
#[test]
fn coop_category_aliases_resolve_to_single_category() {
    let values: Vec<String> = ["Co-op", "Tesco"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let requested = "Central England Co-operative,The Southern Co-operative";

    let selected = resolve_poi_category_filter(&values, requested);

    // Exactly one selected index, and it is the "Co-op" entry.
    assert_eq!(selected.len(), 1);
    assert!(selected.contains(&0));

    assert_eq!(canonical_poi_category("Tesco"), "Tesco");
    assert_eq!(canonical_poi_category("Lincolnshire Co-operative"), "Co-op");
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1014,22 +1014,6 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
|
||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
/// Feature names that describe an individual property (price, size, type, etc.) rather
|
|
||||||
/// than the surrounding area. Use this to skip filters that should not exclude live
|
|
||||||
/// listings on the map even though they hide aggregated property rows.
|
|
||||||
pub fn property_level_feature_names() -> Vec<&'static str> {
|
|
||||||
const PROPERTY_GROUPS: &[&str] = &["Properties", "Property prices"];
|
|
||||||
FEATURE_GROUPS
|
|
||||||
.iter()
|
|
||||||
.filter(|group| PROPERTY_GROUPS.contains(&group.name))
|
|
||||||
.flat_map(|group| group.features.iter())
|
|
||||||
.map(|feature| match feature {
|
|
||||||
Feature::Numeric(c) => c.name,
|
|
||||||
Feature::Enum(c) => c.name,
|
|
||||||
})
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Flat ordered list of all numeric feature names (follows group order).
|
/// Flat ordered list of all numeric feature names (follows group order).
|
||||||
pub fn all_numeric_feature_names() -> Vec<&'static str> {
|
pub fn all_numeric_feature_names() -> Vec<&'static str> {
|
||||||
FEATURE_GROUPS
|
FEATURE_GROUPS
|
||||||
|
|
|
||||||
|
|
@ -541,7 +541,7 @@ async fn main() -> anyhow::Result<()> {
|
||||||
bail!("Actual listings parquet not found: {}", path.display());
|
bail!("Actual listings parquet not found: {}", path.display());
|
||||||
}
|
}
|
||||||
info!("Loading actual listings from {}", path.display());
|
info!("Loading actual listings from {}", path.display());
|
||||||
let listings = data::ActualListingData::load(path)?;
|
let listings = data::ActualListingData::load(path, &property_data)?;
|
||||||
trim_allocator("actual listings load");
|
trim_allocator("actual listings load");
|
||||||
info!(rows = listings.lat.len(), "Actual listings loaded");
|
info!(rows = listings.lat.len(), "Actual listings loaded");
|
||||||
Some(Arc::new(listings))
|
Some(Arc::new(listings))
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,20 @@
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use axum::extract::{Query, State};
|
use axum::extract::{Query, State};
|
||||||
use axum::response::Json;
|
use axum::response::{IntoResponse, Json, Response};
|
||||||
|
use axum::Extension;
|
||||||
use rustc_hash::FxHashSet;
|
use rustc_hash::FxHashSet;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::api_error::ApiError;
|
use crate::api_error::ApiError;
|
||||||
|
use crate::auth::OptionalUser;
|
||||||
|
use crate::consts::NAN_U16;
|
||||||
use crate::data::ActualListing;
|
use crate::data::ActualListing;
|
||||||
use crate::features::property_level_feature_names;
|
use crate::licensing::{check_license_bounds, resolve_share_code};
|
||||||
use crate::parsing::{
|
use crate::parsing::{
|
||||||
parse_filters_with_poi, require_bounds, row_passes_filters, row_passes_poi_filters,
|
parse_filters_with_poi, require_bounds, row_passes_filters, row_passes_poi_filters,
|
||||||
|
ParsedEnumFilter, ParsedFilter,
|
||||||
};
|
};
|
||||||
use crate::state::{AppState, SharedState};
|
use crate::state::{AppState, SharedState};
|
||||||
|
|
||||||
|
|
@ -25,6 +29,8 @@ pub struct ActualListingsParams {
|
||||||
travel: Option<String>,
|
travel: Option<String>,
|
||||||
/// Number of results to skip. Defaults to 0.
|
/// Number of results to skip. Defaults to 0.
|
||||||
offset: Option<usize>,
|
offset: Option<usize>,
|
||||||
|
/// Share-link code; grants bbox-scoped access for unlicensed users.
|
||||||
|
share: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
|
|
@ -35,10 +41,24 @@ pub struct ActualListingsResponse {
|
||||||
pub truncated: bool,
|
pub truncated: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Feature names whose filters are evaluated per listing rather than per postcode.
///
/// `listing_level_filter_feature_idxs` resolves these names to feature indices, and
/// `split_numeric_filters` / `split_enum_filters` use that set to route a parsed
/// filter to the listing-level check (`row_passes_listing_filters`) instead of the
/// postcode-level pass. Names that are not present in the feature index are ignored.
const LISTING_LEVEL_FILTER_FEATURES: &[&str] = &[
|
||||||
|
"Property type",
|
||||||
|
"Leasehold/Freehold",
|
||||||
|
"Total floor area (sqm)",
|
||||||
|
"Number of bedrooms & living rooms",
|
||||||
|
"Estimated current price",
|
||||||
|
"Last known price",
|
||||||
|
"Est. price per sqm",
|
||||||
|
"Price per sqm",
|
||||||
|
];
|
||||||
|
|
||||||
|
/// Listing-level numeric features for which a listing with an unknown (`NAN_U16`)
/// value still passes a filter; for every other listing-level numeric feature an
/// unknown value fails the filter (see `row_passes_listing_filters`).
const KEEP_UNKNOWN_LISTING_FILTER_FEATURES: &[&str] = &["Total floor area (sqm)"];
|
||||||
|
|
||||||
pub async fn get_actual_listings(
|
pub async fn get_actual_listings(
|
||||||
State(shared): State<Arc<SharedState>>,
|
State(shared): State<Arc<SharedState>>,
|
||||||
|
Extension(user): Extension<OptionalUser>,
|
||||||
Query(params): Query<ActualListingsParams>,
|
Query(params): Query<ActualListingsParams>,
|
||||||
) -> Result<Json<ActualListingsResponse>, ApiError> {
|
) -> Result<Json<ActualListingsResponse>, Response> {
|
||||||
let state = shared.load_state();
|
let state = shared.load_state();
|
||||||
let offset = params.offset.unwrap_or(0);
|
let offset = params.offset.unwrap_or(0);
|
||||||
let Some(actual_listings) = state.actual_listings.clone() else {
|
let Some(actual_listings) = state.actual_listings.clone() else {
|
||||||
|
|
@ -49,11 +69,15 @@ pub async fn get_actual_listings(
|
||||||
truncated: false,
|
truncated: false,
|
||||||
}));
|
}));
|
||||||
};
|
};
|
||||||
let (south, west, north, east) = require_bounds(params.bounds).map_err(ApiError::from)?;
|
let (south, west, north, east) =
|
||||||
|
require_bounds(params.bounds).map_err(IntoResponse::into_response)?;
|
||||||
|
|
||||||
|
let share_bounds = resolve_share_code(&state, params.share.as_deref()).await;
|
||||||
|
check_license_bounds(&user.0, (south, west, north, east), share_bounds)?;
|
||||||
|
|
||||||
let quant = state.data.quant_ref();
|
let quant = state.data.quant_ref();
|
||||||
let poi_quant = state.data.poi_metrics.quant_ref();
|
let poi_quant = state.data.poi_metrics.quant_ref();
|
||||||
let (mut parsed_filters, mut parsed_enum_filters, parsed_poi_filters) = parse_filters_with_poi(
|
let (parsed_filters, parsed_enum_filters, parsed_poi_filters) = parse_filters_with_poi(
|
||||||
params.filters.as_deref(),
|
params.filters.as_deref(),
|
||||||
&state.feature_name_to_index,
|
&state.feature_name_to_index,
|
||||||
&state.data.enum_values,
|
&state.data.enum_values,
|
||||||
|
|
@ -61,40 +85,38 @@ pub async fn get_actual_listings(
|
||||||
&state.data.poi_metrics.name_to_index,
|
&state.data.poi_metrics.name_to_index,
|
||||||
&poi_quant,
|
&poi_quant,
|
||||||
)
|
)
|
||||||
.map_err(ApiError::BadRequest)?;
|
.map_err(|err| ApiError::BadRequest(err).into_response())?;
|
||||||
|
|
||||||
// Drop property-level filters (price, sqm, build year, beds, type, etc.) so they
|
let travel_entries = parse_optional_travel(params.travel.as_deref())
|
||||||
// don't hide live listings — those are individual-property concerns the user can
|
.map_err(|err| ApiError::BadRequest(err).into_response())?;
|
||||||
// judge from the pin itself. We only keep area/postcode-level filters here.
|
|
||||||
let property_level_idxs: FxHashSet<usize> = property_level_feature_names()
|
|
||||||
.into_iter()
|
|
||||||
.filter_map(|name| state.feature_name_to_index.get(name).copied())
|
|
||||||
.collect();
|
|
||||||
parsed_filters.retain(|f| !property_level_idxs.contains(&f.feat_idx));
|
|
||||||
parsed_enum_filters.retain(|f| !property_level_idxs.contains(&f.feat_idx));
|
|
||||||
|
|
||||||
let travel_entries =
|
let listing_level_feature_idxs = listing_level_filter_feature_idxs(&state);
|
||||||
parse_optional_travel(params.travel.as_deref()).map_err(ApiError::BadRequest)?;
|
let keep_unknown_listing_filter_idxs = keep_unknown_listing_filter_feature_idxs(&state);
|
||||||
|
let (listing_filters, postcode_filters) =
|
||||||
|
split_numeric_filters(parsed_filters, &listing_level_feature_idxs);
|
||||||
|
let (listing_enum_filters, postcode_enum_filters) =
|
||||||
|
split_enum_filters(parsed_enum_filters, &listing_level_feature_idxs);
|
||||||
|
|
||||||
let has_area_filters = !parsed_filters.is_empty()
|
let has_postcode_filters = !postcode_filters.is_empty()
|
||||||
|| !parsed_enum_filters.is_empty()
|
|| !postcode_enum_filters.is_empty()
|
||||||
|| !parsed_poi_filters.is_empty()
|
|| !parsed_poi_filters.is_empty()
|
||||||
|| !travel_entries.is_empty();
|
|| !travel_entries.is_empty();
|
||||||
|
let has_listing_filters = !listing_filters.is_empty() || !listing_enum_filters.is_empty();
|
||||||
|
|
||||||
let state_clone = state.clone();
|
let state_clone = state.clone();
|
||||||
let response =
|
let response =
|
||||||
tokio::task::spawn_blocking(move || -> Result<ActualListingsResponse, String> {
|
tokio::task::spawn_blocking(move || -> Result<ActualListingsResponse, String> {
|
||||||
let t0 = std::time::Instant::now();
|
let t0 = std::time::Instant::now();
|
||||||
|
|
||||||
let passing_postcodes = if has_area_filters {
|
let passing_postcodes = if has_postcode_filters {
|
||||||
Some(compute_passing_postcodes(
|
Some(compute_passing_postcodes(
|
||||||
&state_clone,
|
&state_clone,
|
||||||
south,
|
south,
|
||||||
west,
|
west,
|
||||||
north,
|
north,
|
||||||
east,
|
east,
|
||||||
&parsed_filters,
|
&postcode_filters,
|
||||||
&parsed_enum_filters,
|
&postcode_enum_filters,
|
||||||
&parsed_poi_filters,
|
&parsed_poi_filters,
|
||||||
&travel_entries,
|
&travel_entries,
|
||||||
)?)
|
)?)
|
||||||
|
|
@ -116,6 +138,18 @@ pub async fn get_actual_listings(
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if has_listing_filters
|
||||||
|
&& !row_passes_listing_filters(
|
||||||
|
row,
|
||||||
|
&listing_filters,
|
||||||
|
&listing_enum_filters,
|
||||||
|
&actual_listings.filter_feature_data,
|
||||||
|
state_clone.data.num_features,
|
||||||
|
&keep_unknown_listing_filter_idxs,
|
||||||
|
)
|
||||||
|
{
|
||||||
|
return None;
|
||||||
|
}
|
||||||
Some(row)
|
Some(row)
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
@ -142,7 +176,8 @@ pub async fn get_actual_listings(
|
||||||
total = total_matching,
|
total = total_matching,
|
||||||
total_in_bounds,
|
total_in_bounds,
|
||||||
offset,
|
offset,
|
||||||
filtered = passing_postcodes.is_some(),
|
postcode_filtered = passing_postcodes.is_some(),
|
||||||
|
listing_filtered = has_listing_filters,
|
||||||
ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0),
|
ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0),
|
||||||
"GET /api/actual-listings"
|
"GET /api/actual-listings"
|
||||||
);
|
);
|
||||||
|
|
@ -155,12 +190,82 @@ pub async fn get_actual_listings(
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
.map_err(|error| ApiError::Internal(error.to_string()))?
|
.map_err(|error| ApiError::Internal(error.to_string()).into_response())?
|
||||||
.map_err(ApiError::Internal)?;
|
.map_err(|err| ApiError::Internal(err).into_response())?;
|
||||||
|
|
||||||
Ok(Json(response))
|
Ok(Json(response))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn listing_level_filter_feature_idxs(state: &AppState) -> FxHashSet<usize> {
|
||||||
|
feature_idxs(state, LISTING_LEVEL_FILTER_FEATURES)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn keep_unknown_listing_filter_feature_idxs(state: &AppState) -> FxHashSet<usize> {
|
||||||
|
feature_idxs(state, KEEP_UNKNOWN_LISTING_FILTER_FEATURES)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn feature_idxs(state: &AppState, names: &[&str]) -> FxHashSet<usize> {
|
||||||
|
names
|
||||||
|
.iter()
|
||||||
|
.filter_map(|name| state.feature_name_to_index.get(*name).copied())
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn split_numeric_filters(
|
||||||
|
filters: Vec<ParsedFilter>,
|
||||||
|
listing_level_feature_idxs: &FxHashSet<usize>,
|
||||||
|
) -> (Vec<ParsedFilter>, Vec<ParsedFilter>) {
|
||||||
|
let mut listing_filters = Vec::new();
|
||||||
|
let mut postcode_filters = Vec::new();
|
||||||
|
for filter in filters {
|
||||||
|
if listing_level_feature_idxs.contains(&filter.feat_idx) {
|
||||||
|
listing_filters.push(filter);
|
||||||
|
} else {
|
||||||
|
postcode_filters.push(filter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(listing_filters, postcode_filters)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn split_enum_filters(
|
||||||
|
filters: Vec<ParsedEnumFilter>,
|
||||||
|
listing_level_feature_idxs: &FxHashSet<usize>,
|
||||||
|
) -> (Vec<ParsedEnumFilter>, Vec<ParsedEnumFilter>) {
|
||||||
|
let mut listing_filters = Vec::new();
|
||||||
|
let mut postcode_filters = Vec::new();
|
||||||
|
for filter in filters {
|
||||||
|
if listing_level_feature_idxs.contains(&filter.feat_idx) {
|
||||||
|
listing_filters.push(filter);
|
||||||
|
} else {
|
||||||
|
postcode_filters.push(filter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(listing_filters, postcode_filters)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn row_passes_listing_filters(
|
||||||
|
row: usize,
|
||||||
|
filters: &[ParsedFilter],
|
||||||
|
enum_filters: &[ParsedEnumFilter],
|
||||||
|
feature_data: &[u16],
|
||||||
|
num_features: usize,
|
||||||
|
keep_unknown_filter_idxs: &FxHashSet<usize>,
|
||||||
|
) -> bool {
|
||||||
|
let base = row * num_features;
|
||||||
|
|
||||||
|
filters.iter().all(|filter| {
|
||||||
|
let raw = feature_data[base + filter.feat_idx];
|
||||||
|
if raw == NAN_U16 {
|
||||||
|
keep_unknown_filter_idxs.contains(&filter.feat_idx)
|
||||||
|
} else {
|
||||||
|
raw >= filter.min_u16 && raw <= filter.max_u16
|
||||||
|
}
|
||||||
|
}) && enum_filters.iter().all(|filter| {
|
||||||
|
let raw = feature_data[base + filter.feat_idx];
|
||||||
|
raw != NAN_U16 && filter.allowed.contains(&raw)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
fn compute_passing_postcodes(
|
fn compute_passing_postcodes(
|
||||||
state: &AppState,
|
state: &AppState,
|
||||||
|
|
@ -224,3 +329,111 @@ fn compute_passing_postcodes(
|
||||||
|
|
||||||
Ok(passing)
|
Ok(passing)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Numeric filter on `feat_idx` accepting quantized values in `min_u16..=max_u16`.
    fn range_filter(feat_idx: usize, min_u16: u16, max_u16: u16) -> ParsedFilter {
        ParsedFilter {
            feat_idx,
            min_u16,
            max_u16,
        }
    }

    /// Enum filter on `feat_idx` that allows only code `0`.
    fn zero_only_enum_filter(feat_idx: usize) -> ParsedEnumFilter {
        ParsedEnumFilter {
            feat_idx,
            allowed: std::iter::once(0u16).collect(),
        }
    }

    /// Feature indices of numeric filters, in order.
    fn numeric_idxs(filters: &[ParsedFilter]) -> Vec<usize> {
        filters.iter().map(|filter| filter.feat_idx).collect()
    }

    /// Feature indices of enum filters, in order.
    fn enum_idxs(filters: &[ParsedEnumFilter]) -> Vec<usize> {
        filters.iter().map(|filter| filter.feat_idx).collect()
    }

    #[test]
    fn splits_actual_listing_filters_by_listing_native_features() {
        // Features 1 and 3 are listing-native; everything else is postcode-level.
        let listing_level_feature_idxs: FxHashSet<usize> = [1usize, 3].into_iter().collect();

        let numeric_input = vec![
            range_filter(0, 0, 100),
            range_filter(1, 0, 100),
            range_filter(3, 0, 100),
        ];
        let (listing_filters, postcode_filters) =
            split_numeric_filters(numeric_input, &listing_level_feature_idxs);
        assert_eq!(numeric_idxs(&listing_filters), vec![1, 3]);
        assert_eq!(numeric_idxs(&postcode_filters), vec![0]);

        let enum_input = vec![zero_only_enum_filter(2), zero_only_enum_filter(3)];
        let (listing_enum_filters, postcode_enum_filters) =
            split_enum_filters(enum_input, &listing_level_feature_idxs);
        assert_eq!(enum_idxs(&listing_enum_filters), vec![3]);
        assert_eq!(enum_idxs(&postcode_enum_filters), vec![2]);
    }

    #[test]
    fn listing_floor_area_filter_keeps_unknown_values() {
        // Single-feature matrix: feature 0 plays the role of floor area and is
        // flagged as "keep when unknown".
        let keep_unknown_filter_idxs: FxHashSet<usize> = std::iter::once(0usize).collect();
        let passes = |cell: u16| {
            row_passes_listing_filters(
                0,
                &[range_filter(0, 10, 20)],
                &[],
                &[cell],
                1,
                &keep_unknown_filter_idxs,
            )
        };

        // Unknown value is kept.
        assert!(passes(NAN_U16));
        // Known value below the range is rejected.
        assert!(!passes(9));
        // Known value inside the range is accepted.
        assert!(passes(15));
    }
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue