Compare commits

...

3 commits

Author SHA1 Message Date
d93beb9201 Small fixes
Some checks failed
CI / Python (lint + test) (push) Failing after 1m42s
CI / Frontend (lint + typecheck) (push) Failing after 1m45s
CI / Rust (lint + test) (push) Successful in 4m45s
Build and publish Docker image / build-and-push (push) Failing after 6m21s
2026-03-26 07:55:13 +00:00
d56b5dedff Bump memory 2026-03-26 07:54:43 +00:00
3adbaf435d Fix scrape 2026-03-26 07:54:39 +00:00
10 changed files with 228 additions and 71 deletions

View file

@ -18,6 +18,17 @@ log = logging.getLogger("rightmove")
# Outcode ID cache (Rightmove typeahead → internal ID)
outcode_cache: dict[str, str] = {}
# Rightmove hard-caps pagination at index 1008 (42 pages × 24 results).
# Requesting index >= 1008 returns HTTP 400.
_MAX_INDEX = 1008
# Property type filters for splitting overcapped searches. Each sub-query
# gets its own 1008 cap, so we can recover listings beyond the unfiltered limit.
_PROPERTY_TYPES = [
"detached", "semi-detached", "terraced", "flat",
"bungalow", "park-home", "land",
]
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
@ -40,16 +51,18 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
return None
def search_outcode(
def _paginate(
client: httpx.Client,
outcode_id: str,
outcode: str,
channel_cfg: dict,
pc_index: PostcodeSpatialIndex,
) -> list[dict]:
"""Paginate through search results for one outcode+channel. Returns transformed properties."""
extra_params: dict | None = None,
) -> tuple[list[dict], int]:
"""Paginate through search results. Returns (properties, result_count)."""
properties = []
index = 0
result_count = 0
while True:
params = {
@ -60,6 +73,8 @@ def search_outcode(
"channel": channel_cfg["channel"],
"transactionType": channel_cfg["transactionType"],
}
if extra_params:
params.update(extra_params)
data = fetch_with_retry(client, SEARCH_URL, params)
if not data:
@ -90,4 +105,52 @@ def search_outcode(
time.sleep(DELAY_BETWEEN_PAGES)
return properties
return properties, result_count
def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Paginate through search results for one outcode+channel. Returns transformed properties.

    When the unfiltered result count exceeds 1008 (Rightmove's hard pagination cap),
    re-queries per property type to recover listings beyond the cap.

    Args:
        client: HTTP client forwarded to ``_paginate``.
        outcode_id: Rightmove's internal ID for the outcode.
        outcode: The outcode itself; used here only for log messages.
        channel_cfg: Channel configuration; ``channel_cfg["channel"]`` is
            read for log messages, the rest is consumed by ``_paginate``.
        pc_index: Postcode spatial index forwarded to ``_paginate``.

    Returns:
        The properties from the unfiltered search when its result count fits
        under the cap; otherwise the union of the unfiltered results and all
        per-property-type results, de-duplicated on each property's ``"id"``.
    """
    properties, result_count = _paginate(
        client, outcode_id, outcode, channel_cfg, pc_index
    )

    # Everything was reachable through plain pagination: nothing to recover.
    if result_count <= _MAX_INDEX:
        return properties

    # Hit the 1008 cap — re-search per property type to get full coverage
    ch = channel_cfg["channel"]
    log.info(
        "%s/%s: %d results exceed %d cap, splitting by property type",
        outcode, ch, result_count, _MAX_INDEX,
    )

    # Seed with the unfiltered results so listings the type filters do not
    # return are still kept; keying on "id" de-duplicates across sub-queries.
    all_by_id: dict[str, dict] = {p["id"]: p for p in properties}

    for pt in _PROPERTY_TYPES:
        pt_props, _ = _paginate(
            client, outcode_id, outcode, channel_cfg, pc_index,
            extra_params={"propertyTypes": pt},
        )
        new = 0
        for p in pt_props:
            if p["id"] not in all_by_id:
                all_by_id[p["id"]] = p
                new += 1
        if new:
            log.debug("%s/%s type=%s: +%d new properties", outcode, ch, pt, new)

    # Before/after counts need a separator, otherwise the two numbers are
    # printed fused together (e.g. "10081500").
    log.info(
        "%s/%s: type split recovered %d -> %d properties",
        outcode, ch, len(properties), len(all_by_id),
    )
    return list(all_by_id.values())

View file

@ -39,7 +39,7 @@ class TurnstileError(Exception):
# Maximum search result pages to scrape per outcode (25 listings/page)
MAX_PAGES_PER_OUTCODE = 10
MAX_PAGES_PER_OUTCODE = 40
# JavaScript to extract listings from the rendered DOM.
# Uses data-testid attributes as primary selectors (stable across deployments),
@ -98,6 +98,12 @@ _EXTRACT_LISTINGS_JS = r"""() => {
if (/leasehold/i.test(text)) tenure = 'Leasehold';
else if (/freehold/i.test(text)) tenure = 'Freehold';
// Extract property type (e.g., "2 bed flat for sale" → "flat")
let property_type = '';
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
if (ptMatch) property_type = ptMatch[1].trim();
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
results.push({
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
@ -106,7 +112,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
address, tenure,
address, tenure, property_type,
});
}
@ -160,6 +166,12 @@ _EXTRACT_LISTINGS_JS = r"""() => {
if (/leasehold/i.test(text)) tenure = 'Leasehold';
else if (/freehold/i.test(text)) tenure = 'Freehold';
// Extract property type
let property_type = '';
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
if (ptMatch2) property_type = ptMatch2[1].trim();
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
results.push({
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
@ -168,7 +180,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
address, tenure,
address, tenure, property_type,
});
}
}
@ -557,6 +569,32 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
# ---------------------------------------------------------------------------
# Cached outcode → (postcode, lat, lng) lookups to avoid repeated O(n) scans
# over 2.26M postcodes. Populated lazily on first lookup per outcode.
_outcode_coords_cache: dict[str, tuple[str, float, float] | None] = {}
def _resolve_outcode_coords(
    outcode: str, pc_coords: dict[str, tuple[float, float]]
) -> tuple[str, float, float] | None:
    """Return ``(postcode, lat, lng)`` for the first postcode in an outcode.

    The outcome (including a miss, stored as ``None``) is memoised in
    ``_outcode_coords_cache`` keyed by ``outcode`` alone, so ``pc_coords`` is
    scanned at most once per outcode.

    A postcode key matches when it starts with the outcode followed by a
    space, or, for outcodes of four or more characters, when it starts with
    the outcode and is longer than it (no space before the incode).
    """
    try:
        return _outcode_coords_cache[outcode]
    except KeyError:
        pass  # first lookup for this outcode: fall through to the scan

    spaced = outcode + " "
    unspaced_ok = len(outcode) >= 4
    min_len = len(outcode)

    def _in_outcode(pcd: str) -> bool:
        if pcd.startswith(spaced):
            return True
        return unspaced_ok and pcd.startswith(outcode) and len(pcd) > min_len

    hit = next(
        (
            (pcd, lat, lng)
            for pcd, (lat, lng) in pc_coords.items()
            if _in_outcode(pcd)
        ),
        None,
    )
    _outcode_coords_cache[outcode] = hit
    return hit
def _extract_postcode(text: str) -> str | None:
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'."""
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
@ -585,11 +623,17 @@ def _map_property_type(raw_type: str | None) -> str:
"""Map Zoopla property type text to canonical type."""
if not raw_type:
return "Other"
# Exact match (handles Rightmove-style capitalised values)
canonical = PROPERTY_TYPE_MAP.get(raw_type)
if canonical:
return canonical
# Title-case match (handles regex-extracted lowercase like "town house" → "Town House")
canonical = PROPERTY_TYPE_MAP.get(raw_type.title())
if canonical:
return canonical
# Keyword fallback
lower = raw_type.lower()
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
return "Flats/Maisonettes"
if "detached" in lower and "semi" not in lower:
return "Detached"
@ -622,6 +666,7 @@ def transform_property(
channel: str,
pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]],
search_outcode: str | None = None,
) -> dict | None:
"""Transform a raw Zoopla listing dict into the standard output schema.
@ -643,22 +688,18 @@ def transform_property(
lat, lng = coords
if lat is None:
# Try outcode-level fallback
outcode = _extract_outcode(address)
if outcode:
# ONSPD 7-char format: 4-char outcodes have no space before incode
# (e.g., "BH191AB"), while shorter outcodes do (e.g., "E14 5AB").
# Check both formats to handle all outcode lengths.
prefix = outcode + " "
for pcd, coords in pc_coords.items():
if pcd.startswith(prefix) or (
len(outcode) >= 4
and pcd.startswith(outcode)
and len(pcd) > len(outcode)
):
postcode = pcd
lat, lng = coords
break
# Try outcode-level fallback from address text
addr_outcode = _extract_outcode(address)
if addr_outcode:
result = _resolve_outcode_coords(addr_outcode, pc_coords)
if result:
postcode, lat, lng = result
# Final fallback: use the outcode we know we're searching
if lat is None and search_outcode:
result = _resolve_outcode_coords(search_outcode, pc_coords)
if result:
postcode, lat, lng = result
if lat is None or lng is None or not postcode:
return None
@ -706,8 +747,8 @@ def transform_property(
"Postcode": postcode,
"Address per Property Register": address,
"Leasehold/Freehold": raw.get("tenure") or None,
"Property type": "Other", # Not reliably extractable from Zoopla search cards
"Property sub-type": "",
"Property type": _map_property_type(raw.get("property_type")),
"Property sub-type": raw.get("property_type") or "",
"price": int(price),
"price_frequency": frequency,
"Price qualifier": "",
@ -774,7 +815,7 @@ def search_outcode(
properties = []
dropped = 0
for raw in raw_listings:
transformed = transform_property(raw, channel, pc_index, pc_coords)
transformed = transform_property(raw, channel, pc_index, pc_coords, search_outcode=outcode)
if transformed:
properties.append(transformed)
zoopla_properties_scraped.labels(channel=channel_label).inc()

View file

@ -1,4 +1,4 @@
import { useState, useEffect, useCallback, useMemo } from 'react';
import { useState, useEffect, useCallback, useMemo, useRef } from 'react';
import MapPage, { type ExportState } from './components/map/MapPage';
import PricingPage from './components/pricing/PricingPage';
import HomePage from './components/home/HomePage';
@ -67,9 +67,14 @@ function pathToPage(pathname: string): { page: Page; inviteCode?: string } | nul
export default function App() {
const urlState = useMemo(() => parseUrlState(), []);
const [mapUrlState, setMapUrlState] = useState(urlState);
const dashboardSearchRef = useRef(
window.location.pathname === '/dashboard' ? window.location.search : ''
);
const activePageRef = useRef<Page>('home');
const initialViewState = useMemo(
() => urlState.viewState || INITIAL_VIEW_STATE,
[urlState.viewState]
() => mapUrlState.viewState || INITIAL_VIEW_STATE,
[mapUrlState.viewState]
);
const isScreenshotMode = useMemo(() => {
@ -179,17 +184,30 @@ export default function App() {
const navigateTo = useCallback(
(page: Page, hash?: string, infoFeature?: string) => {
// Save dashboard search params before navigating away
if (activePageRef.current === 'dashboard') {
dashboardSearchRef.current = window.location.search;
}
if (infoFeature) {
window.history.replaceState({ ...window.history.state, infoFeature }, '');
}
const path = pageToPath(page, inviteCode ?? undefined);
const url = hash ? `${path}#${hash}` : path;
// Restore dashboard search params when navigating back
const search = page === 'dashboard' ? dashboardSearchRef.current : '';
const url = hash ? `${path}${search}#${hash}` : `${path}${search}`;
window.history.pushState({ page }, '', url);
if (page === 'dashboard') {
setMapUrlState(parseUrlState());
}
setActivePage(page);
},
[inviteCode]
);
useEffect(() => {
activePageRef.current = activePage;
}, [activePage]);
useEffect(() => {
if (!window.history.state?.page) {
window.history.replaceState(
@ -199,17 +217,24 @@ export default function App() {
);
}
const handlePopState = (e: PopStateEvent) => {
let page: Page;
if (e.state?.page) {
setActivePage(e.state.page);
page = e.state.page;
setActivePage(page);
if (e.state.infoFeature) {
setPendingInfoFeature(e.state.infoFeature);
}
} else {
// Fall back to deriving page from pathname
const parsed = pathToPage(window.location.pathname);
setActivePage(parsed?.page || 'home');
page = parsed?.page || 'home';
setActivePage(page);
if (parsed?.inviteCode) setInviteCode(parsed.inviteCode);
}
// Re-parse URL state when returning to dashboard via back/forward
if (page === 'dashboard') {
setMapUrlState(parseUrlState());
}
};
window.addEventListener('popstate', handlePopState);
return () => window.removeEventListener('popstate', handlePopState);
@ -367,10 +392,10 @@ export default function App() {
<MapPage
features={features}
poiCategoryGroups={poiCategoryGroups}
initialFilters={urlState.filters || { 'Listing status': ['Historical sale'] }}
initialFilters={mapUrlState.filters || { 'Listing status': ['Historical sale'] }}
initialViewState={initialViewState}
initialPOICategories={urlState.poiCategories || new Set()}
initialTab={urlState.tab || 'area'}
initialPOICategories={mapUrlState.poiCategories || new Set()}
initialTab={mapUrlState.tab || 'area'}
initialLoading={initialLoading}
theme={theme}
pendingInfoFeature={pendingInfoFeature}
@ -378,8 +403,8 @@ export default function App() {
onNavigateTo={navigateTo}
onExportStateChange={setExportState}
isMobile={isMobile}
initialTravelTime={urlState.travelTime}
initialPostcode={urlState.postcode}
initialTravelTime={mapUrlState.travelTime}
initialPostcode={mapUrlState.postcode}
user={user}
onLoginClick={() => {
setAuthModalTab('login');

View file

@ -567,6 +567,7 @@ export default function MapPage({
selectedCategories={selectedPOICategories}
onCategoriesChange={setSelectedPOICategories}
poiCount={pois.length}
onClose={() => setPoiPaneOpen(false)}
/>
);

View file

@ -6,7 +6,7 @@ import InfoPopup from '../ui/InfoPopup';
import { SearchInput } from '../ui/SearchInput';
import { PillToggle } from '../ui/PillToggle';
import { PillGroup } from '../ui/PillGroup';
import { InfoIcon, ChevronIcon } from '../ui/icons';
import { InfoIcon, ChevronIcon, CloseIcon } from '../ui/icons';
import { IconButton } from '../ui/IconButton';
interface POIPaneProps {
@ -15,6 +15,7 @@ interface POIPaneProps {
onCategoriesChange: (categories: Set<string>) => void;
poiCount: number;
onNavigateToSource?: (slug: string) => void;
onClose?: () => void;
}
export default function POIPane({
@ -23,6 +24,7 @@ export default function POIPane({
onCategoriesChange,
poiCount: _poiCount,
onNavigateToSource,
onClose,
}: POIPaneProps) {
const [searchTerm, setSearchTerm] = useState('');
const [isGroupExpanded, toggleCollapse] = useCollapsibleGroups();
@ -96,7 +98,7 @@ export default function POIPane({
<IconButton onClick={() => setShowInfo(true)} title="Data source info">
<InfoIcon />
</IconButton>
<div className="flex gap-1 ml-auto">
<div className="flex gap-1 ml-auto items-center">
<button
onClick={selectAll}
className="px-2 py-0.5 text-xs rounded border border-warm-300 dark:border-warm-700 text-warm-600 dark:text-warm-400 hover:bg-warm-50 dark:hover:bg-warm-700"
@ -109,6 +111,15 @@ export default function POIPane({
>
None
</button>
{onClose && (
<button
onClick={onClose}
className="ml-1 p-0.5 text-warm-400 hover:text-warm-700 dark:hover:text-warm-300"
title="Close"
>
<CloseIcon className="w-4 h-4" />
</button>
)}
</div>
</div>

View file

@ -316,10 +316,12 @@ export function useDeckLayers({
number,
];
}
const ttMin = (d[`min_${vf}`] as number) ?? ttVal;
const ttMax = (d[`max_${vf}`] as number) ?? ttVal;
return getFeatureFillColor(
ttVal as number,
ttVal as number,
ttVal as number,
ttMin as number,
ttMax as number,
clr,
fr,
0,
@ -417,10 +419,12 @@ export function useDeckLayers({
number,
];
}
const ttMin = (d[`min_${vf}`] as number) ?? ttVal;
const ttMax = (d[`max_${vf}`] as number) ?? ttVal;
return getFeatureFillColor(
ttVal as number,
ttVal as number,
ttVal as number,
ttMin as number,
ttMax as number,
clr,
fr,
0,

View file

@ -7,22 +7,23 @@ import subprocess
import sys
import tarfile
import urllib.request
from datetime import datetime, timedelta
from datetime import UTC, datetime, timedelta
from io import BytesIO
from pathlib import Path
PROTOMAPS_BASE = "https://build.protomaps.com"
UK_BBOX = "-10.5,49,5,61"
MAX_AGE_DAYS = 14
USER_AGENT = "property-map-tiles/1.0"
def find_latest_build() -> str:
"""Find the most recent available Protomaps daily build."""
today = datetime.utcnow().date()
today = datetime.now(UTC).date()
for i in range(MAX_AGE_DAYS):
d = today - timedelta(days=i)
url = f"{PROTOMAPS_BASE}/{d:%Y%m%d}.pmtiles"
req = urllib.request.Request(url, method="HEAD")
req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": USER_AGENT})
try:
urllib.request.urlopen(req)
print(f"Found build: {d:%Y%m%d}")

View file

@ -22,8 +22,8 @@ set -euo pipefail
# --demo only compute Bank + TCR, transit only (quick test)
# --- Defaults ---
THREADS=16
HEAP=16g
THREADS=12
HEAP=24g
NETWORK_DIR=property-data/r5-network
OUTPUT_BASE=property-data/travel-times
R5_DIR=r5-java

View file

@ -175,8 +175,7 @@ fn execute_destination_search(state: &AppState, query: &str, mode: &str) -> Valu
.find_map(|(idx, name_lower)| {
let words_match = query_words.iter().all(|word| name_lower.contains(word));
let slug = slugify(&pd.name[idx]);
let slug_match =
slug.contains(&query_slug) || query_slug.contains(&slug);
let slug_match = slug.contains(&query_slug) || query_slug.contains(&slug);
if (words_match || slug_match) && pd.type_rank[idx] == 0 {
Some(pd.name[idx].as_str())
} else {
@ -704,7 +703,7 @@ fn count_matching_rows(
let (pc_interner, pc_keys) = state.data.postcode_parts();
let mut count = 0usize;
for row in 0..num_rows {
for (row, pc_key) in pc_keys.iter().enumerate().take(num_rows) {
if !row_passes_filters(
row,
&parsed_filters,
@ -716,12 +715,11 @@ fn count_matching_rows(
}
if has_travel {
let postcode = pc_interner.resolve(&pc_keys[row]);
let postcode = pc_interner.resolve(pc_key);
let mut passes_travel = true;
for (data, fmin, fmax) in &travel_data {
let pass = if let Some(mins) = data.get(postcode).map(|r| r.minutes as f32) {
fmin.map_or(true, |min| mins >= min)
&& fmax.map_or(true, |max| mins <= max)
fmin.is_none_or(|min| mins >= min) && fmax.is_none_or(|max| mins <= max)
} else {
false // no travel data → postcode not reachable
};
@ -880,7 +878,12 @@ pub async fn post_ai_filters(
let fn_args = fc.get("args").cloned().unwrap_or(json!({}));
tool_call_count += 1;
info!(function = fn_name, round = round, tool_call = tool_call_count, "AI called tool");
info!(
function = fn_name,
round = round,
tool_call = tool_call_count,
"AI called tool"
);
if tool_call_count > MAX_TOOL_CALLS {
warn!("Tool call budget exhausted, forcing text output");
@ -929,9 +932,15 @@ pub async fn post_ai_filters(
if text.is_empty() {
retry_count += 1;
warn!("Gemini returned empty text content (round {}, retry {})", round, retry_count);
warn!(
"Gemini returned empty text content (round {}, retry {})",
round, retry_count
);
if retry_count > MAX_RETRIES {
return Err((StatusCode::BAD_GATEWAY, "AI returned empty responses".into()));
return Err((
StatusCode::BAD_GATEWAY,
"AI returned empty responses".into(),
));
}
contents.push(candidate.clone());
contents.push(json!({
@ -988,7 +997,11 @@ pub async fn post_ai_filters(
// Count matching properties and refine if too restrictive
let match_count = count_matching_rows(&state, &filters, &travel_time_filters);
info!(match_count = match_count, round = round, "AI filter match count");
info!(
match_count = match_count,
round = round,
"AI filter match count"
);
if match_count == 0 {
refinement_attempts += 1;
@ -1008,7 +1021,10 @@ pub async fn post_ai_filters(
let notes = if notes.is_empty() {
"No properties match these filters. Try relaxing some constraints.".to_string()
} else {
format!("{}. No properties match — try relaxing some constraints.", notes)
format!(
"{}. No properties match — try relaxing some constraints.",
notes
)
};
return Ok(Json(AiFiltersResponse {
@ -1193,8 +1209,7 @@ fn validate_and_convert(raw: &Value, features: &FeaturesResponse, listing_type:
} => {
// Only include features valid for the chosen listing mode
if modes.is_empty() || modes.contains(&listing_type) {
numeric_features
.insert(name, (*min, *max, histogram.min, histogram.max));
numeric_features.insert(name, (*min, *max, histogram.min, histogram.max));
}
}
FeatureInfo::Enum { name, values, .. } => {
@ -1217,11 +1232,10 @@ fn validate_and_convert(raw: &Value, features: &FeaturesResponse, listing_type:
Some(name) => name,
None => continue,
};
let (slider_min, slider_max, data_min, data_max) =
match numeric_features.get(name) {
Some(range) => *range,
None => continue,
};
let (slider_min, slider_max, data_min, data_max) = match numeric_features.get(name) {
Some(range) => *range,
None => continue,
};
let bound = match item.get("bound").and_then(|val| val.as_str()) {
Some(b) => b,
None => continue,

View file

@ -140,10 +140,7 @@ pub async fn get_short_url(
match params {
Some(params) => {
let redirect_url = format!("/dashboard?{params}");
let og_image_url = format!(
"{}/api/screenshot?og=1&{params}",
state.public_url
);
let og_image_url = format!("{}/api/screenshot?og=1&{params}", state.public_url);
let og_url = format!("{}/s/{code}", state.public_url);
let og_title = "Perfect Postcode \u{2014} Every neighbourhood in England";
let og_description = "Explore property prices, energy ratings, crime stats, school ratings, and more across England on one interactive map.";