fmt
Some checks failed
CI / Check (push) Failing after 6m52s
Build and publish Docker image / build-and-push (push) Failing after 16m5s

This commit is contained in:
Andras Schmelczer 2026-05-17 19:48:55 +01:00
parent 2f149503bb
commit 6ea544a0f6
10 changed files with 144 additions and 60 deletions

View file

@ -22,13 +22,22 @@ jobs:
- name: Set up Docker Buildx - name: Set up Docker Buildx
uses: https://github.com/docker/setup-buildx-action@v3 uses: https://github.com/docker/setup-buildx-action@v3
with:
driver-opts: |
network=host
- name: Resolve registry vars - name: Resolve registry vars
id: registry id: registry
env:
CONTAINER_REGISTRY_HOST: ${{ vars.CONTAINER_REGISTRY_HOST }}
run: | run: |
host="${{ gitea.server_url }}" host="${CONTAINER_REGISTRY_HOST:-${{ gitea.server_url }}}"
host="${host#https://}" host="${host#https://}"
host="${host#http://}" host="${host#http://}"
host="${host%/}"
if [ "$host" = "forgejo:3000" ]; then
host="127.0.0.1:13000"
fi
repo=$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]') repo=$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]')
{ {
echo "host=${host}" echo "host=${host}"

View file

@ -33,6 +33,10 @@ class CookiesExpiredError(Exception):
"""Raised when home.co.uk returns 403, indicating cookies need refresh.""" """Raised when home.co.uk returns 403, indicating cookies need refresh."""
class PaginationError(Exception):
    """Raised when home.co.uk pagination cannot be completed.

    search_outcode raises this when a page fails to load, when a page comes
    back with no properties before the advertised end of the results, or when
    a page only repeats results that were already seen.
    """
# Channel mapping: internal name → URL path segment # Channel mapping: internal name → URL path segment
HOMECOUK_URL_SEGMENT = "for-sale" HOMECOUK_URL_SEGMENT = "for-sale"
@ -171,6 +175,25 @@ def fetch_page(
return None return None
def _coerce_positive_int(value) -> int | None:
    """Parse ``value`` with ``parse_int_value`` and keep it only if positive.

    Returns the parsed number when it is greater than zero; returns ``None``
    when parsing yields ``None`` or a value that is zero or negative.
    """
    number = parse_int_value(value)
    # Zero and negative values are treated the same as "not available".
    return None if number is None or number <= 0 else number
def _property_identity(prop: dict, page: int, index: int) -> str:
for key in ("listing_id", "property_id", "id"):
value = prop.get(key)
if value:
return f"{key}:{value}"
return (
f"page:{page}:index:{index}:"
f"{prop.get('display_address') or prop.get('address') or ''}:"
f"{prop.get('price') or prop.get('latest_price') or ''}"
)
def parse_floor_area(description: str | None) -> float | None: def parse_floor_area(description: str | None) -> float | None:
"""Try to extract floor area from description text like '789 sq.ft.' or '73 sq.m.'.""" """Try to extract floor area from description text like '789 sq.ft.' or '73 sq.m.'."""
if not description: if not description:
@ -363,6 +386,9 @@ def search_outcode(
url = f"{HOMECOUK_API_BASE}/{url_segment}/{outcode.lower()}/" url = f"{HOMECOUK_API_BASE}/{url_segment}/{outcode.lower()}/"
properties = [] properties = []
page = 1 page = 1
last_page: int | None = None
total_results: int | None = None
seen_ids: set[str] = set()
while True: while True:
params = { params = {
@ -379,12 +405,32 @@ def search_outcode(
data = fetch_page(client, url, params) data = fetch_page(client, url, params)
if not data: if not data:
break raise PaginationError(f"home.co.uk {outcode} page {page} failed to load")
pagination = data.get("pagination", {}) or {}
if last_page is None:
last_page = _coerce_positive_int(pagination.get("last_page"))
if total_results is None:
total_results = _coerce_positive_int(pagination.get("total"))
raw_props = data.get("properties", []) raw_props = data.get("properties", [])
if not raw_props: if not raw_props:
if total_results and page <= (last_page or page):
raise PaginationError(
f"home.co.uk {outcode} page {page} returned no properties "
f"before the advertised end"
)
break break
page_ids = {
_property_identity(prop, page, idx) for idx, prop in enumerate(raw_props)
}
if page_ids and page_ids.issubset(seen_ids):
raise PaginationError(
f"home.co.uk {outcode} page {page} repeated previously seen results"
)
seen_ids.update(page_ids)
for prop in raw_props: for prop in raw_props:
try: try:
transformed = transform_property(prop, pc_index) transformed = transform_property(prop, pc_index)
@ -401,10 +447,12 @@ def search_outcode(
if max_properties is not None and len(properties) >= max_properties: if max_properties is not None and len(properties) >= max_properties:
return properties return properties
# Check pagination if last_page is not None:
pagination = data.get("pagination", {}) if page >= last_page:
last_page = pagination.get("last_page", 1) break
if page >= last_page: elif total_results is not None and len(seen_ids) >= total_results:
break
elif len(raw_props) < HOMECOUK_PER_PAGE:
break break
page += 1 page += 1

View file

@ -21,6 +21,7 @@ Architecture:
import logging import logging
import re import re
import time import time
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from spatial import PostcodeSpatialIndex from spatial import PostcodeSpatialIndex
@ -52,9 +53,6 @@ class _ManagedCamoufoxBrowser:
return getattr(self._browser, name) return getattr(self._browser, name)
# Maximum search result pages to scrape per outcode (25 listings/page)
MAX_PAGES_PER_OUTCODE = 40
# JavaScript to extract listings from the rendered DOM. # JavaScript to extract listings from the rendered DOM.
# Uses data-testid attributes as primary selectors (stable across deployments), # Uses data-testid attributes as primary selectors (stable across deployments),
# then falls back to href-based link matching with parent-walking. # then falls back to href-based link matching with parent-walking.
@ -423,6 +421,45 @@ def _get_result_count(page) -> int:
return 0 return 0
def _url_with_page(url: str, page_num: int) -> str:
parsed = urlparse(url)
query = [(key, value) for key, value in parse_qsl(parsed.query) if key != "pn"]
query.append(("pn", str(page_num)))
return urlunparse(parsed._replace(query=urlencode(query)))
def _find_next_page_url(page) -> str | None:
"""Return the rendered pagination next URL, if Zoopla exposes one."""
try:
href = page.evaluate(
"""() => {
const links = Array.from(document.querySelectorAll('a[href]'));
const next = links.find((link) => {
const text = (link.innerText || link.textContent || '')
.trim()
.toLowerCase();
const label = (link.getAttribute('aria-label') || '').toLowerCase();
const rel = (link.getAttribute('rel') || '').toLowerCase();
return rel.includes('next')
|| label.includes('next')
|| text === 'next'
|| text === 'next page';
});
if (!next) return null;
const href = next.href || '';
if (!href.includes('/for-sale/') && !href.includes('/new-homes/')) {
return null;
}
return href;
}"""
)
except Exception:
return None
if not href:
return None
return urljoin(ZOOPLA_BASE, href)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Extraction and pagination # Extraction and pagination
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -476,29 +513,27 @@ def _paginate(
) -> list[dict]: ) -> list[dict]:
"""Extract listings from all pages of search results. """Extract listings from all pages of search results.
Page 1 is already loaded. For subsequent pages, clicks the Next button Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered
or navigates via URL parameter ?pn=N.""" next link when present, otherwise advance via the pn=N URL parameter while
the advertised result count says more listings remain."""
all_listings = _extract_listings(page) all_listings = _extract_listings(page)
if max_properties is not None and len(all_listings) >= max_properties: if max_properties is not None and len(all_listings) >= max_properties:
return all_listings[:max_properties] return all_listings[:max_properties]
if not all_listings or total_results <= len(all_listings): if not all_listings:
return all_listings return all_listings
seen_ids = {listing["id"] for listing in all_listings} seen_ids = {listing["id"] for listing in all_listings}
current_url = page.url
page_num = 2 page_num = 2
while len(all_listings) < total_results and page_num <= MAX_PAGES_PER_OUTCODE: while True:
time.sleep(DELAY_BETWEEN_PAGES) next_url = _find_next_page_url(page)
if not next_url:
if total_results > 0 and len(all_listings) >= total_results:
break
next_url = _url_with_page(page.url, page_num)
# Try navigating via URL parameter time.sleep(DELAY_BETWEEN_PAGES)
if "?" in current_url:
next_url = re.sub(r"[?&]pn=\d+", "", current_url)
separator = "&" if "?" in next_url else "?"
next_url = f"{next_url}{separator}pn={page_num}"
else:
next_url = f"{current_url}?pn={page_num}"
try: try:
page.goto(next_url, wait_until="domcontentloaded", timeout=30000) page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
@ -512,6 +547,12 @@ def _paginate(
page_listings = _extract_listings(page) page_listings = _extract_listings(page)
if not page_listings: if not page_listings:
if total_results > len(all_listings):
raise RuntimeError(
"Zoopla pagination stopped with no listings on page "
f"{page_num}; collected {len(all_listings)} of "
f"{total_results} advertised results"
)
break break
# Deduplicate within this outcode # Deduplicate within this outcode
@ -525,10 +566,20 @@ def _paginate(
return all_listings[:max_properties] return all_listings[:max_properties]
if new_count == 0: if new_count == 0:
break # No new listings on this page if total_results > len(all_listings):
raise RuntimeError(
"Zoopla pagination repeated results on page "
f"{page_num}; collected {len(all_listings)} of "
f"{total_results} advertised results"
)
break
page_num += 1 page_num += 1
if total_results > 0 and len(all_listings) >= total_results:
if not _find_next_page_url(page):
break
return all_listings return all_listings
@ -768,7 +819,7 @@ def search_outcode(
# not match Zoopla's current text format, but listings may still be in DOM # not match Zoopla's current text format, but listings may still be in DOM
raw_listings = _paginate( raw_listings = _paginate(
page, page,
max(total_results, 25), total_results,
max_properties=max_properties, max_properties=max_properties,
) )
if not raw_listings: if not raw_listings:

View file

@ -44,7 +44,7 @@ function getProductDemoSlug(language: string | undefined, isMobile: boolean): st
return isMobile ? `${base}-mobile` : base; return isMobile ? `${base}-mobile` : base;
} }
function highlightBrandText(text: string) { function highlightBrandText(text: string, className = BRAND_TEXT_CLASS) {
const parts = text.split(BRAND_NAME); const parts = text.split(BRAND_NAME);
if (parts.length === 1) return text; if (parts.length === 1) return text;
@ -52,7 +52,7 @@ function highlightBrandText(text: string) {
index === 0 index === 0
? [part] ? [part]
: [ : [
<span key={`brand-${index}`} className={BRAND_TEXT_CLASS}> <span key={`brand-${index}`} className={className}>
{BRAND_NAME} {BRAND_NAME}
</span>, </span>,
part, part,
@ -325,7 +325,7 @@ export default function HomePage({
{t('home.heroSubtitle')} {t('home.heroSubtitle')}
</p> </p>
<p className="text-base md:text-lg text-warm-200 mb-8 max-w-xl"> <p className="text-base md:text-lg text-warm-200 mb-8 max-w-xl">
{highlightBrandText(t('home.heroDescription'))} {highlightBrandText(t('home.heroDescription'), 'font-semibold text-teal-300')}
</p> </p>
<div className="flex flex-col sm:flex-row sm:items-center gap-3 sm:gap-4 mb-10"> <div className="flex flex-col sm:flex-row sm:items-center gap-3 sm:gap-4 mb-10">
<button <button

View file

@ -561,9 +561,7 @@ export default function MapPage({
isGroupExpanded={isAreaGroupExpanded} isGroupExpanded={isAreaGroupExpanded}
onToggleGroup={toggleAreaGroup} onToggleGroup={toggleAreaGroup}
scrollTopRef={areaPaneScrollTopRef} scrollTopRef={areaPaneScrollTopRef}
scrollRestoreKey={ scrollRestoreKey={selectedHexagon ? `${selectedHexagon.type}:${selectedHexagon.id}` : null}
selectedHexagon ? `${selectedHexagon.type}:${selectedHexagon.id}` : null
}
scrollSaveDisabled={loadingAreaStats && areaStats == null} scrollSaveDisabled={loadingAreaStats && areaStats == null}
/> />
</Suspense> </Suspense>
@ -578,9 +576,7 @@ export default function MapPage({
hexagonId={selectedHexagon?.id || null} hexagonId={selectedHexagon?.id || null}
onLoadMore={handleLoadMoreProperties} onLoadMore={handleLoadMoreProperties}
scrollTopRef={propertiesPaneScrollTopRef} scrollTopRef={propertiesPaneScrollTopRef}
scrollRestoreKey={ scrollRestoreKey={selectedHexagon ? `${selectedHexagon.type}:${selectedHexagon.id}` : null}
selectedHexagon ? `${selectedHexagon.type}:${selectedHexagon.id}` : null
}
scrollSaveDisabled={loadingProperties && properties.length === 0} scrollSaveDisabled={loadingProperties && properties.length === 0}
/> />
</Suspense> </Suspense>

View file

@ -34,11 +34,7 @@ describe('useRetainedScrollTop', () => {
it('keeps the saved scroll offset while replacement content is loading', () => { it('keeps the saved scroll offset while replacement content is loading', () => {
const savedScrollTopRef = { current: 0 }; const savedScrollTopRef = { current: 0 };
const view = render( const view = render(
<ScrollPane <ScrollPane restoreKey="area:a" savedScrollTopRef={savedScrollTopRef} suspendSave={false} />
restoreKey="area:a"
savedScrollTopRef={savedScrollTopRef}
suspendSave={false}
/>
); );
const pane = view.getByTestId('pane'); const pane = view.getByTestId('pane');
@ -55,11 +51,7 @@ describe('useRetainedScrollTop', () => {
expect(savedScrollTopRef.current).toBe(360); expect(savedScrollTopRef.current).toBe(360);
view.rerender( view.rerender(
<ScrollPane <ScrollPane restoreKey="area:b" savedScrollTopRef={savedScrollTopRef} suspendSave={false} />
restoreKey="area:b"
savedScrollTopRef={savedScrollTopRef}
suspendSave={false}
/>
); );
expect(pane.scrollTop).toBe(360); expect(pane.scrollTop).toBe(360);
@ -68,11 +60,7 @@ describe('useRetainedScrollTop', () => {
it('restores the saved offset when a pane remounts', () => { it('restores the saved offset when a pane remounts', () => {
const savedScrollTopRef = { current: 220 }; const savedScrollTopRef = { current: 220 };
const view = render( const view = render(
<ScrollPane <ScrollPane restoreKey="area:a" savedScrollTopRef={savedScrollTopRef} suspendSave={false} />
restoreKey="area:a"
savedScrollTopRef={savedScrollTopRef}
suspendSave={false}
/>
); );
expect(view.getByTestId('pane').scrollTop).toBe(220); expect(view.getByTestId('pane').scrollTop).toBe(220);

View file

@ -83,9 +83,7 @@ describe('map utilities', () => {
expect(getPoiIconUrl('M&S', '🛒', undefined, 'M&S Simply Food')).toBe( expect(getPoiIconUrl('M&S', '🛒', undefined, 'M&S Simply Food')).toBe(
'/assets/poi-icons/visuals/mns.svg' '/assets/poi-icons/visuals/mns.svg'
); );
expect(getPoiIconUrl('Tian Tian', '🛒')).toMatch( expect(getPoiIconUrl('Tian Tian', '🛒')).toMatch(/^data:image\/svg\+xml;charset=utf-8,/);
/^data:image\/svg\+xml;charset=utf-8,/
);
}); });
it('keeps POI icon URLs bundled locally', () => { it('keeps POI icon URLs bundled locally', () => {

View file

@ -358,9 +358,8 @@ function getGeneratedPoiLogoUrl(label: string): string {
const cached = generatedPoiLogoCache.get(key); const cached = generatedPoiLogoCache.get(key);
if (cached) return cached; if (cached) return cached;
const [background, foreground] = GENERATED_POI_LOGO_COLORS[ const [background, foreground] =
hashLabel(key) % GENERATED_POI_LOGO_COLORS.length GENERATED_POI_LOGO_COLORS[hashLabel(key) % GENERATED_POI_LOGO_COLORS.length];
];
const initials = escapeSvgText(getPoiLogoInitials(key)); const initials = escapeSvgText(getPoiLogoInitials(key));
const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="256" height="256" viewBox="0 0 256 256"><rect width="256" height="256" rx="48" fill="${background}"/><text x="128" y="144" text-anchor="middle" font-family="Inter,Arial,sans-serif" font-size="82" font-weight="800" fill="${foreground}">${initials}</text></svg>`; const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="256" height="256" viewBox="0 0 256 256"><rect width="256" height="256" rx="48" fill="${background}"/><text x="128" y="144" text-anchor="middle" font-family="Inter,Arial,sans-serif" font-size="82" font-weight="800" fill="${foreground}">${initials}</text></svg>`;
const url = `data:image/svg+xml;charset=utf-8,${encodeURIComponent(svg)}`; const url = `data:image/svg+xml;charset=utf-8,${encodeURIComponent(svg)}`;

View file

@ -14,7 +14,6 @@ pub const MAX_CELLS_PER_REQUEST: usize = 200000;
pub const MAX_POIS_PER_REQUEST: usize = 3000; pub const MAX_POIS_PER_REQUEST: usize = 3000;
pub const PROPERTIES_LIMIT: usize = 100; pub const PROPERTIES_LIMIT: usize = 100;
pub const ACTUAL_LISTINGS_LIMIT: usize = 500;
pub const PLACES_LIMIT: usize = 20; pub const PLACES_LIMIT: usize = 20;
pub const PRICE_HISTORY_POINTS_LIMIT: usize = 5000; pub const PRICE_HISTORY_POINTS_LIMIT: usize = 5000;
pub const POSTCODE_SEARCH_OFFSET: f64 = 0.02; pub const POSTCODE_SEARCH_OFFSET: f64 = 0.02;

View file

@ -7,7 +7,6 @@ use serde::{Deserialize, Serialize};
use tracing::info; use tracing::info;
use crate::api_error::ApiError; use crate::api_error::ApiError;
use crate::consts::ACTUAL_LISTINGS_LIMIT;
use crate::data::ActualListing; use crate::data::ActualListing;
use crate::features::property_level_feature_names; use crate::features::property_level_feature_names;
use crate::parsing::{ use crate::parsing::{
@ -41,7 +40,6 @@ pub async fn get_actual_listings(
Query(params): Query<ActualListingsParams>, Query(params): Query<ActualListingsParams>,
) -> Result<Json<ActualListingsResponse>, ApiError> { ) -> Result<Json<ActualListingsResponse>, ApiError> {
let state = shared.load_state(); let state = shared.load_state();
let limit = ACTUAL_LISTINGS_LIMIT;
let offset = params.offset.unwrap_or(0); let offset = params.offset.unwrap_or(0);
let Some(actual_listings) = state.actual_listings.clone() else { let Some(actual_listings) = state.actual_listings.clone() else {
return Ok(Json(ActualListingsResponse { return Ok(Json(ActualListingsResponse {
@ -132,11 +130,9 @@ pub async fn get_actual_listings(
}) })
}); });
let truncated = total_matching > offset.saturating_add(limit);
let listings: Vec<ActualListing> = matching_rows let listings: Vec<ActualListing> = matching_rows
.iter() .iter()
.skip(offset) .skip(offset)
.take(limit)
.map(|&row| actual_listings.listing_at(row)) .map(|&row| actual_listings.listing_at(row))
.collect(); .collect();
@ -155,7 +151,7 @@ pub async fn get_actual_listings(
listings, listings,
total: total_matching, total: total_matching,
offset, offset,
truncated, truncated: false,
}) })
}) })
.await .await