From 6ea544a0f6d2530b041f8e1090c809518530e930 Mon Sep 17 00:00:00 2001
From: Andras Schmelczer
Date: Sun, 17 May 2026 19:48:55 +0100
Subject: [PATCH] Harden finder pagination, remove actual-listings limit, resolve CI registry host, fmt
---
.forgejo/workflows/docker-publish.yml | 11 ++-
finder/homecouk.py | 58 +++++++++++--
finder/zoopla.py | 87 +++++++++++++++----
frontend/src/components/home/HomePage.tsx | 6 +-
frontend/src/components/map/MapPage.tsx | 8 +-
.../src/hooks/useRetainedScrollTop.test.tsx | 18 +---
frontend/src/lib/map-utils.test.ts | 4 +-
frontend/src/lib/map-utils.ts | 5 +-
server-rs/src/consts.rs | 1 -
server-rs/src/routes/actual_listings.rs | 6 +-
10 files changed, 144 insertions(+), 60 deletions(-)
diff --git a/.forgejo/workflows/docker-publish.yml b/.forgejo/workflows/docker-publish.yml
index 10a3b89..19542d8 100644
--- a/.forgejo/workflows/docker-publish.yml
+++ b/.forgejo/workflows/docker-publish.yml
@@ -22,13 +22,22 @@ jobs:
- name: Set up Docker Buildx
uses: https://github.com/docker/setup-buildx-action@v3
+ with:
+ driver-opts: |
+ network=host
- name: Resolve registry vars
id: registry
+ env:
+ CONTAINER_REGISTRY_HOST: ${{ vars.CONTAINER_REGISTRY_HOST }}
run: |
- host="${{ gitea.server_url }}"
+ host="${CONTAINER_REGISTRY_HOST:-${{ gitea.server_url }}}"
host="${host#https://}"
host="${host#http://}"
+ host="${host%/}"
+ if [ "$host" = "forgejo:3000" ]; then
+ host="127.0.0.1:13000"
+ fi
repo=$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]')
{
echo "host=${host}"
diff --git a/finder/homecouk.py b/finder/homecouk.py
index 09a2401..60d421d 100644
--- a/finder/homecouk.py
+++ b/finder/homecouk.py
@@ -33,6 +33,10 @@ class CookiesExpiredError(Exception):
"""Raised when home.co.uk returns 403, indicating cookies need refresh."""
+class PaginationError(Exception):
+ """Raised when home.co.uk pagination cannot be completed."""
+
+
# Channel mapping: internal name → URL path segment
HOMECOUK_URL_SEGMENT = "for-sale"
@@ -171,6 +175,25 @@ def fetch_page(
return None
+def _coerce_positive_int(value) -> int | None:
+ parsed = parse_int_value(value)
+ if parsed is None or parsed <= 0:
+ return None
+ return parsed
+
+
+def _property_identity(prop: dict, page: int, index: int) -> str:
+ for key in ("listing_id", "property_id", "id"):
+ value = prop.get(key)
+ if value:
+ return f"{key}:{value}"
+ return (
+ f"page:{page}:index:{index}:"
+ f"{prop.get('display_address') or prop.get('address') or ''}:"
+ f"{prop.get('price') or prop.get('latest_price') or ''}"
+ )
+
+
def parse_floor_area(description: str | None) -> float | None:
"""Try to extract floor area from description text like '789 sq.ft.' or '73 sq.m.'."""
if not description:
@@ -363,6 +386,9 @@ def search_outcode(
url = f"{HOMECOUK_API_BASE}/{url_segment}/{outcode.lower()}/"
properties = []
page = 1
+ last_page: int | None = None
+ total_results: int | None = None
+ seen_ids: set[str] = set()
while True:
params = {
@@ -379,12 +405,32 @@ def search_outcode(
data = fetch_page(client, url, params)
if not data:
- break
+ raise PaginationError(f"home.co.uk {outcode} page {page} failed to load")
+
+ pagination = data.get("pagination", {}) or {}
+ if last_page is None:
+ last_page = _coerce_positive_int(pagination.get("last_page"))
+ if total_results is None:
+ total_results = _coerce_positive_int(pagination.get("total"))
raw_props = data.get("properties", [])
if not raw_props:
+ if total_results and page <= (last_page or page):
+ raise PaginationError(
+ f"home.co.uk {outcode} page {page} returned no properties "
+ f"before the advertised end"
+ )
break
+ page_ids = {
+ _property_identity(prop, page, idx) for idx, prop in enumerate(raw_props)
+ }
+ if page_ids and page_ids.issubset(seen_ids):
+ raise PaginationError(
+ f"home.co.uk {outcode} page {page} repeated previously seen results"
+ )
+ seen_ids.update(page_ids)
+
for prop in raw_props:
try:
transformed = transform_property(prop, pc_index)
@@ -401,10 +447,12 @@ def search_outcode(
if max_properties is not None and len(properties) >= max_properties:
return properties
- # Check pagination
- pagination = data.get("pagination", {})
- last_page = pagination.get("last_page", 1)
- if page >= last_page:
+ if last_page is not None:
+ if page >= last_page:
+ break
+ elif total_results is not None and len(seen_ids) >= total_results:
+ break
+ elif len(raw_props) < HOMECOUK_PER_PAGE:
break
page += 1
diff --git a/finder/zoopla.py b/finder/zoopla.py
index dcd70ee..e2a2363 100644
--- a/finder/zoopla.py
+++ b/finder/zoopla.py
@@ -21,6 +21,7 @@ Architecture:
import logging
import re
import time
+from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from spatial import PostcodeSpatialIndex
@@ -52,9 +53,6 @@ class _ManagedCamoufoxBrowser:
return getattr(self._browser, name)
-# Maximum search result pages to scrape per outcode (25 listings/page)
-MAX_PAGES_PER_OUTCODE = 40
-
# JavaScript to extract listings from the rendered DOM.
# Uses data-testid attributes as primary selectors (stable across deployments),
# then falls back to href-based link matching with parent-walking.
@@ -423,6 +421,45 @@ def _get_result_count(page) -> int:
return 0
+def _url_with_page(url: str, page_num: int) -> str:
+ parsed = urlparse(url)
+ query = [(key, value) for key, value in parse_qsl(parsed.query) if key != "pn"]
+ query.append(("pn", str(page_num)))
+ return urlunparse(parsed._replace(query=urlencode(query)))
+
+
+def _find_next_page_url(page) -> str | None:
+ """Return the rendered pagination next URL, if Zoopla exposes one."""
+ try:
+ href = page.evaluate(
+ """() => {
+ const links = Array.from(document.querySelectorAll('a[href]'));
+ const next = links.find((link) => {
+ const text = (link.innerText || link.textContent || '')
+ .trim()
+ .toLowerCase();
+ const label = (link.getAttribute('aria-label') || '').toLowerCase();
+ const rel = (link.getAttribute('rel') || '').toLowerCase();
+ return rel.includes('next')
+ || label.includes('next')
+ || text === 'next'
+ || text === 'next page';
+ });
+ if (!next) return null;
+ const href = next.href || '';
+ if (!href.includes('/for-sale/') && !href.includes('/new-homes/')) {
+ return null;
+ }
+ return href;
+ }"""
+ )
+ except Exception:
+ return None
+ if not href:
+ return None
+ return urljoin(ZOOPLA_BASE, href)
+
+
# ---------------------------------------------------------------------------
# Extraction and pagination
# ---------------------------------------------------------------------------
@@ -476,29 +513,27 @@ def _paginate(
) -> list[dict]:
"""Extract listings from all pages of search results.
- Page 1 is already loaded. For subsequent pages, clicks the Next button
- or navigates via URL parameter ?pn=N."""
+ Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered
+ next link when present, otherwise advance via the pn=N URL parameter while
+ the advertised result count says more listings remain."""
all_listings = _extract_listings(page)
if max_properties is not None and len(all_listings) >= max_properties:
return all_listings[:max_properties]
- if not all_listings or total_results <= len(all_listings):
+ if not all_listings:
return all_listings
seen_ids = {listing["id"] for listing in all_listings}
- current_url = page.url
page_num = 2
- while len(all_listings) < total_results and page_num <= MAX_PAGES_PER_OUTCODE:
- time.sleep(DELAY_BETWEEN_PAGES)
+ while True:
+ next_url = _find_next_page_url(page)
+ if not next_url:
+ if total_results > 0 and len(all_listings) >= total_results:
+ break
+ next_url = _url_with_page(page.url, page_num)
- # Try navigating via URL parameter
- if "?" in current_url:
- next_url = re.sub(r"[?&]pn=\d+", "", current_url)
- separator = "&" if "?" in next_url else "?"
- next_url = f"{next_url}{separator}pn={page_num}"
- else:
- next_url = f"{current_url}?pn={page_num}"
+ time.sleep(DELAY_BETWEEN_PAGES)
try:
page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
@@ -512,6 +547,12 @@ def _paginate(
page_listings = _extract_listings(page)
if not page_listings:
+ if total_results > len(all_listings):
+ raise RuntimeError(
+ "Zoopla pagination stopped with no listings on page "
+ f"{page_num}; collected {len(all_listings)} of "
+ f"{total_results} advertised results"
+ )
break
# Deduplicate within this outcode
@@ -525,10 +566,20 @@ def _paginate(
return all_listings[:max_properties]
if new_count == 0:
- break # No new listings on this page
+ if total_results > len(all_listings):
+ raise RuntimeError(
+ "Zoopla pagination repeated results on page "
+ f"{page_num}; collected {len(all_listings)} of "
+ f"{total_results} advertised results"
+ )
+ break
page_num += 1
+ if total_results > 0 and len(all_listings) >= total_results:
+ if not _find_next_page_url(page):
+ break
+
return all_listings
@@ -768,7 +819,7 @@ def search_outcode(
# not match Zoopla's current text format, but listings may still be in DOM
raw_listings = _paginate(
page,
- max(total_results, 25),
+ total_results,
max_properties=max_properties,
)
if not raw_listings:
diff --git a/frontend/src/components/home/HomePage.tsx b/frontend/src/components/home/HomePage.tsx
index bc72f8a..ed703a1 100644
--- a/frontend/src/components/home/HomePage.tsx
+++ b/frontend/src/components/home/HomePage.tsx
@@ -44,7 +44,7 @@ function getProductDemoSlug(language: string | undefined, isMobile: boolean): st
return isMobile ? `${base}-mobile` : base;
}
-function highlightBrandText(text: string) {
+function highlightBrandText(text: string, className = BRAND_TEXT_CLASS) {
const parts = text.split(BRAND_NAME);
if (parts.length === 1) return text;
@@ -52,7 +52,7 @@ function highlightBrandText(text: string) {
index === 0
? [part]
: [
-
+
{BRAND_NAME}
,
part,
@@ -325,7 +325,7 @@ export default function HomePage({
{t('home.heroSubtitle')}
- {highlightBrandText(t('home.heroDescription'))}
+ {highlightBrandText(t('home.heroDescription'), 'font-semibold text-teal-300')}
@@ -578,9 +576,7 @@ export default function MapPage({
hexagonId={selectedHexagon?.id || null}
onLoadMore={handleLoadMoreProperties}
scrollTopRef={propertiesPaneScrollTopRef}
- scrollRestoreKey={
- selectedHexagon ? `${selectedHexagon.type}:${selectedHexagon.id}` : null
- }
+ scrollRestoreKey={selectedHexagon ? `${selectedHexagon.type}:${selectedHexagon.id}` : null}
scrollSaveDisabled={loadingProperties && properties.length === 0}
/>
diff --git a/frontend/src/hooks/useRetainedScrollTop.test.tsx b/frontend/src/hooks/useRetainedScrollTop.test.tsx
index 592177e..630cf1d 100644
--- a/frontend/src/hooks/useRetainedScrollTop.test.tsx
+++ b/frontend/src/hooks/useRetainedScrollTop.test.tsx
@@ -34,11 +34,7 @@ describe('useRetainedScrollTop', () => {
it('keeps the saved scroll offset while replacement content is loading', () => {
const savedScrollTopRef = { current: 0 };
const view = render(
-
+
);
const pane = view.getByTestId('pane');
@@ -55,11 +51,7 @@ describe('useRetainedScrollTop', () => {
expect(savedScrollTopRef.current).toBe(360);
view.rerender(
-
+
);
expect(pane.scrollTop).toBe(360);
@@ -68,11 +60,7 @@ describe('useRetainedScrollTop', () => {
it('restores the saved offset when a pane remounts', () => {
const savedScrollTopRef = { current: 220 };
const view = render(
-
+
);
expect(view.getByTestId('pane').scrollTop).toBe(220);
diff --git a/frontend/src/lib/map-utils.test.ts b/frontend/src/lib/map-utils.test.ts
index 7cfc55c..b82e1b6 100644
--- a/frontend/src/lib/map-utils.test.ts
+++ b/frontend/src/lib/map-utils.test.ts
@@ -83,9 +83,7 @@ describe('map utilities', () => {
expect(getPoiIconUrl('M&S', '🛒', undefined, 'M&S Simply Food')).toBe(
'/assets/poi-icons/visuals/mns.svg'
);
- expect(getPoiIconUrl('Tian Tian', '🛒')).toMatch(
- /^data:image\/svg\+xml;charset=utf-8,/
- );
+ expect(getPoiIconUrl('Tian Tian', '🛒')).toMatch(/^data:image\/svg\+xml;charset=utf-8,/);
});
it('keeps POI icon URLs bundled locally', () => {
diff --git a/frontend/src/lib/map-utils.ts b/frontend/src/lib/map-utils.ts
index 8824a58..a893fc3 100644
--- a/frontend/src/lib/map-utils.ts
+++ b/frontend/src/lib/map-utils.ts
@@ -358,9 +358,8 @@ function getGeneratedPoiLogoUrl(label: string): string {
const cached = generatedPoiLogoCache.get(key);
if (cached) return cached;
- const [background, foreground] = GENERATED_POI_LOGO_COLORS[
- hashLabel(key) % GENERATED_POI_LOGO_COLORS.length
- ];
+ const [background, foreground] =
+ GENERATED_POI_LOGO_COLORS[hashLabel(key) % GENERATED_POI_LOGO_COLORS.length];
const initials = escapeSvgText(getPoiLogoInitials(key));
const svg = `
`;
const url = `data:image/svg+xml;charset=utf-8,${encodeURIComponent(svg)}`;
diff --git a/server-rs/src/consts.rs b/server-rs/src/consts.rs
index 64ea799..e29240a 100644
--- a/server-rs/src/consts.rs
+++ b/server-rs/src/consts.rs
@@ -14,7 +14,6 @@ pub const MAX_CELLS_PER_REQUEST: usize = 200000;
pub const MAX_POIS_PER_REQUEST: usize = 3000;
pub const PROPERTIES_LIMIT: usize = 100;
-pub const ACTUAL_LISTINGS_LIMIT: usize = 500;
pub const PLACES_LIMIT: usize = 20;
pub const PRICE_HISTORY_POINTS_LIMIT: usize = 5000;
pub const POSTCODE_SEARCH_OFFSET: f64 = 0.02;
diff --git a/server-rs/src/routes/actual_listings.rs b/server-rs/src/routes/actual_listings.rs
index 2c37df1..525cfa9 100644
--- a/server-rs/src/routes/actual_listings.rs
+++ b/server-rs/src/routes/actual_listings.rs
@@ -7,7 +7,6 @@ use serde::{Deserialize, Serialize};
use tracing::info;
use crate::api_error::ApiError;
-use crate::consts::ACTUAL_LISTINGS_LIMIT;
use crate::data::ActualListing;
use crate::features::property_level_feature_names;
use crate::parsing::{
@@ -41,7 +40,6 @@ pub async fn get_actual_listings(
Query(params): Query
,
) -> Result, ApiError> {
let state = shared.load_state();
- let limit = ACTUAL_LISTINGS_LIMIT;
let offset = params.offset.unwrap_or(0);
let Some(actual_listings) = state.actual_listings.clone() else {
return Ok(Json(ActualListingsResponse {
@@ -132,11 +130,9 @@ pub async fn get_actual_listings(
})
});
- let truncated = total_matching > offset.saturating_add(limit);
let listings: Vec = matching_rows
.iter()
.skip(offset)
- .take(limit)
.map(|&row| actual_listings.listing_at(row))
.collect();
@@ -155,7 +151,7 @@ pub async fn get_actual_listings(
listings,
total: total_matching,
offset,
- truncated,
+ truncated: false,
})
})
.await