fmt
This commit is contained in:
parent
2f149503bb
commit
6ea544a0f6
10 changed files with 144 additions and 60 deletions
|
|
@ -22,13 +22,22 @@ jobs:
|
|||
|
||||
- name: Set up Docker Buildx
|
||||
uses: https://github.com/docker/setup-buildx-action@v3
|
||||
with:
|
||||
driver-opts: |
|
||||
network=host
|
||||
|
||||
- name: Resolve registry vars
|
||||
id: registry
|
||||
env:
|
||||
CONTAINER_REGISTRY_HOST: ${{ vars.CONTAINER_REGISTRY_HOST }}
|
||||
run: |
|
||||
host="${{ gitea.server_url }}"
|
||||
host="${CONTAINER_REGISTRY_HOST:-${{ gitea.server_url }}}"
|
||||
host="${host#https://}"
|
||||
host="${host#http://}"
|
||||
host="${host%/}"
|
||||
if [ "$host" = "forgejo:3000" ]; then
|
||||
host="127.0.0.1:13000"
|
||||
fi
|
||||
repo=$(echo "${{ gitea.repository }}" | tr '[:upper:]' '[:lower:]')
|
||||
{
|
||||
echo "host=${host}"
|
||||
|
|
|
|||
|
|
@ -33,6 +33,10 @@ class CookiesExpiredError(Exception):
|
|||
"""Raised when home.co.uk returns 403, indicating cookies need refresh."""
|
||||
|
||||
|
||||
class PaginationError(Exception):
|
||||
"""Raised when home.co.uk pagination cannot be completed."""
|
||||
|
||||
|
||||
# Channel mapping: internal name → URL path segment
|
||||
HOMECOUK_URL_SEGMENT = "for-sale"
|
||||
|
||||
|
|
@ -171,6 +175,25 @@ def fetch_page(
|
|||
return None
|
||||
|
||||
|
||||
def _coerce_positive_int(value) -> int | None:
|
||||
parsed = parse_int_value(value)
|
||||
if parsed is None or parsed <= 0:
|
||||
return None
|
||||
return parsed
|
||||
|
||||
|
||||
def _property_identity(prop: dict, page: int, index: int) -> str:
|
||||
for key in ("listing_id", "property_id", "id"):
|
||||
value = prop.get(key)
|
||||
if value:
|
||||
return f"{key}:{value}"
|
||||
return (
|
||||
f"page:{page}:index:{index}:"
|
||||
f"{prop.get('display_address') or prop.get('address') or ''}:"
|
||||
f"{prop.get('price') or prop.get('latest_price') or ''}"
|
||||
)
|
||||
|
||||
|
||||
def parse_floor_area(description: str | None) -> float | None:
|
||||
"""Try to extract floor area from description text like '789 sq.ft.' or '73 sq.m.'."""
|
||||
if not description:
|
||||
|
|
@ -363,6 +386,9 @@ def search_outcode(
|
|||
url = f"{HOMECOUK_API_BASE}/{url_segment}/{outcode.lower()}/"
|
||||
properties = []
|
||||
page = 1
|
||||
last_page: int | None = None
|
||||
total_results: int | None = None
|
||||
seen_ids: set[str] = set()
|
||||
|
||||
while True:
|
||||
params = {
|
||||
|
|
@ -379,12 +405,32 @@ def search_outcode(
|
|||
|
||||
data = fetch_page(client, url, params)
|
||||
if not data:
|
||||
break
|
||||
raise PaginationError(f"home.co.uk {outcode} page {page} failed to load")
|
||||
|
||||
pagination = data.get("pagination", {}) or {}
|
||||
if last_page is None:
|
||||
last_page = _coerce_positive_int(pagination.get("last_page"))
|
||||
if total_results is None:
|
||||
total_results = _coerce_positive_int(pagination.get("total"))
|
||||
|
||||
raw_props = data.get("properties", [])
|
||||
if not raw_props:
|
||||
if total_results and page <= (last_page or page):
|
||||
raise PaginationError(
|
||||
f"home.co.uk {outcode} page {page} returned no properties "
|
||||
f"before the advertised end"
|
||||
)
|
||||
break
|
||||
|
||||
page_ids = {
|
||||
_property_identity(prop, page, idx) for idx, prop in enumerate(raw_props)
|
||||
}
|
||||
if page_ids and page_ids.issubset(seen_ids):
|
||||
raise PaginationError(
|
||||
f"home.co.uk {outcode} page {page} repeated previously seen results"
|
||||
)
|
||||
seen_ids.update(page_ids)
|
||||
|
||||
for prop in raw_props:
|
||||
try:
|
||||
transformed = transform_property(prop, pc_index)
|
||||
|
|
@ -401,10 +447,12 @@ def search_outcode(
|
|||
if max_properties is not None and len(properties) >= max_properties:
|
||||
return properties
|
||||
|
||||
# Check pagination
|
||||
pagination = data.get("pagination", {})
|
||||
last_page = pagination.get("last_page", 1)
|
||||
if page >= last_page:
|
||||
if last_page is not None:
|
||||
if page >= last_page:
|
||||
break
|
||||
elif total_results is not None and len(seen_ids) >= total_results:
|
||||
break
|
||||
elif len(raw_props) < HOMECOUK_PER_PAGE:
|
||||
break
|
||||
|
||||
page += 1
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ Architecture:
|
|||
import logging
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
||||
|
||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from spatial import PostcodeSpatialIndex
|
||||
|
|
@ -52,9 +53,6 @@ class _ManagedCamoufoxBrowser:
|
|||
return getattr(self._browser, name)
|
||||
|
||||
|
||||
# Maximum search result pages to scrape per outcode (25 listings/page)
|
||||
MAX_PAGES_PER_OUTCODE = 40
|
||||
|
||||
# JavaScript to extract listings from the rendered DOM.
|
||||
# Uses data-testid attributes as primary selectors (stable across deployments),
|
||||
# then falls back to href-based link matching with parent-walking.
|
||||
|
|
@ -423,6 +421,45 @@ def _get_result_count(page) -> int:
|
|||
return 0
|
||||
|
||||
|
||||
def _url_with_page(url: str, page_num: int) -> str:
|
||||
parsed = urlparse(url)
|
||||
query = [(key, value) for key, value in parse_qsl(parsed.query) if key != "pn"]
|
||||
query.append(("pn", str(page_num)))
|
||||
return urlunparse(parsed._replace(query=urlencode(query)))
|
||||
|
||||
|
||||
def _find_next_page_url(page) -> str | None:
|
||||
"""Return the rendered pagination next URL, if Zoopla exposes one."""
|
||||
try:
|
||||
href = page.evaluate(
|
||||
"""() => {
|
||||
const links = Array.from(document.querySelectorAll('a[href]'));
|
||||
const next = links.find((link) => {
|
||||
const text = (link.innerText || link.textContent || '')
|
||||
.trim()
|
||||
.toLowerCase();
|
||||
const label = (link.getAttribute('aria-label') || '').toLowerCase();
|
||||
const rel = (link.getAttribute('rel') || '').toLowerCase();
|
||||
return rel.includes('next')
|
||||
|| label.includes('next')
|
||||
|| text === 'next'
|
||||
|| text === 'next page';
|
||||
});
|
||||
if (!next) return null;
|
||||
const href = next.href || '';
|
||||
if (!href.includes('/for-sale/') && !href.includes('/new-homes/')) {
|
||||
return null;
|
||||
}
|
||||
return href;
|
||||
}"""
|
||||
)
|
||||
except Exception:
|
||||
return None
|
||||
if not href:
|
||||
return None
|
||||
return urljoin(ZOOPLA_BASE, href)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extraction and pagination
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -476,29 +513,27 @@ def _paginate(
|
|||
) -> list[dict]:
|
||||
"""Extract listings from all pages of search results.
|
||||
|
||||
Page 1 is already loaded. For subsequent pages, clicks the Next button
|
||||
or navigates via URL parameter ?pn=N."""
|
||||
Page 1 is already loaded. For subsequent pages, follow Zoopla's rendered
|
||||
next link when present, otherwise advance via the pn=N URL parameter while
|
||||
the advertised result count says more listings remain."""
|
||||
all_listings = _extract_listings(page)
|
||||
if max_properties is not None and len(all_listings) >= max_properties:
|
||||
return all_listings[:max_properties]
|
||||
|
||||
if not all_listings or total_results <= len(all_listings):
|
||||
if not all_listings:
|
||||
return all_listings
|
||||
|
||||
seen_ids = {listing["id"] for listing in all_listings}
|
||||
current_url = page.url
|
||||
page_num = 2
|
||||
|
||||
while len(all_listings) < total_results and page_num <= MAX_PAGES_PER_OUTCODE:
|
||||
time.sleep(DELAY_BETWEEN_PAGES)
|
||||
while True:
|
||||
next_url = _find_next_page_url(page)
|
||||
if not next_url:
|
||||
if total_results > 0 and len(all_listings) >= total_results:
|
||||
break
|
||||
next_url = _url_with_page(page.url, page_num)
|
||||
|
||||
# Try navigating via URL parameter
|
||||
if "?" in current_url:
|
||||
next_url = re.sub(r"[?&]pn=\d+", "", current_url)
|
||||
separator = "&" if "?" in next_url else "?"
|
||||
next_url = f"{next_url}{separator}pn={page_num}"
|
||||
else:
|
||||
next_url = f"{current_url}?pn={page_num}"
|
||||
time.sleep(DELAY_BETWEEN_PAGES)
|
||||
|
||||
try:
|
||||
page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
|
||||
|
|
@ -512,6 +547,12 @@ def _paginate(
|
|||
|
||||
page_listings = _extract_listings(page)
|
||||
if not page_listings:
|
||||
if total_results > len(all_listings):
|
||||
raise RuntimeError(
|
||||
"Zoopla pagination stopped with no listings on page "
|
||||
f"{page_num}; collected {len(all_listings)} of "
|
||||
f"{total_results} advertised results"
|
||||
)
|
||||
break
|
||||
|
||||
# Deduplicate within this outcode
|
||||
|
|
@ -525,10 +566,20 @@ def _paginate(
|
|||
return all_listings[:max_properties]
|
||||
|
||||
if new_count == 0:
|
||||
break # No new listings on this page
|
||||
if total_results > len(all_listings):
|
||||
raise RuntimeError(
|
||||
"Zoopla pagination repeated results on page "
|
||||
f"{page_num}; collected {len(all_listings)} of "
|
||||
f"{total_results} advertised results"
|
||||
)
|
||||
break
|
||||
|
||||
page_num += 1
|
||||
|
||||
if total_results > 0 and len(all_listings) >= total_results:
|
||||
if not _find_next_page_url(page):
|
||||
break
|
||||
|
||||
return all_listings
|
||||
|
||||
|
||||
|
|
@ -768,7 +819,7 @@ def search_outcode(
|
|||
# not match Zoopla's current text format, but listings may still be in DOM
|
||||
raw_listings = _paginate(
|
||||
page,
|
||||
max(total_results, 25),
|
||||
total_results,
|
||||
max_properties=max_properties,
|
||||
)
|
||||
if not raw_listings:
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ function getProductDemoSlug(language: string | undefined, isMobile: boolean): st
|
|||
return isMobile ? `${base}-mobile` : base;
|
||||
}
|
||||
|
||||
function highlightBrandText(text: string) {
|
||||
function highlightBrandText(text: string, className = BRAND_TEXT_CLASS) {
|
||||
const parts = text.split(BRAND_NAME);
|
||||
if (parts.length === 1) return text;
|
||||
|
||||
|
|
@ -52,7 +52,7 @@ function highlightBrandText(text: string) {
|
|||
index === 0
|
||||
? [part]
|
||||
: [
|
||||
<span key={`brand-${index}`} className={BRAND_TEXT_CLASS}>
|
||||
<span key={`brand-${index}`} className={className}>
|
||||
{BRAND_NAME}
|
||||
</span>,
|
||||
part,
|
||||
|
|
@ -325,7 +325,7 @@ export default function HomePage({
|
|||
{t('home.heroSubtitle')}
|
||||
</p>
|
||||
<p className="text-base md:text-lg text-warm-200 mb-8 max-w-xl">
|
||||
{highlightBrandText(t('home.heroDescription'))}
|
||||
{highlightBrandText(t('home.heroDescription'), 'font-semibold text-teal-300')}
|
||||
</p>
|
||||
<div className="flex flex-col sm:flex-row sm:items-center gap-3 sm:gap-4 mb-10">
|
||||
<button
|
||||
|
|
|
|||
|
|
@ -561,9 +561,7 @@ export default function MapPage({
|
|||
isGroupExpanded={isAreaGroupExpanded}
|
||||
onToggleGroup={toggleAreaGroup}
|
||||
scrollTopRef={areaPaneScrollTopRef}
|
||||
scrollRestoreKey={
|
||||
selectedHexagon ? `${selectedHexagon.type}:${selectedHexagon.id}` : null
|
||||
}
|
||||
scrollRestoreKey={selectedHexagon ? `${selectedHexagon.type}:${selectedHexagon.id}` : null}
|
||||
scrollSaveDisabled={loadingAreaStats && areaStats == null}
|
||||
/>
|
||||
</Suspense>
|
||||
|
|
@ -578,9 +576,7 @@ export default function MapPage({
|
|||
hexagonId={selectedHexagon?.id || null}
|
||||
onLoadMore={handleLoadMoreProperties}
|
||||
scrollTopRef={propertiesPaneScrollTopRef}
|
||||
scrollRestoreKey={
|
||||
selectedHexagon ? `${selectedHexagon.type}:${selectedHexagon.id}` : null
|
||||
}
|
||||
scrollRestoreKey={selectedHexagon ? `${selectedHexagon.type}:${selectedHexagon.id}` : null}
|
||||
scrollSaveDisabled={loadingProperties && properties.length === 0}
|
||||
/>
|
||||
</Suspense>
|
||||
|
|
|
|||
|
|
@ -34,11 +34,7 @@ describe('useRetainedScrollTop', () => {
|
|||
it('keeps the saved scroll offset while replacement content is loading', () => {
|
||||
const savedScrollTopRef = { current: 0 };
|
||||
const view = render(
|
||||
<ScrollPane
|
||||
restoreKey="area:a"
|
||||
savedScrollTopRef={savedScrollTopRef}
|
||||
suspendSave={false}
|
||||
/>
|
||||
<ScrollPane restoreKey="area:a" savedScrollTopRef={savedScrollTopRef} suspendSave={false} />
|
||||
);
|
||||
const pane = view.getByTestId('pane');
|
||||
|
||||
|
|
@ -55,11 +51,7 @@ describe('useRetainedScrollTop', () => {
|
|||
expect(savedScrollTopRef.current).toBe(360);
|
||||
|
||||
view.rerender(
|
||||
<ScrollPane
|
||||
restoreKey="area:b"
|
||||
savedScrollTopRef={savedScrollTopRef}
|
||||
suspendSave={false}
|
||||
/>
|
||||
<ScrollPane restoreKey="area:b" savedScrollTopRef={savedScrollTopRef} suspendSave={false} />
|
||||
);
|
||||
|
||||
expect(pane.scrollTop).toBe(360);
|
||||
|
|
@ -68,11 +60,7 @@ describe('useRetainedScrollTop', () => {
|
|||
it('restores the saved offset when a pane remounts', () => {
|
||||
const savedScrollTopRef = { current: 220 };
|
||||
const view = render(
|
||||
<ScrollPane
|
||||
restoreKey="area:a"
|
||||
savedScrollTopRef={savedScrollTopRef}
|
||||
suspendSave={false}
|
||||
/>
|
||||
<ScrollPane restoreKey="area:a" savedScrollTopRef={savedScrollTopRef} suspendSave={false} />
|
||||
);
|
||||
|
||||
expect(view.getByTestId('pane').scrollTop).toBe(220);
|
||||
|
|
|
|||
|
|
@ -83,9 +83,7 @@ describe('map utilities', () => {
|
|||
expect(getPoiIconUrl('M&S', '🛒', undefined, 'M&S Simply Food')).toBe(
|
||||
'/assets/poi-icons/visuals/mns.svg'
|
||||
);
|
||||
expect(getPoiIconUrl('Tian Tian', '🛒')).toMatch(
|
||||
/^data:image\/svg\+xml;charset=utf-8,/
|
||||
);
|
||||
expect(getPoiIconUrl('Tian Tian', '🛒')).toMatch(/^data:image\/svg\+xml;charset=utf-8,/);
|
||||
});
|
||||
|
||||
it('keeps POI icon URLs bundled locally', () => {
|
||||
|
|
|
|||
|
|
@ -358,9 +358,8 @@ function getGeneratedPoiLogoUrl(label: string): string {
|
|||
const cached = generatedPoiLogoCache.get(key);
|
||||
if (cached) return cached;
|
||||
|
||||
const [background, foreground] = GENERATED_POI_LOGO_COLORS[
|
||||
hashLabel(key) % GENERATED_POI_LOGO_COLORS.length
|
||||
];
|
||||
const [background, foreground] =
|
||||
GENERATED_POI_LOGO_COLORS[hashLabel(key) % GENERATED_POI_LOGO_COLORS.length];
|
||||
const initials = escapeSvgText(getPoiLogoInitials(key));
|
||||
const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="256" height="256" viewBox="0 0 256 256"><rect width="256" height="256" rx="48" fill="${background}"/><text x="128" y="144" text-anchor="middle" font-family="Inter,Arial,sans-serif" font-size="82" font-weight="800" fill="${foreground}">${initials}</text></svg>`;
|
||||
const url = `data:image/svg+xml;charset=utf-8,${encodeURIComponent(svg)}`;
|
||||
|
|
|
|||
|
|
@ -14,7 +14,6 @@ pub const MAX_CELLS_PER_REQUEST: usize = 200000;
|
|||
pub const MAX_POIS_PER_REQUEST: usize = 3000;
|
||||
|
||||
pub const PROPERTIES_LIMIT: usize = 100;
|
||||
pub const ACTUAL_LISTINGS_LIMIT: usize = 500;
|
||||
pub const PLACES_LIMIT: usize = 20;
|
||||
pub const PRICE_HISTORY_POINTS_LIMIT: usize = 5000;
|
||||
pub const POSTCODE_SEARCH_OFFSET: f64 = 0.02;
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ use serde::{Deserialize, Serialize};
|
|||
use tracing::info;
|
||||
|
||||
use crate::api_error::ApiError;
|
||||
use crate::consts::ACTUAL_LISTINGS_LIMIT;
|
||||
use crate::data::ActualListing;
|
||||
use crate::features::property_level_feature_names;
|
||||
use crate::parsing::{
|
||||
|
|
@ -41,7 +40,6 @@ pub async fn get_actual_listings(
|
|||
Query(params): Query<ActualListingsParams>,
|
||||
) -> Result<Json<ActualListingsResponse>, ApiError> {
|
||||
let state = shared.load_state();
|
||||
let limit = ACTUAL_LISTINGS_LIMIT;
|
||||
let offset = params.offset.unwrap_or(0);
|
||||
let Some(actual_listings) = state.actual_listings.clone() else {
|
||||
return Ok(Json(ActualListingsResponse {
|
||||
|
|
@ -132,11 +130,9 @@ pub async fn get_actual_listings(
|
|||
})
|
||||
});
|
||||
|
||||
let truncated = total_matching > offset.saturating_add(limit);
|
||||
let listings: Vec<ActualListing> = matching_rows
|
||||
.iter()
|
||||
.skip(offset)
|
||||
.take(limit)
|
||||
.map(|&row| actual_listings.listing_at(row))
|
||||
.collect();
|
||||
|
||||
|
|
@ -155,7 +151,7 @@ pub async fn get_actual_listings(
|
|||
listings,
|
||||
total: total_matching,
|
||||
offset,
|
||||
truncated,
|
||||
truncated: false,
|
||||
})
|
||||
})
|
||||
.await
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue