diff --git a/finder/rightmove.py b/finder/rightmove.py index c00f5cb..236959b 100644 --- a/finder/rightmove.py +++ b/finder/rightmove.py @@ -18,6 +18,17 @@ log = logging.getLogger("rightmove") # Outcode ID cache (Rightmove typeahead → internal ID) outcode_cache: dict[str, str] = {} +# Rightmove hard-caps pagination at index 1008 (42 pages × 24 results). +# Requesting index >= 1008 returns HTTP 400. +_MAX_INDEX = 1008 + +# Property type filters for splitting overcapped searches. Each sub-query +# gets its own 1008 cap, so we can recover listings beyond the unfiltered limit. +_PROPERTY_TYPES = [ + "detached", "semi-detached", "terraced", "flat", + "bungalow", "park-home", "land", +] + def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None: """Look up Rightmove's internal ID for an outcode via typeahead API.""" @@ -40,16 +51,18 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None: return None -def search_outcode( +def _paginate( client: httpx.Client, outcode_id: str, outcode: str, channel_cfg: dict, pc_index: PostcodeSpatialIndex, -) -> list[dict]: - """Paginate through search results for one outcode+channel. Returns transformed properties.""" + extra_params: dict | None = None, +) -> tuple[list[dict], int]: + """Paginate through search results. Returns (properties, result_count).""" properties = [] index = 0 + result_count = 0 while True: params = { @@ -60,6 +73,8 @@ def search_outcode( "channel": channel_cfg["channel"], "transactionType": channel_cfg["transactionType"], } + if extra_params: + params.update(extra_params) data = fetch_with_retry(client, SEARCH_URL, params) if not data: @@ -90,4 +105,52 @@ def search_outcode( time.sleep(DELAY_BETWEEN_PAGES) - return properties + return properties, result_count + + +def search_outcode( + client: httpx.Client, + outcode_id: str, + outcode: str, + channel_cfg: dict, + pc_index: PostcodeSpatialIndex, +) -> list[dict]: + """Paginate through search results for one outcode+channel. Returns transformed properties. + + When the unfiltered result count exceeds 1008 (Rightmove's hard pagination cap), + re-queries per property type to recover listings beyond the cap. + """ + properties, result_count = _paginate( + client, outcode_id, outcode, channel_cfg, pc_index + ) + + if result_count <= _MAX_INDEX: + return properties + + # Hit the 1008 cap — re-search per property type to get full coverage + ch = channel_cfg["channel"] + log.info( + "%s/%s: %d results exceed %d cap, splitting by property type", + outcode, ch, result_count, _MAX_INDEX, + ) + + all_by_id: dict[str, dict] = {p["id"]: p for p in properties} + + for pt in _PROPERTY_TYPES: + pt_props, _ = _paginate( + client, outcode_id, outcode, channel_cfg, pc_index, + extra_params={"propertyTypes": pt}, + ) + new = 0 + for p in pt_props: + if p["id"] not in all_by_id: + all_by_id[p["id"]] = p + new += 1 + if new: + log.debug("%s/%s type=%s: +%d new properties", outcode, ch, pt, new) + + log.info( + "%s/%s: type split recovered %d → %d properties", + outcode, ch, len(properties), len(all_by_id), + ) + return list(all_by_id.values()) diff --git a/finder/zoopla.py b/finder/zoopla.py index f7a7bec..19d3b31 100644 --- a/finder/zoopla.py +++ b/finder/zoopla.py @@ -39,7 +39,7 @@ class TurnstileError(Exception): # Maximum search result pages to scrape per outcode (25 listings/page) -MAX_PAGES_PER_OUTCODE = 10 +MAX_PAGES_PER_OUTCODE = 40 # JavaScript to extract listings from the rendered DOM. # Uses data-testid attributes as primary selectors (stable across deployments), @@ -98,6 +98,12 @@ _EXTRACT_LISTINGS_JS = r"""() => { if (/leasehold/i.test(text)) tenure = 'Leasehold'; else if (/freehold/i.test(text)) tenure = 'Freehold'; + // Extract property type (e.g., "2 bed flat for sale" → "flat") + let property_type = ''; + const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i); + if (ptMatch) property_type = ptMatch[1].trim(); + else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio'; + results.push({ id, url: href.replace(window.location.origin, ''), price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, @@ -106,7 +112,7 @@ _EXTRACT_LISTINGS_JS = r"""() => { baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null, receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null, floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, - address, tenure, + address, tenure, property_type, }); } @@ -160,6 +166,12 @@ _EXTRACT_LISTINGS_JS = r"""() => { if (/leasehold/i.test(text)) tenure = 'Leasehold'; else if (/freehold/i.test(text)) tenure = 'Freehold'; + // Extract property type + let property_type = ''; + const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i); + if (ptMatch2) property_type = ptMatch2[1].trim(); + else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio'; + results.push({ id, url: href.replace(window.location.origin, ''), price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, @@ -168,7 +180,7 @@ _EXTRACT_LISTINGS_JS = r"""() => { baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null, receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null, floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, - address, tenure, + address, tenure, property_type, }); } } @@ -557,6 +569,32 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]: # --------------------------------------------------------------------------- +# Cached outcode → (postcode, lat, lng) lookups to avoid repeated O(n) scans +# over 2.26M postcodes. Populated lazily on first lookup per outcode. +_outcode_coords_cache: dict[str, tuple[str, float, float] | None] = {} + + +def _resolve_outcode_coords( + outcode: str, pc_coords: dict[str, tuple[float, float]] +) -> tuple[str, float, float] | None: + """Find first postcode + coords for an outcode. Result is cached.""" + if outcode in _outcode_coords_cache: + return _outcode_coords_cache[outcode] + + prefix = outcode + " " + for pcd, (lat, lng) in pc_coords.items(): + if pcd.startswith(prefix) or ( + len(outcode) >= 4 + and pcd.startswith(outcode) + and len(pcd) > len(outcode) + ): + _outcode_coords_cache[outcode] = (pcd, lat, lng) + return (pcd, lat, lng) + + _outcode_coords_cache[outcode] = None + return None + + def _extract_postcode(text: str) -> str | None: """Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'.""" match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE) @@ -585,11 +623,17 @@ def _map_property_type(raw_type: str | None) -> str: """Map Zoopla property type text to canonical type.""" if not raw_type: return "Other" + # Exact match (handles Rightmove-style capitalised values) canonical = PROPERTY_TYPE_MAP.get(raw_type) if canonical: return canonical + # Title-case match (handles regex-extracted lowercase like "town house" → "Town House") + canonical = PROPERTY_TYPE_MAP.get(raw_type.title()) + if canonical: + return canonical + # Keyword fallback lower = raw_type.lower() - if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower: + if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower: return "Flats/Maisonettes" if "detached" in lower and "semi" not in lower: return "Detached" @@ -622,6 +666,7 @@ def transform_property( channel: str, pc_index: PostcodeSpatialIndex, pc_coords: dict[str, tuple[float, float]], + search_outcode: str | None = None, ) -> dict | None: """Transform a raw Zoopla listing dict into the standard output schema. @@ -643,22 +688,18 @@ def transform_property( lat, lng = coords if lat is None: - # Try outcode-level fallback - outcode = _extract_outcode(address) - if outcode: - # ONSPD 7-char format: 4-char outcodes have no space before incode - # (e.g., "BH191AB"), while shorter outcodes do (e.g., "E14 5AB"). - # Check both formats to handle all outcode lengths. - prefix = outcode + " " - for pcd, coords in pc_coords.items(): - if pcd.startswith(prefix) or ( - len(outcode) >= 4 - and pcd.startswith(outcode) - and len(pcd) > len(outcode) - ): - postcode = pcd - lat, lng = coords - break + # Try outcode-level fallback from address text + addr_outcode = _extract_outcode(address) + if addr_outcode: + result = _resolve_outcode_coords(addr_outcode, pc_coords) + if result: + postcode, lat, lng = result + + # Final fallback: use the outcode we know we're searching + if lat is None and search_outcode: + result = _resolve_outcode_coords(search_outcode, pc_coords) + if result: + postcode, lat, lng = result if lat is None or lng is None or not postcode: return None @@ -706,8 +747,8 @@ def transform_property( "Postcode": postcode, "Address per Property Register": address, "Leasehold/Freehold": raw.get("tenure") or None, - "Property type": "Other", # Not reliably extractable from Zoopla search cards - "Property sub-type": "", + "Property type": _map_property_type(raw.get("property_type")), + "Property sub-type": raw.get("property_type") or "", "price": int(price), "price_frequency": frequency, "Price qualifier": "", @@ -774,7 +815,7 @@ def search_outcode( properties = [] dropped = 0 for raw in raw_listings: - transformed = transform_property(raw, channel, pc_index, pc_coords) + transformed = transform_property(raw, channel, pc_index, pc_coords, search_outcode=outcode) if transformed: properties.append(transformed) zoopla_properties_scraped.labels(channel=channel_label).inc()