"""Rightmove outcode lookup and paginated search helpers."""

import logging
import time

import httpx

from constants import (
    PAGE_SIZE,
    DELAY_BETWEEN_PAGES,
    SEARCH_URL,
    TYPEAHEAD_URL,
)
from http_client import fetch_with_retry
from spatial import PostcodeSpatialIndex
from transform import transform_property

log = logging.getLogger("rightmove")

# Outcode ID cache (Rightmove typeahead → internal ID)
outcode_cache: dict[str, str] = {}

# Rightmove hard-caps pagination at index 1008 (42 pages × 24 results).
# Requesting index >= 1008 returns HTTP 400.
_MAX_INDEX = 1008

# Property type filters for splitting overcapped searches. Each sub-query
# gets its own 1008 cap, so we can recover listings beyond the unfiltered limit.
_PROPERTY_TYPES = [
    "detached",
    "semi-detached",
    "terraced",
    "flat",
    "bungalow",
    "park-home",
    "land",
]


def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Look up Rightmove's internal ID for an outcode via the typeahead API.

    Successful lookups are memoised in the module-level ``outcode_cache``;
    failed lookups are not cached, so they are retried on the next call.

    Args:
        client: HTTP client handed through to ``fetch_with_retry``.
        outcode: Outcode to resolve. It must equal the typeahead match's
            ``displayName`` exactly (the comparison is case-sensitive).

    Returns:
        The ID as a string, or ``None`` when the request yields no data or
        no match of type ``OUTCODE`` has that display name.
    """
    cached = outcode_cache.get(outcode)
    if cached is not None:
        return cached

    data = fetch_with_retry(
        client, TYPEAHEAD_URL, {"query": outcode, "limit": "10", "exclude": "STREET"}
    )
    if not data:
        return None

    for match in data.get("matches", []):
        if match.get("type") != "OUTCODE" or match.get("displayName") != outcode:
            continue
        rid = str(match["id"])
        outcode_cache[outcode] = rid
        return rid

    log.debug("Outcode %s not found in typeahead results", outcode)
    return None


def _parse_result_count(raw: object) -> int:
    """Convert a ``resultCount`` value (e.g. ``"1,234"``) to an int.

    Integers are accepted as-is; anything else is stringified and has its
    thousands separators stripped before conversion.
    """
    if isinstance(raw, int):
        return raw
    return int(str(raw).replace(",", ""))


def _paginate(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    extra_params: dict | None = None,
    max_properties: int | None = None,
) -> tuple[list[dict], int]:
    """Paginate through search results for one query.

    Pages are requested ``PAGE_SIZE`` apart, sleeping ``DELAY_BETWEEN_PAGES``
    between requests, until one of the following happens: a fetch yields no
    data, a page has no properties, the reported result count is reached,
    the ``_MAX_INDEX`` pagination cap is reached, or ``max_properties``
    transformed properties have been collected.

    Args:
        client: HTTP client handed through to ``fetch_with_retry``.
        outcode_id: Rightmove's internal ID for the outcode.
        outcode: Outcode, passed to ``transform_property`` and used in logs.
        channel_cfg: Must provide ``sortType``, ``channel`` and
            ``transactionType``.
        pc_index: Spatial index passed to ``transform_property``.
        extra_params: Extra query parameters merged over the defaults.
        max_properties: Stop as soon as this many properties are collected.

    Returns:
        ``(properties, result_count)`` where ``result_count`` is the total
        reported by the most recent non-empty page (0 if there was none).
    """
    properties: list[dict] = []
    index = 0
    result_count = 0

    while True:
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }
        if extra_params:
            params.update(extra_params)

        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning(
                "Failed to fetch index %d for %s/%s",
                index,
                outcode,
                channel_cfg["channel"],
            )
            break

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        # Parse the total before collecting, so that the early return below
        # still reports the count from the page that triggered it.
        result_count = _parse_result_count(data.get("resultCount", "0"))

        for prop in raw_props:
            transformed = transform_property(prop, outcode, pc_index)
            if transformed:
                properties.append(transformed)
            if max_properties is not None and len(properties) >= max_properties:
                return properties, result_count

        # Check if there are more pages. Stop at the hard cap as well:
        # an index >= _MAX_INDEX is rejected, so requesting it would only
        # waste a request and log a misleading fetch failure.
        index += PAGE_SIZE
        if index >= result_count or index >= _MAX_INDEX:
            break

        time.sleep(DELAY_BETWEEN_PAGES)

    return properties, result_count


def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Paginate through search results for one outcode+channel.

    When the unfiltered result count exceeds 1008 (Rightmove's hard
    pagination cap), re-queries per property type and merges the results by
    property ``id`` to recover listings beyond the cap.

    Args:
        client: HTTP client handed through to ``fetch_with_retry``.
        outcode_id: Rightmove's internal ID for the outcode.
        outcode: Outcode, used for transformation and logging.
        channel_cfg: Must provide ``sortType``, ``channel`` and
            ``transactionType``.
        pc_index: Spatial index passed to ``transform_property``.
        max_properties: Optional upper bound on the number returned.

    Returns:
        Transformed properties, at most ``max_properties`` when it is given.
    """
    properties, result_count = _paginate(
        client, outcode_id, outcode, channel_cfg, pc_index, max_properties=max_properties
    )

    if max_properties is not None and len(properties) >= max_properties:
        return properties[:max_properties]

    if result_count <= _MAX_INDEX:
        return properties

    # Hit the 1008 cap — re-search per property type to get full coverage
    ch = channel_cfg["channel"]
    log.info(
        "%s/%s: %d results exceed %d cap, splitting by property type",
        outcode,
        ch,
        result_count,
        _MAX_INDEX,
    )

    # Keyed by id so listings seen in more than one query are kept once.
    all_by_id: dict[str, dict] = {p["id"]: p for p in properties}

    for pt in _PROPERTY_TYPES:
        pt_props, pt_count = _paginate(
            client,
            outcode_id,
            outcode,
            channel_cfg,
            pc_index,
            extra_params={"propertyTypes": pt},
            max_properties=max_properties,
        )

        if pt_count > _MAX_INDEX:
            # A single type can overflow the cap too; make the loss visible.
            log.warning(
                "%s/%s type=%s: %d results still exceed %d cap, "
                "some listings will be missing",
                outcode,
                ch,
                pt,
                pt_count,
                _MAX_INDEX,
            )

        new = 0
        for p in pt_props:
            if p["id"] not in all_by_id:
                all_by_id[p["id"]] = p
                new += 1
            if max_properties is not None and len(all_by_id) >= max_properties:
                break

        if new:
            log.debug("%s/%s type=%s: +%d new properties", outcode, ch, pt, new)

        if max_properties is not None and len(all_by_id) >= max_properties:
            break

    log.info(
        "%s/%s: type split recovered %d → %d properties",
        outcode,
        ch,
        len(properties),
        len(all_by_id),
    )

    properties = list(all_by_id.values())
    if max_properties is not None:
        return properties[:max_properties]
    return properties