import json
import logging
import re
import time

import httpx

from constants import (
    PAGE_SIZE,
    DELAY_BETWEEN_PAGES,
    RIGHTMOVE_DETAIL_URL,
    RIGHTMOVE_FETCH_DETAILS,
    RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
    SEARCH_URL,
    TYPEAHEAD_URL,
)
from http_client import fetch_with_retry
from spatial import PostcodeSpatialIndex
from transform import extract_full_postcode, normalize_postcode, transform_property

log = logging.getLogger("rightmove")

# Outcode ID cache (Rightmove typeahead → internal ID). Keyed by the outcode
# string; filled lazily by resolve_outcode_id and kept for the process lifetime.
outcode_cache: dict[str, str] = {}

# Rightmove hard-caps pagination at index 1008 (42 pages × 24 results).
# Requesting index >= 1008 returns HTTP 400.
_MAX_INDEX = 1008

# ---------------------------------------------------------------------------
# Detail-page postcode extraction
# ---------------------------------------------------------------------------
#
# The search API (_paginate) only returns an outcode-level `displayAddress`
# (e.g. "Akerman Road, Brixton, London, SW9") — never the full postcode. Each
# listing's detail page, however, embeds the property's OWN full postcode in a
# `window.__PAGE_MODEL` script as `propertyData.address.{outcode, incode}`
# (e.g. outcode "SW9" + incode "0HD" → "SW9 0HD"), independently corroborated by
# `propertyData.propertyUrls.similarPropertiesUrl` ("/property-for-sale/SW9-0HD.html").
# This is the property's own postcode, NOT a nearest station/school: the
# `nearestStations`/`nearestAirports` arrays carry only names + distances, no
# postcodes, and the address outcode always matches the searched outcode.
# Recon over 24 live listings across SW9/E1/M1/LS6/E20 (incl. APPROXIMATE_POINT
# new-builds) found the full postcode present 100% of the time. There is no
# UPRN or house-number field anywhere in propertyData, so those stay None.
#
# __PAGE_MODEL is a "devalue"-style flattened object graph: its `data` field is
# a JSON STRING holding a flat array where every integer inside a container is
# an index reference into that same array (so the graph can dedupe). We
# brace-match the (large, deeply-nested) object literal — a non-greedy regex
# cannot — then rehydrate the reference graph before reading the address.
_PAGE_MODEL_RE = re.compile(r"window\.__PAGE_MODEL\s*=\s*")


def _extract_page_model_literal(html: str) -> str | None:
    """Return the `{...}` object literal assigned to window.__PAGE_MODEL.

    Brace-matches with string/escape awareness so embedded braces and quotes
    in (double-quoted) string values don't end the match early.

    Returns None when the assignment is absent, when the value does not start
    with ``{``, or when the braces never balance before the end of ``html``.
    """
    marker = _PAGE_MODEL_RE.search(html)
    if not marker:
        return None
    start = marker.end()
    if start >= len(html) or html[start] != "{":
        return None

    depth = 0
    in_str = False  # inside a double-quoted string literal
    esc = False  # previous character was a backslash inside a string
    for j in range(start, len(html)):
        ch = html[j]
        if in_str:
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == '"':
                in_str = False
        elif ch == '"':
            in_str = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return html[start : j + 1]
    return None


def _rehydrate(flat: list) -> object:
    """Resolve a devalue-style flattened reference array into a nested object.

    Index 0 is the root; every int inside a dict/list is an index back into
    ``flat``. Out-of-range (including negative) indices resolve to None and
    non-int values inside a container are kept as-is. Each index is
    materialised once, so shared and cyclic references resolve to the same
    object.

    The graph is walked with an explicit work stack rather than recursion, so
    a long reference chain cannot raise ``RecursionError``.
    """
    size = len(flat)
    cache: dict[int, object] = {}
    pending: list[int] = []  # container indices whose children still need filling

    def materialise(idx: object) -> object:
        # Create (or fetch) the output object for ``idx`` without descending.
        if not isinstance(idx, int) or idx < 0 or idx >= size:
            return None
        if idx in cache:
            return cache[idx]
        node = flat[idx]
        if isinstance(node, dict):
            shell: object = {}
            pending.append(idx)
        elif isinstance(node, list):
            shell = []
            pending.append(idx)
        else:
            shell = node
        # Cache before children are filled so cycles find the same shell.
        cache[idx] = shell
        return shell

    root = materialise(0)
    while pending:
        idx = pending.pop()
        node = flat[idx]
        shell = cache[idx]
        if isinstance(node, dict):
            for key, value in node.items():
                shell[key] = materialise(value) if isinstance(value, int) else value
        else:
            for value in node:
                shell.append(materialise(value) if isinstance(value, int) else value)
    return root


def parse_detail_postcode(html: str) -> str | None:
    """Extract a Rightmove property's TRUE full postcode from its detail HTML.

    Pure and network-free so it is unit-testable: callers pass the page HTML.
    Reads ``propertyData.address.outcode`` + ``.incode`` from
    window.__PAGE_MODEL and passes ``"<outcode> <incode>"`` through
    ``normalize_postcode`` then ``extract_full_postcode``, returning whatever
    that yields (e.g. "SW9 0HD").

    Returns None when the page has no parseable address: empty HTML, no
    __PAGE_MODEL literal, malformed or too deeply nested JSON, or a missing /
    non-string / blank outcode or incode. The caller then keeps the coordinate
    fallback.
    """
    if not html:
        return None
    literal = _extract_page_model_literal(html)
    if not literal:
        return None
    try:
        outer = json.loads(literal)
        flat = json.loads(outer["data"])
    except (ValueError, KeyError, TypeError, RecursionError):
        # RecursionError: json.loads raises it for pathologically nested input.
        return None
    if not isinstance(flat, list) or not flat:
        return None

    root = _rehydrate(flat)
    if not isinstance(root, dict):
        return None
    property_data = root.get("propertyData")
    if not isinstance(property_data, dict):
        return None
    address = property_data.get("address")
    if not isinstance(address, dict):
        return None

    outcode = address.get("outcode")
    incode = address.get("incode")
    if not isinstance(outcode, str) or not isinstance(incode, str):
        return None
    outcode, incode = outcode.strip(), incode.strip()
    if not outcode or not incode:
        return None

    # Round-trip through the shared postcode helpers from `transform`; they are
    # expected to canonicalise spacing and reject a pair that doesn't form a
    # structurally-valid UK postcode (NOTE(review): confirm against transform).
    return extract_full_postcode(normalize_postcode(f"{outcode} {incode}"))


# listingId -> true full postcode (or None when unavailable). Failures are
# cached too, so a broken/duplicate listing is fetched at most once per run (the
# same listing can reappear across overlapping outcode searches).
_detail_postcode_cache: dict[str, str | None] = {}


def _fetch_detail_postcode(client: httpx.Client, property_id: str) -> str | None:
    """GET a listing detail page and return its true full postcode (or None).

    Results (including failures and non-200 responses) are cached by listing
    id. The detail page is a plain HTML GET — no Cloudflare, unlike Zoopla — so
    a single httpx call suffices; an ``httpx.HTTPError`` or non-200 status is
    logged at debug level and yields None, so the caller keeps the coordinate
    fallback.
    """
    if not property_id:
        return None
    if property_id in _detail_postcode_cache:
        return _detail_postcode_cache[property_id]

    postcode: str | None = None
    url = RIGHTMOVE_DETAIL_URL.format(id=property_id)
    try:
        resp = client.get(url, headers={"Accept": "text/html"})
        if resp.status_code == 200:
            postcode = parse_detail_postcode(resp.text)
        else:
            log.debug("Rightmove detail %s returned HTTP %d", url, resp.status_code)
    except httpx.HTTPError as exc:
        log.debug("Rightmove detail fetch failed %s: %s", url, exc)

    _detail_postcode_cache[property_id] = postcode
    return postcode


def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Look up Rightmove's internal ID for an outcode via typeahead API.

    Successful lookups are stored in ``outcode_cache``; misses are not cached.
    Returns None when the request yields no data or no ``OUTCODE`` match whose
    ``displayName`` equals ``outcode`` (and which carries an ``id``) is found.
    """
    if outcode in outcode_cache:
        return outcode_cache[outcode]

    data = fetch_with_retry(
        client, TYPEAHEAD_URL, {"query": outcode, "limit": "10", "exclude": "STREET"}
    )
    if not data:
        return None

    for match in data.get("matches", []):
        if match.get("type") != "OUTCODE" or match.get("displayName") != outcode:
            continue
        raw_id = match.get("id")
        if raw_id is None:
            # A matching entry without an id is unusable; keep looking
            # instead of raising KeyError.
            continue
        rid = str(raw_id)
        outcode_cache[outcode] = rid
        return rid

    log.debug("Outcode %s not found in typeahead results", outcode)
    return None


def _detail_postcode_for(
    client: httpx.Client,
    prop: dict,
    fetch_details: bool,
    detail_budget: dict,
) -> str | None:
    """Look up a listing's true postcode, honouring the per-outcode fetch cap.

    Cached listings are always served (they cost neither a cap slot nor a GET);
    a fresh fetch is made only while ``detail_budget['remaining'] > 0``. Each
    fresh fetch decrements the budget and is followed by a
    ``DELAY_BETWEEN_PAGES`` sleep. Returns None when detail fetching is
    disabled, the listing has no id, or the budget is spent.
    """
    if not fetch_details:
        return None
    property_id = str(prop.get("id") or "")
    if not property_id:
        return None
    if property_id in _detail_postcode_cache:
        return _detail_postcode_cache[property_id]
    if detail_budget["remaining"] <= 0:
        return None

    detail_budget["remaining"] -= 1
    postcode = _fetch_detail_postcode(client, property_id)
    time.sleep(DELAY_BETWEEN_PAGES)
    return postcode


def _parse_result_count(raw: object) -> int | None:
    """Parse the search API's ``resultCount`` (e.g. "1,234") into an int.

    Accepts a comma-grouped string or a plain int; returns None when the value
    cannot be interpreted as an integer.
    """
    if isinstance(raw, int) and not isinstance(raw, bool):
        return raw
    try:
        return int(str(raw).replace(",", ""))
    except ValueError:
        return None


def _paginate(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
    fetch_details: bool = False,
    detail_cap: int = 0,
) -> tuple[list[dict], int]:
    """Paginate through search results. Returns (properties, result_count).

    ``result_count`` is the ``resultCount`` reported by the most recent
    non-empty page (0 if no page was parsed). Pagination stops on a failed
    fetch, an empty page, reaching ``result_count``, reaching Rightmove's
    ``_MAX_INDEX`` cap, or collecting ``max_properties`` listings.

    When ``fetch_details`` is set, up to ``detail_cap`` listings per outcode
    have their detail page fetched for the property's TRUE full postcode (see
    ``parse_detail_postcode``); the rest fall back to coordinate-derived
    postcodes. A listing whose lookup/transform raises is logged and skipped.
    """
    properties: list[dict] = []
    index = 0
    result_count = 0
    detail_budget = {"remaining": detail_cap}

    while True:
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }

        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning(
                "Failed to fetch index %d for %s/%s",
                index,
                outcode,
                channel_cfg["channel"],
            )
            break

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        # Parse the total BEFORE processing the page so the early return on
        # ``max_properties`` reports this page's count rather than a stale one
        # (0 on the first page).
        parsed_count = _parse_result_count(data.get("resultCount", "0"))
        if parsed_count is None:
            log.warning(
                "%s/%s: unparseable resultCount %r",
                outcode,
                channel_cfg["channel"],
                data.get("resultCount"),
            )
        else:
            result_count = parsed_count

        for prop in raw_props:
            try:
                detail_postcode = _detail_postcode_for(
                    client, prop, fetch_details, detail_budget
                )
                transformed = transform_property(
                    prop, outcode, pc_index, detail_postcode=detail_postcode
                )
            except Exception as exc:
                # Best-effort: one bad listing must not abort the outcode.
                log.warning(
                    "Rightmove %s/%s property %s failed to transform: %s",
                    outcode,
                    channel_cfg["channel"],
                    prop.get("id", "?"),
                    exc,
                )
                continue
            if transformed:
                properties.append(transformed)
                if max_properties is not None and len(properties) >= max_properties:
                    return properties, result_count

        # Check if there are more pages
        index += PAGE_SIZE
        if index >= result_count:
            break
        if index >= _MAX_INDEX:
            log.warning(
                "%s/%s: %d results exceed Rightmove's %d-result page cap",
                outcode,
                channel_cfg["channel"],
                result_count,
                _MAX_INDEX,
            )
            break

        time.sleep(DELAY_BETWEEN_PAGES)

    return properties, result_count


def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Paginate through unfiltered sale results for one outcode+channel.

    Each listing's detail page is fetched for the property's TRUE full postcode
    (gated by ``RIGHTMOVE_FETCH_DETAILS`` and capped per outcode by
    ``RIGHTMOVE_MAX_DETAILS_PER_OUTCODE``); listings beyond the cap keep the
    coordinate-derived postcode. The result is truncated to ``max_properties``
    when that limit is given and reached.
    """
    properties, _ = _paginate(
        client,
        outcode_id,
        outcode,
        channel_cfg,
        pc_index,
        max_properties=max_properties,
        fetch_details=RIGHTMOVE_FETCH_DETAILS,
        detail_cap=RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
    )
    if max_properties is not None and len(properties) >= max_properties:
        return properties[:max_properties]
    return properties