# perfect-postcode/finder/rightmove.py

import json
import logging
import re
import time

import httpx

from constants import (
    PAGE_SIZE,
    DELAY_BETWEEN_PAGES,
    RIGHTMOVE_DETAIL_URL,
    RIGHTMOVE_FETCH_DETAILS,
    RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
    SEARCH_URL,
    TYPEAHEAD_URL,
)
from http_client import fetch_with_retry
from spatial import PostcodeSpatialIndex
from transform import extract_full_postcode, normalize_postcode, transform_property

# Module-level logger; every message from this scraper is emitted under the
# "rightmove" logger name.
log = logging.getLogger("rightmove")

# Outcode ID cache (Rightmove typeahead → internal ID). Filled by
# resolve_outcode_id on a successful lookup only, so a miss is looked up again
# on the next call.
outcode_cache: dict[str, str] = {}

# Rightmove hard-caps pagination at index 1008 (42 pages × 24 results).
# Requesting index >= 1008 returns HTTP 400, so _paginate stops as soon as the
# next page index would reach this value.
_MAX_INDEX = 1008


# ---------------------------------------------------------------------------
# Detail-page postcode extraction
# ---------------------------------------------------------------------------
#
# The search API (_paginate) only returns an outcode-level `displayAddress`
# (e.g. "Akerman Road, Brixton, London, SW9") — never the full postcode. Each
# listing's detail page, however, embeds the property's OWN full postcode in a
# `window.__PAGE_MODEL` script as `propertyData.address.{outcode, incode}`
# (e.g. outcode "SW9" + incode "0HD" → "SW9 0HD"), independently corroborated by
# `propertyData.propertyUrls.similarPropertiesUrl` ("/property-for-sale/SW9-0HD.html").
# This is the property's own postcode, NOT a nearest station/school: the
# `nearestStations`/`nearestAirports` arrays carry only names + distances, no
# postcodes, and the address outcode always matches the searched outcode.
# Recon over 24 live listings across SW9/E1/M1/LS6/E20 (incl. APPROXIMATE_POINT
# new-builds) found the full postcode present 100% of the time. There is no
# UPRN or house-number field anywhere in propertyData, so those stay None.
#
# __PAGE_MODEL is a "devalue"-style flattened object graph: its `data` field is
# a JSON STRING holding a flat array where every integer inside a container is
# an index reference into that same array (so the graph can dedupe). We
# brace-match the (large, deeply-nested) object literal — a non-greedy regex
# cannot — then rehydrate the reference graph before reading the address.

_PAGE_MODEL_RE = re.compile(r"window\.__PAGE_MODEL\s*=\s*")


def _extract_page_model_literal(html: str) -> str | None:
    """Return the `{...}` object literal assigned to window.__PAGE_MODEL.

    Brace-matches with string/escape awareness so embedded braces and quotes in
    string values don't end the match early. Returns None when absent."""
    marker = _PAGE_MODEL_RE.search(html)
    if not marker:
        return None
    start = marker.end()
    if start >= len(html) or html[start] != "{":
        return None
    depth = 0
    in_str = False
    esc = False
    for j in range(start, len(html)):
        ch = html[j]
        if in_str:
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == '"':
                in_str = False
        elif ch == '"':
            in_str = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return html[start : j + 1]
    return None


def _rehydrate(flat: list) -> object:
    """Resolve a devalue-style flattened reference array into a nested object.

    Index 0 is the root; every int inside a dict/list is an index back into
    ``flat``. Memoised so shared/cyclic references resolve once."""
    cache: dict[int, object] = {}

    def resolve(idx: int) -> object:
        if not isinstance(idx, int) or idx < 0 or idx >= len(flat):
            return None
        if idx in cache:
            return cache[idx]
        node = flat[idx]
        if isinstance(node, dict):
            out: dict = {}
            cache[idx] = out
            for key, value in node.items():
                out[key] = resolve(value) if isinstance(value, int) else value
            return out
        if isinstance(node, list):
            arr: list = []
            cache[idx] = arr
            for value in node:
                arr.append(resolve(value) if isinstance(value, int) else value)
            return arr
        cache[idx] = node
        return node

    return resolve(0)


def parse_detail_postcode(html: str) -> str | None:
    """Extract a Rightmove property's TRUE full postcode from its detail HTML.

    Pure and network-free so it is unit-testable: callers pass the page HTML.
    Reads ``propertyData.address.outcode`` + ``.incode`` from window.__PAGE_MODEL
    and returns a normalised full postcode (e.g. "SW9 0HD"), or None when the
    page has no parseable address (the property location wrapper can be empty —
    the caller then keeps the coordinate fallback). The returned outcode is
    re-validated against the joined postcode so a malformed incode is dropped.
    """
    if not html:
        return None
    literal = _extract_page_model_literal(html)
    if not literal:
        return None
    try:
        outer = json.loads(literal)
        flat = json.loads(outer["data"])
    except (ValueError, KeyError, TypeError):
        return None
    if not isinstance(flat, list) or not flat:
        return None

    root = _rehydrate(flat)
    if not isinstance(root, dict):
        return None
    property_data = root.get("propertyData")
    if not isinstance(property_data, dict):
        return None
    address = property_data.get("address")
    if not isinstance(address, dict):
        return None

    outcode = address.get("outcode")
    incode = address.get("incode")
    if not isinstance(outcode, str) or not isinstance(incode, str):
        return None
    outcode, incode = outcode.strip(), incode.strip()
    if not outcode or not incode:
        return None

    # Round-trip through the shared postcode validator/normaliser: this both
    # canonicalises spacing and rejects an outcode/incode pair that doesn't form
    # a structurally-valid UK postcode.
    return extract_full_postcode(normalize_postcode(f"{outcode} {incode}"))


# Per-run memo: listingId -> full postcode. A None entry records a lookup that
# produced nothing (HTTP error, non-200, unparseable page), so a broken or
# repeated listing costs at most one GET per run. The same listing can
# reappear across overlapping outcode searches.
_detail_postcode_cache: dict[str, str | None] = {}


def _fetch_detail_postcode(client: httpx.Client, property_id: str) -> str | None:
    """Fetch one listing's detail page and return its full postcode, or None.

    A falsy ``property_id`` returns None without touching the cache or the
    network. Otherwise the outcome is served from ``_detail_postcode_cache``
    when present; on a miss the detail URL is requested once with the given
    client, a 200 response is parsed with ``parse_detail_postcode``, and the
    result is cached, including None for a non-200 status or an
    ``httpx.HTTPError``. Failures are logged at debug level only, so callers
    simply fall back to the coordinate-derived postcode."""
    if not property_id:
        return None
    if property_id in _detail_postcode_cache:
        return _detail_postcode_cache[property_id]

    detail_url = RIGHTMOVE_DETAIL_URL.format(id=property_id)
    found: str | None = None
    try:
        response = client.get(detail_url, headers={"Accept": "text/html"})
        if response.status_code != 200:
            log.debug(
                "Rightmove detail %s returned HTTP %d",
                detail_url,
                response.status_code,
            )
        else:
            found = parse_detail_postcode(response.text)
    except httpx.HTTPError as exc:
        log.debug("Rightmove detail fetch failed %s: %s", detail_url, exc)

    # Remember the outcome either way so this listing is never re-fetched.
    _detail_postcode_cache[property_id] = found
    return found


def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Return Rightmove's internal ID for *outcode*, using the typeahead API.

    Served from ``outcode_cache`` when already known. Otherwise the typeahead
    endpoint is queried and the first match whose ``type`` is ``"OUTCODE"``
    and whose ``displayName`` equals *outcode* exactly is taken; its ``id`` is
    stringified, cached and returned. Returns None (uncached) when the request
    yields nothing or no match qualifies."""
    try:
        return outcode_cache[outcode]
    except KeyError:
        pass

    payload = fetch_with_retry(
        client, TYPEAHEAD_URL, {"query": outcode, "limit": "10", "exclude": "STREET"}
    )
    if not payload:
        return None

    hit = next(
        (
            candidate
            for candidate in payload.get("matches", [])
            if candidate.get("type") == "OUTCODE"
            and candidate.get("displayName") == outcode
        ),
        None,
    )
    if hit is None:
        log.debug("Outcode %s not found in typeahead results", outcode)
        return None

    resolved = str(hit["id"])
    outcode_cache[outcode] = resolved
    return resolved


def _detail_postcode_for(
    client: httpx.Client,
    prop: dict,
    fetch_details: bool,
    detail_budget: dict,
) -> str | None:
    """Return a listing's detail-page postcode, within the per-outcode budget.

    Returns None straight away when ``fetch_details`` is off or the listing
    has no usable ``id``. A listing already in ``_detail_postcode_cache`` is
    answered from the cache without spending budget or making a request.
    Otherwise one slot of ``detail_budget['remaining']`` is consumed (None is
    returned when none are left), the detail page is fetched, and the call
    pauses for ``DELAY_BETWEEN_PAGES`` before returning."""
    listing_id = str(prop.get("id") or "") if fetch_details else ""
    if not listing_id:
        return None
    if listing_id in _detail_postcode_cache:
        return _detail_postcode_cache[listing_id]

    remaining = detail_budget["remaining"]
    if remaining <= 0:
        return None
    # Spend the slot before fetching: a failed fetch still counts.
    detail_budget["remaining"] = remaining - 1

    postcode = _fetch_detail_postcode(client, listing_id)
    time.sleep(DELAY_BETWEEN_PAGES)
    return postcode


def _parse_result_count(raw: object, fallback: int) -> int:
    """Parse a search response's ``resultCount`` (e.g. ``"1,234"``) into an int.

    Accepts strings with thousands separators as well as plain ints. Returns
    ``fallback`` when the value cannot be parsed (null, unexpected text)."""
    try:
        return int(str(raw).replace(",", ""))
    except ValueError:
        return fallback


def _paginate(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
    fetch_details: bool = False,
    detail_cap: int = 0,
) -> tuple[list[dict], int]:
    """Paginate through search results. Returns (properties, result_count).

    Pages are requested ``PAGE_SIZE`` indices apart until a request fails, a
    page is empty, the reported total is reached, ``_MAX_INDEX`` is hit, or
    ``max_properties`` listings have been collected.

    When ``fetch_details`` is set, up to ``detail_cap`` listings per outcode
    have their detail page fetched for the property's full postcode (see
    ``parse_detail_postcode``); the rest fall back to coordinate-derived
    postcodes. A detail lookup that raises is logged and treated the same way
    (no detail postcode) instead of discarding the listing.

    Args:
        client: HTTP client used for search and detail requests.
        outcode_id: Rightmove's internal identifier for the outcode.
        outcode: The outcode being searched (used for transform and logging).
        channel_cfg: Must provide ``sortType``, ``channel`` and
            ``transactionType``.
        pc_index: Spatial index handed through to ``transform_property``.
        max_properties: Stop as soon as this many listings are collected.
        fetch_details: Enable detail-page postcode lookups.
        detail_cap: Maximum number of fresh detail fetches for this call.

    Returns:
        ``(properties, result_count)`` where ``result_count`` is the total
        reported by the most recent non-empty page (0 if none was fetched).
    """
    properties: list[dict] = []
    index = 0
    result_count = 0
    detail_budget = {"remaining": detail_cap}

    while True:
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }
        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning(
                "Failed to fetch index %d for %s/%s",
                index,
                outcode,
                channel_cfg["channel"],
            )
            break

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        # Read the total before processing listings so that an early
        # max_properties return reports this page's count rather than a stale
        # one (which was 0 on the first page).
        result_count = _parse_result_count(
            data.get("resultCount", "0"), result_count
        )

        for prop in raw_props:
            prop_id = prop.get("id", "?") if isinstance(prop, dict) else "?"

            # A failed detail lookup must not cost us the listing: fall back
            # to the coordinate-derived postcode by passing None.
            detail_postcode = None
            try:
                detail_postcode = _detail_postcode_for(
                    client, prop, fetch_details, detail_budget
                )
            except Exception as exc:
                log.warning(
                    "Rightmove %s/%s property %s detail lookup failed: %s",
                    outcode,
                    channel_cfg["channel"],
                    prop_id,
                    exc,
                )

            try:
                transformed = transform_property(
                    prop, outcode, pc_index, detail_postcode=detail_postcode
                )
            except Exception as exc:
                log.warning(
                    "Rightmove %s/%s property %s failed to transform: %s",
                    outcode,
                    channel_cfg["channel"],
                    prop_id,
                    exc,
                )
                continue
            if transformed:
                properties.append(transformed)
                if max_properties is not None and len(properties) >= max_properties:
                    return properties, result_count

        # Check if there are more pages
        index += PAGE_SIZE

        if index >= result_count:
            break
        if index >= _MAX_INDEX:
            log.warning(
                "%s/%s: %d results exceed Rightmove's %d-result page cap",
                outcode,
                channel_cfg["channel"],
                result_count,
                _MAX_INDEX,
            )
            break

        time.sleep(DELAY_BETWEEN_PAGES)

    return properties, result_count


def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Collect the search results for one outcode+channel.

    Delegates to ``_paginate`` with detail-page postcode lookups switched by
    ``RIGHTMOVE_FETCH_DETAILS`` and capped per outcode by
    ``RIGHTMOVE_MAX_DETAILS_PER_OUTCODE``; listings beyond the cap keep the
    coordinate-derived postcode. The reported result count is discarded, and
    the list is trimmed to ``max_properties`` when that limit was reached."""
    found, _total = _paginate(
        client,
        outcode_id,
        outcode,
        channel_cfg,
        pc_index,
        max_properties=max_properties,
        fetch_details=RIGHTMOVE_FETCH_DETAILS,
        detail_cap=RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
    )

    limit_reached = max_properties is not None and len(found) >= max_properties
    return found[:max_properties] if limit_reached else found