perfect-postcode/finder/rightmove.py

363 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import logging
import re
import time
import httpx
from constants import (
PAGE_SIZE,
DELAY_BETWEEN_PAGES,
RIGHTMOVE_DETAIL_URL,
RIGHTMOVE_FETCH_DETAILS,
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
SEARCH_URL,
TYPEAHEAD_URL,
)
from http_client import fetch_with_retry
from spatial import PostcodeSpatialIndex
from transform import extract_full_postcode, normalize_postcode, transform_property
# Logger shared by the functions in this module.
log = logging.getLogger("rightmove")
# Outcode ID cache (Rightmove typeahead → internal ID), keyed by the outcode
# string exactly as it is passed to resolve_outcode_id. Only successful
# lookups are stored, so an outcode that was not found is looked up again on
# the next call.
outcode_cache: dict[str, str] = {}
# Rightmove hard-caps pagination at index 1008 (42 pages × 24 results).
# Requesting index >= 1008 returns HTTP 400, so _paginate stops (with a
# warning) once the next page index would reach this value.
_MAX_INDEX = 1008
# ---------------------------------------------------------------------------
# Detail-page postcode extraction
# ---------------------------------------------------------------------------
#
# The search API (_paginate) only returns an outcode-level `displayAddress`
# (e.g. "Akerman Road, Brixton, London, SW9") — never the full postcode. Each
# listing's detail page, however, embeds the property's OWN full postcode in a
# `window.__PAGE_MODEL` script as `propertyData.address.{outcode, incode}`
# (e.g. outcode "SW9" + incode "0HD" → "SW9 0HD"), independently corroborated by
# `propertyData.propertyUrls.similarPropertiesUrl` ("/property-for-sale/SW9-0HD.html").
# This is the property's own postcode, NOT a nearest station/school: the
# `nearestStations`/`nearestAirports` arrays carry only names + distances, no
# postcodes, and the address outcode always matches the searched outcode.
# Recon over 24 live listings across SW9/E1/M1/LS6/E20 (incl. APPROXIMATE_POINT
# new-builds) found the full postcode present 100% of the time. There is no
# UPRN or house-number field anywhere in propertyData, so those stay None.
#
# __PAGE_MODEL is a "devalue"-style flattened object graph: its `data` field is
# a JSON STRING holding a flat array where every integer inside a container is
# an index reference into that same array (so the graph can dedupe). We
# brace-match the (large, deeply-nested) object literal — a non-greedy regex
# cannot — then rehydrate the reference graph before reading the address.
_PAGE_MODEL_RE = re.compile(r"window\.__PAGE_MODEL\s*=\s*")
def _extract_page_model_literal(html: str) -> str | None:
"""Return the `{...}` object literal assigned to window.__PAGE_MODEL.
Brace-matches with string/escape awareness so embedded braces and quotes in
string values don't end the match early. Returns None when absent."""
marker = _PAGE_MODEL_RE.search(html)
if not marker:
return None
start = marker.end()
if start >= len(html) or html[start] != "{":
return None
depth = 0
in_str = False
esc = False
for j in range(start, len(html)):
ch = html[j]
if in_str:
if esc:
esc = False
elif ch == "\\":
esc = True
elif ch == '"':
in_str = False
elif ch == '"':
in_str = True
elif ch == "{":
depth += 1
elif ch == "}":
depth -= 1
if depth == 0:
return html[start : j + 1]
return None
def _rehydrate(flat: list) -> object:
    """Resolve a devalue-style flattened reference array into a nested object.

    Index 0 is the root. Inside every dict/list found in ``flat``, an integer
    is an index back into ``flat``; any other value is kept verbatim. Booleans
    are kept verbatim too: ``bool`` is a subclass of ``int`` in Python, so
    they must be excluded explicitly or ``true``/``false`` would be
    dereferenced as indices 1/0.

    Each index is materialised exactly once, so shared and cyclic references
    resolve to the same object. The walk is iterative, which means the depth
    of the graph is not limited by the interpreter's recursion limit.

    Args:
        flat: The decoded flat array.

    Returns:
        The rehydrated root. References that are negative or out of range
        resolve to None, and an empty ``flat`` yields None.
    """
    size = len(flat)
    if size == 0:
        return None

    def is_ref(value: object) -> bool:
        # True only for genuine integers; excludes True/False.
        return isinstance(value, int) and not isinstance(value, bool)

    # Pass 1: allocate an (empty) container or record the scalar for every
    # index reachable from the root. Allocating before filling is what lets
    # cycles and shared nodes point at one object.
    built: dict[int, object] = {}
    pending = [0]
    while pending:
        idx = pending.pop()
        if idx in built:
            continue
        node = flat[idx]
        if isinstance(node, dict):
            built[idx] = {}
            children = node.values()
        elif isinstance(node, list):
            built[idx] = []
            children = node
        else:
            built[idx] = node
            continue
        pending.extend(
            child
            for child in children
            if is_ref(child) and 0 <= child < size and child not in built
        )

    def deref(value: object) -> object:
        # Every in-range reference was allocated in pass 1; anything else
        # (negative or past the end) falls through to None.
        return built.get(value) if is_ref(value) else value

    # Pass 2: fill the containers, preserving key and element order.
    for idx, target in built.items():
        source = flat[idx]
        if isinstance(source, dict):
            for key, value in source.items():
                target[key] = deref(value)
        elif isinstance(source, list):
            target.extend(deref(value) for value in source)
    return built[0]
def parse_detail_postcode(html: str) -> str | None:
    """Pull a Rightmove listing's own full postcode out of its detail HTML.

    Does no network I/O — the caller supplies the page HTML — so it can be
    unit-tested directly. The postcode is assembled from
    ``propertyData.address.outcode`` and ``.incode`` inside
    ``window.__PAGE_MODEL``.

    Returns the value produced by passing ``"<outcode> <incode>"`` through
    ``normalize_postcode`` and then ``extract_full_postcode`` (e.g.
    "SW9 0HD"), or None when any step fails: empty HTML, no page model,
    undecodable JSON, or a missing/blank/non-string outcode or incode. The
    caller then keeps its coordinate-derived fallback.
    """
    if not html:
        return None
    model_literal = _extract_page_model_literal(html)
    if not model_literal:
        return None

    # The literal is JSON whose "data" member is itself a JSON *string*
    # holding the flat reference array, hence the double decode.
    try:
        flat_graph = json.loads(json.loads(model_literal)["data"])
    except (ValueError, KeyError, TypeError):
        return None
    if not (isinstance(flat_graph, list) and flat_graph):
        return None

    # Walk root -> propertyData -> address, bailing out as soon as a level
    # turns out not to be a mapping.
    node = _rehydrate(flat_graph)
    for step in ("propertyData", "address"):
        if not isinstance(node, dict):
            return None
        node = node.get(step)
    if not isinstance(node, dict):
        return None

    halves = (node.get("outcode"), node.get("incode"))
    if not all(isinstance(half, str) for half in halves):
        return None
    outward, inward = (half.strip() for half in halves)
    if not (outward and inward):
        return None

    # Round-trip through the shared normaliser/extractor: canonicalises the
    # spacing and leaves the final accept/reject decision to those helpers.
    candidate = normalize_postcode(f"{outward} {inward}")
    return extract_full_postcode(candidate)
# Per-run memo of listing id -> true full postcode. A None value records a
# lookup that failed or found nothing, so a broken listing — or one that shows
# up again in an overlapping outcode search — is fetched at most once.
_detail_postcode_cache: dict[str, str | None] = {}


def _fetch_detail_postcode(client: httpx.Client, property_id: str) -> str | None:
    """Fetch one listing's detail page and return its full postcode, if any.

    The outcome is memoised in ``_detail_postcode_cache`` whether or not a
    postcode was found. A non-200 response or an ``httpx.HTTPError`` is
    logged at debug level and recorded as None, leaving the caller on its
    coordinate-derived fallback.

    Args:
        client: HTTP client used for the GET.
        property_id: Rightmove listing id; a falsy id short-circuits to None
            without touching the cache.

    Returns:
        The parsed postcode, or None when unavailable.
    """
    if not property_id:
        return None
    try:
        return _detail_postcode_cache[property_id]
    except KeyError:
        pass  # Not seen yet in this run: fetch it below.

    detail_url = RIGHTMOVE_DETAIL_URL.format(id=property_id)
    found: str | None = None
    try:
        response = client.get(detail_url, headers={"Accept": "text/html"})
        if response.status_code != 200:
            log.debug(
                "Rightmove detail %s returned HTTP %d",
                detail_url,
                response.status_code,
            )
        else:
            found = parse_detail_postcode(response.text)
    except httpx.HTTPError as exc:
        log.debug("Rightmove detail fetch failed %s: %s", detail_url, exc)

    _detail_postcode_cache[property_id] = found
    return found
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Return Rightmove's internal ID for ``outcode`` using the typeahead API.

    A successful lookup is stored in ``outcode_cache`` and served from there
    afterwards. Returns None when the request yields nothing or when no
    OUTCODE match has a ``displayName`` equal to ``outcode``; neither case is
    cached.
    """
    try:
        return outcode_cache[outcode]
    except KeyError:
        pass  # First time we see this outcode: ask the typeahead endpoint.

    query = {"query": outcode, "limit": "10", "exclude": "STREET"}
    payload = fetch_with_retry(client, TYPEAHEAD_URL, query)
    if not payload:
        return None

    # Take the first suggestion that is an outcode with an exact name match.
    hit = next(
        (
            candidate
            for candidate in payload.get("matches", [])
            if candidate.get("type") == "OUTCODE"
            and candidate.get("displayName") == outcode
        ),
        None,
    )
    if hit is None:
        log.debug("Outcode %s not found in typeahead results", outcode)
        return None

    resolved = str(hit["id"])
    outcode_cache[outcode] = resolved
    return resolved
def _detail_postcode_for(
    client: httpx.Client,
    prop: dict,
    fetch_details: bool,
    detail_budget: dict,
) -> str | None:
    """Return a listing's true postcode while respecting the fetch budget.

    Already-cached listings are answered straight from
    ``_detail_postcode_cache`` and cost neither a budget slot nor a request.
    Otherwise a detail page is fetched only while
    ``detail_budget['remaining']`` is positive; each fetch consumes one slot
    and is followed by a ``DELAY_BETWEEN_PAGES`` pause.

    Args:
        client: HTTP client handed through to the detail fetch.
        prop: Raw search-result entry; its ``id`` identifies the listing.
        fetch_details: Master switch; when false nothing is looked up.
        detail_budget: Mutable ``{"remaining": int}`` counter shared by the
            caller across listings.

    Returns:
        The postcode, or None when disabled, un-identifiable, over budget, or
        not found.
    """
    listing_id = str(prop.get("id") or "") if fetch_details else ""
    if not listing_id:
        return None

    try:
        return _detail_postcode_cache[listing_id]
    except KeyError:
        pass  # Not cached: a real request is needed, budget permitting.

    if detail_budget["remaining"] <= 0:
        return None
    detail_budget["remaining"] -= 1

    result = _fetch_detail_postcode(client, listing_id)
    # Pace detail requests the same way search pages are paced.
    time.sleep(DELAY_BETWEEN_PAGES)
    return result
def _paginate(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
    fetch_details: bool = False,
    detail_cap: int = 0,
) -> tuple[list[dict], int]:
    """Paginate through search results. Returns (properties, result_count).

    Pages are requested ``PAGE_SIZE`` listings at a time until a request
    fails, a page comes back empty, the reported result count is exhausted,
    Rightmove's ``_MAX_INDEX`` pagination cap is reached, or
    ``max_properties`` listings have been collected.

    When ``fetch_details`` is set, up to ``detail_cap`` listings per call have
    their detail page fetched for the property's TRUE full postcode (see
    ``parse_detail_postcode``); the rest fall back to coordinate-derived
    postcodes.

    Args:
        client: HTTP client used for search and detail requests.
        outcode_id: Rightmove's internal identifier for the outcode.
        outcode: Human-readable outcode, used for transforms and logging.
        channel_cfg: Supplies the ``sortType``, ``channel`` and
            ``transactionType`` query parameters.
        pc_index: Spatial index handed to ``transform_property``.
        max_properties: Optional cap on the number of listings returned.
        fetch_details: Whether detail pages may be fetched at all.
        detail_cap: Maximum number of fresh detail fetches for this call.

    Returns:
        ``(properties, result_count)`` where ``result_count`` is the total
        reported by the most recent non-empty page (0 if none was read).
    """
    properties: list[dict] = []
    index = 0
    result_count = 0
    detail_budget = {"remaining": detail_cap}
    channel = channel_cfg["channel"]
    while True:
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel,
            "transactionType": channel_cfg["transactionType"],
        }
        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning(
                "Failed to fetch index %d for %s/%s",
                index,
                outcode,
                channel,
            )
            break
        raw_props = data.get("properties", [])
        if not raw_props:
            break

        # Read the total BEFORE processing listings so that an early return on
        # max_properties reports this page's count rather than a stale one (0
        # on the first page). str() lets a numeric resultCount through; a
        # value that still cannot be parsed keeps the previous count, which
        # ends pagination via the exhaustion check below instead of raising.
        raw_count = data.get("resultCount", "0")
        try:
            result_count = int(str(raw_count).replace(",", ""))
        except ValueError:
            log.warning(
                "%s/%s: unparseable resultCount %r at index %d",
                outcode,
                channel,
                raw_count,
                index,
            )

        for prop in raw_props:
            # One bad listing (detail lookup or transform) must not abort the
            # whole outcode, so it is logged and skipped.
            try:
                detail_postcode = _detail_postcode_for(
                    client, prop, fetch_details, detail_budget
                )
                transformed = transform_property(
                    prop, outcode, pc_index, detail_postcode=detail_postcode
                )
            except Exception as exc:
                log.warning(
                    "Rightmove %s/%s property %s failed to transform: %s",
                    outcode,
                    channel,
                    prop.get("id", "?"),
                    exc,
                )
                continue
            if transformed:
                properties.append(transformed)
            if max_properties is not None and len(properties) >= max_properties:
                return properties, result_count

        # Check if there are more pages
        index += PAGE_SIZE
        if index >= result_count:
            break
        if index >= _MAX_INDEX:
            log.warning(
                "%s/%s: %d results exceed Rightmove's %d-result page cap",
                outcode,
                channel,
                result_count,
                _MAX_INDEX,
            )
            break
        time.sleep(DELAY_BETWEEN_PAGES)
    return properties, result_count
def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Collect the listings for one outcode + channel via ``_paginate``.

    Detail-page postcode lookups are switched on by
    ``RIGHTMOVE_FETCH_DETAILS`` and limited per outcode by
    ``RIGHTMOVE_MAX_DETAILS_PER_OUTCODE``; listings past that limit keep
    their coordinate-derived postcode.

    Returns the transformed listings, truncated to ``max_properties`` when
    that cap is given and has been reached.
    """
    found, _total = _paginate(
        client,
        outcode_id,
        outcode,
        channel_cfg,
        pc_index,
        max_properties=max_properties,
        fetch_details=RIGHTMOVE_FETCH_DETAILS,
        detail_cap=RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
    )
    cap_reached = max_properties is not None and len(found) >= max_properties
    return found[:max_properties] if cap_reached else found