363 lines
12 KiB
Python
363 lines
12 KiB
Python
import json
|
||
import logging
|
||
import re
|
||
import time
|
||
|
||
import httpx
|
||
|
||
from constants import (
|
||
PAGE_SIZE,
|
||
DELAY_BETWEEN_PAGES,
|
||
RIGHTMOVE_DETAIL_URL,
|
||
RIGHTMOVE_FETCH_DETAILS,
|
||
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
|
||
SEARCH_URL,
|
||
TYPEAHEAD_URL,
|
||
)
|
||
from http_client import fetch_with_retry
|
||
from spatial import PostcodeSpatialIndex
|
||
from transform import extract_full_postcode, normalize_postcode, transform_property
|
||
|
||
# Module logger, registered under the fixed name "rightmove".
log = logging.getLogger("rightmove")

# Outcode ID cache (Rightmove typeahead → internal ID).
# Keyed by the outcode string passed to resolve_outcode_id. Only successful
# lookups are stored, so an outcode that was not found is looked up again on
# the next call.
outcode_cache: dict[str, str] = {}

# Rightmove hard-caps pagination at index 1008 (42 pages × 24 results).
# Requesting index >= 1008 returns HTTP 400, so pagination stops before
# issuing a request at or beyond this index.
_MAX_INDEX = 1008
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Detail-page postcode extraction
|
||
# ---------------------------------------------------------------------------
|
||
#
|
||
# The search API (_paginate) only returns an outcode-level `displayAddress`
|
||
# (e.g. "Akerman Road, Brixton, London, SW9") — never the full postcode. Each
|
||
# listing's detail page, however, embeds the property's OWN full postcode in a
|
||
# `window.__PAGE_MODEL` script as `propertyData.address.{outcode, incode}`
|
||
# (e.g. outcode "SW9" + incode "0HD" → "SW9 0HD"), independently corroborated by
|
||
# `propertyData.propertyUrls.similarPropertiesUrl` ("/property-for-sale/SW9-0HD.html").
|
||
# This is the property's own postcode, NOT a nearest station/school: the
|
||
# `nearestStations`/`nearestAirports` arrays carry only names + distances, no
|
||
# postcodes, and the address outcode always matches the searched outcode.
|
||
# Recon over 24 live listings across SW9/E1/M1/LS6/E20 (incl. APPROXIMATE_POINT
|
||
# new-builds) found the full postcode present 100% of the time. There is no
|
||
# UPRN or house-number field anywhere in propertyData, so those stay None.
|
||
#
|
||
# __PAGE_MODEL is a "devalue"-style flattened object graph: its `data` field is
|
||
# a JSON STRING holding a flat array where every integer inside a container is
|
||
# an index reference into that same array (so the graph can dedupe). We
|
||
# brace-match the (large, deeply-nested) object literal — a non-greedy regex
|
||
# cannot — then rehydrate the reference graph before reading the address.
|
||
|
||
_PAGE_MODEL_RE = re.compile(r"window\.__PAGE_MODEL\s*=\s*")
|
||
|
||
|
||
def _extract_page_model_literal(html: str) -> str | None:
|
||
"""Return the `{...}` object literal assigned to window.__PAGE_MODEL.
|
||
|
||
Brace-matches with string/escape awareness so embedded braces and quotes in
|
||
string values don't end the match early. Returns None when absent."""
|
||
marker = _PAGE_MODEL_RE.search(html)
|
||
if not marker:
|
||
return None
|
||
start = marker.end()
|
||
if start >= len(html) or html[start] != "{":
|
||
return None
|
||
depth = 0
|
||
in_str = False
|
||
esc = False
|
||
for j in range(start, len(html)):
|
||
ch = html[j]
|
||
if in_str:
|
||
if esc:
|
||
esc = False
|
||
elif ch == "\\":
|
||
esc = True
|
||
elif ch == '"':
|
||
in_str = False
|
||
elif ch == '"':
|
||
in_str = True
|
||
elif ch == "{":
|
||
depth += 1
|
||
elif ch == "}":
|
||
depth -= 1
|
||
if depth == 0:
|
||
return html[start : j + 1]
|
||
return None
|
||
|
||
|
||
def _rehydrate(flat: list) -> object:
|
||
"""Resolve a devalue-style flattened reference array into a nested object.
|
||
|
||
Index 0 is the root; every int inside a dict/list is an index back into
|
||
``flat``. Memoised so shared/cyclic references resolve once."""
|
||
cache: dict[int, object] = {}
|
||
|
||
def resolve(idx: int) -> object:
|
||
if not isinstance(idx, int) or idx < 0 or idx >= len(flat):
|
||
return None
|
||
if idx in cache:
|
||
return cache[idx]
|
||
node = flat[idx]
|
||
if isinstance(node, dict):
|
||
out: dict = {}
|
||
cache[idx] = out
|
||
for key, value in node.items():
|
||
out[key] = resolve(value) if isinstance(value, int) else value
|
||
return out
|
||
if isinstance(node, list):
|
||
arr: list = []
|
||
cache[idx] = arr
|
||
for value in node:
|
||
arr.append(resolve(value) if isinstance(value, int) else value)
|
||
return arr
|
||
cache[idx] = node
|
||
return node
|
||
|
||
return resolve(0)
|
||
|
||
|
||
def parse_detail_postcode(html: str) -> str | None:
|
||
"""Extract a Rightmove property's TRUE full postcode from its detail HTML.
|
||
|
||
Pure and network-free so it is unit-testable: callers pass the page HTML.
|
||
Reads ``propertyData.address.outcode`` + ``.incode`` from window.__PAGE_MODEL
|
||
and returns a normalised full postcode (e.g. "SW9 0HD"), or None when the
|
||
page has no parseable address (the property location wrapper can be empty —
|
||
the caller then keeps the coordinate fallback). The returned outcode is
|
||
re-validated against the joined postcode so a malformed incode is dropped.
|
||
"""
|
||
if not html:
|
||
return None
|
||
literal = _extract_page_model_literal(html)
|
||
if not literal:
|
||
return None
|
||
try:
|
||
outer = json.loads(literal)
|
||
flat = json.loads(outer["data"])
|
||
except (ValueError, KeyError, TypeError):
|
||
return None
|
||
if not isinstance(flat, list) or not flat:
|
||
return None
|
||
|
||
root = _rehydrate(flat)
|
||
if not isinstance(root, dict):
|
||
return None
|
||
property_data = root.get("propertyData")
|
||
if not isinstance(property_data, dict):
|
||
return None
|
||
address = property_data.get("address")
|
||
if not isinstance(address, dict):
|
||
return None
|
||
|
||
outcode = address.get("outcode")
|
||
incode = address.get("incode")
|
||
if not isinstance(outcode, str) or not isinstance(incode, str):
|
||
return None
|
||
outcode, incode = outcode.strip(), incode.strip()
|
||
if not outcode or not incode:
|
||
return None
|
||
|
||
# Round-trip through the shared postcode validator/normaliser: this both
|
||
# canonicalises spacing and rejects an outcode/incode pair that doesn't form
|
||
# a structurally-valid UK postcode.
|
||
return extract_full_postcode(normalize_postcode(f"{outcode} {incode}"))
|
||
|
||
|
||
# Per-run memo: listing id -> full postcode, or None when it could not be
# obtained. Misses are stored as well, so a broken or repeated listing (the
# same listing can surface in overlapping outcode searches) costs one GET at
# most.
_detail_postcode_cache: dict[str, str | None] = {}


def _fetch_detail_postcode(client: httpx.Client, property_id: str) -> str | None:
    """Download one listing's detail page and return its full postcode.

    The outcome is memoised in ``_detail_postcode_cache`` under
    ``property_id``, whether or not a postcode was found. A non-200 response
    or an ``httpx.HTTPError`` is logged at debug level and yields None, which
    lets the caller fall back to the coordinate-derived postcode.

    Returns None immediately (without caching) for an empty ``property_id``.
    """
    if not property_id:
        return None
    try:
        return _detail_postcode_cache[property_id]
    except KeyError:
        pass  # not fetched yet in this run

    detail_url = RIGHTMOVE_DETAIL_URL.format(id=property_id)
    found: str | None = None
    try:
        response = client.get(detail_url, headers={"Accept": "text/html"})
        if response.status_code != 200:
            log.debug(
                "Rightmove detail %s returned HTTP %d", detail_url, response.status_code
            )
        else:
            found = parse_detail_postcode(response.text)
    except httpx.HTTPError as exc:
        log.debug("Rightmove detail fetch failed %s: %s", detail_url, exc)

    _detail_postcode_cache[property_id] = found
    return found
|
||
|
||
|
||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Map an outcode to Rightmove's internal location ID via the typeahead API.

    Successful lookups are memoised in ``outcode_cache``. Returns None when
    the request yields no data or when no typeahead match has type
    ``OUTCODE`` and a ``displayName`` equal to ``outcode``.
    """
    if outcode in outcode_cache:
        return outcode_cache[outcode]

    query = {"query": outcode, "limit": "10", "exclude": "STREET"}
    payload = fetch_with_retry(client, TYPEAHEAD_URL, query)
    if not payload:
        return None

    # The first exact OUTCODE match wins.
    hit = next(
        (
            candidate
            for candidate in payload.get("matches", [])
            if candidate.get("type") == "OUTCODE"
            and candidate.get("displayName") == outcode
        ),
        None,
    )
    if hit is None:
        log.debug("Outcode %s not found in typeahead results", outcode)
        return None

    location_id = str(hit["id"])
    outcode_cache[outcode] = location_id
    return location_id
|
||
|
||
|
||
def _detail_postcode_for(
    client: httpx.Client,
    prop: dict,
    fetch_details: bool,
    detail_budget: dict,
) -> str | None:
    """Return a listing's detail-page postcode within the per-outcode budget.

    An id already present in ``_detail_postcode_cache`` is answered from the
    cache and uses no budget. Otherwise a detail fetch happens only while
    ``detail_budget['remaining']`` is positive; the counter is decremented
    before the fetch, and the call sleeps ``DELAY_BETWEEN_PAGES`` after it.

    Returns None when ``fetch_details`` is false, the listing has no id, or
    the budget is exhausted.
    """
    listing_id = str(prop.get("id") or "") if fetch_details else ""
    if not listing_id:
        return None

    try:
        return _detail_postcode_cache[listing_id]
    except KeyError:
        pass  # not cached: a fresh fetch is needed

    slots_left = detail_budget["remaining"]
    if slots_left <= 0:
        return None
    detail_budget["remaining"] = slots_left - 1

    result = _fetch_detail_postcode(client, listing_id)
    time.sleep(DELAY_BETWEEN_PAGES)
    return result
|
||
|
||
|
||
def _parse_result_count(raw: object) -> int:
    """Coerce the search API's ``resultCount`` value into an int.

    Accepts a comma-grouped string ("1,234") or an int. Anything unparseable
    (e.g. null) is logged and treated as 0, which ends pagination after the
    current page instead of raising.
    """
    try:
        return int(str(raw).replace(",", "").strip())
    except ValueError:
        log.warning("Rightmove returned an unparseable resultCount: %r", raw)
        return 0


def _paginate(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
    fetch_details: bool = False,
    detail_cap: int = 0,
) -> tuple[list[dict], int]:
    """Paginate through search results. Returns (properties, result_count).

    Pages are requested in steps of ``PAGE_SIZE`` until a fetch fails, a page
    has no properties, the index reaches the reported result count, or the
    index reaches ``_MAX_INDEX``. Listings that raise during transformation
    are logged and skipped.

    When ``fetch_details`` is set, up to ``detail_cap`` listings per call have
    their detail page fetched for the property's full postcode (see
    ``parse_detail_postcode``); the rest are transformed with
    ``detail_postcode=None``.

    Args:
        client: HTTP client used for the search and detail requests.
        outcode_id: Rightmove's internal ID for the outcode.
        outcode: The outcode string, passed to ``transform_property`` and
            used in log messages.
        channel_cfg: Must provide ``sortType``, ``channel`` and
            ``transactionType``.
        pc_index: Spatial index forwarded to ``transform_property``.
        max_properties: Stop as soon as this many properties are collected.
        fetch_details: Enable detail-page postcode lookups.
        detail_cap: Maximum number of fresh detail fetches for this call.

    Returns:
        The transformed properties and the result count reported by the most
        recent non-empty page (0 if none was received).
    """
    properties: list[dict] = []
    index = 0
    result_count = 0
    detail_budget = {"remaining": detail_cap}

    while True:
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }
        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning(
                "Failed to fetch index %d for %s/%s",
                index,
                outcode,
                channel_cfg["channel"],
            )
            break

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        # Read the total BEFORE processing the page: the max_properties early
        # return below would otherwise report a stale count (0 on page one).
        result_count = _parse_result_count(data.get("resultCount", "0"))

        for prop in raw_props:
            try:
                detail_postcode = _detail_postcode_for(
                    client, prop, fetch_details, detail_budget
                )
                transformed = transform_property(
                    prop, outcode, pc_index, detail_postcode=detail_postcode
                )
            except Exception as exc:
                # Best-effort per listing: one bad record must not abort the
                # outcode. Guard the id lookup so a non-dict entry cannot make
                # the handler itself raise.
                log.warning(
                    "Rightmove %s/%s property %s failed to transform: %s",
                    outcode,
                    channel_cfg["channel"],
                    prop.get("id", "?") if isinstance(prop, dict) else "?",
                    exc,
                )
                continue
            if transformed:
                properties.append(transformed)
                if max_properties is not None and len(properties) >= max_properties:
                    return properties, result_count

        # Check if there are more pages
        index += PAGE_SIZE

        if index >= result_count:
            break
        if index >= _MAX_INDEX:
            log.warning(
                "%s/%s: %d results exceed Rightmove's %d-result page cap",
                outcode,
                channel_cfg["channel"],
                result_count,
                _MAX_INDEX,
            )
            break

        time.sleep(DELAY_BETWEEN_PAGES)

    return properties, result_count
|
||
|
||
|
||
def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Collect the transformed listings for one outcode and channel.

    Delegates to ``_paginate`` with detail-page postcode lookups switched by
    ``RIGHTMOVE_FETCH_DETAILS`` and limited per call by
    ``RIGHTMOVE_MAX_DETAILS_PER_OUTCODE``; listings beyond that limit keep
    the coordinate-derived postcode. The result is truncated to
    ``max_properties`` when that limit is given and reached.
    """
    found, _total = _paginate(
        client,
        outcode_id,
        outcode,
        channel_cfg,
        pc_index,
        max_properties=max_properties,
        fetch_details=RIGHTMOVE_FETCH_DETAILS,
        detail_cap=RIGHTMOVE_MAX_DETAILS_PER_OUTCODE,
    )

    limit_reached = max_properties is not None and len(found) >= max_properties
    return found[:max_properties] if limit_reached else found
|