perfect-postcode/finder/onthemarket.py

499 lines
18 KiB
Python

"""OnTheMarket (onthemarket.com) scraper — sale properties.
OnTheMarket serves a Next.js app with the full search-results payload embedded
as JSON in a `__NEXT_DATA__` script tag. No JS execution or browser needed:
plain HTTP with a Chrome-ish User-Agent is enough.
Each rendered page contains 30 listings under
`props.initialReduxState.results.list`, each with `location.{lat,lon}`,
`bedrooms`, `bathrooms`, `price` (formatted £-string), `address`,
`humanised-property-type`, `features` (a list where the first element is
typically `"Tenure: <value>"`), and `details-url`. Pagination is via
`?page=N`; the loop terminates when `paginationControls.next` is null.
Postcodes
---------
The search card exposes only an *outcode*-level address (e.g. "Padfield Road,
London, SE5") and a map pin, so the old behaviour derived the postcode from the
nearest postcode to that pin — a guess that frequently lands on a neighbouring
unit (the pin can sit on the wrong side of a street boundary).
Each *detail* page (`/details/{id}/`) is a plain HTTPS GET whose `__NEXT_DATA__`
embeds the property's analytics dataLayer at
`props.initialReduxState.metadata.dataLayer`, which carries the property's own
`postcode` (full unit postcode, e.g. "SE5 9AA") keyed to this listing by
`property-id`. Crucially this is NOT the agent's office postcode — that lives
separately at `…property.agent.postcode` ("SE5 8RS" for the same listing) and
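is the classic trap when blindly scanning the page for a postcode. We read the
dataLayer postcode, reject it when the dataLayer's `property-id` is present but
differs from the listing's id, and accept it only when its outcode agrees with
the coordinate-nearest postcode (an outcode comparison done in
``transform_property``) — exactly the trust rule the other scrapers use, and
the same outcode-agreement rule applied to address postcodes via
``resolve_listing_postcode``.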
Measured over a sample of real listings this yields a trustworthy, usually
exact-unit postcode for ~11/12 listings; the rest safely fall back to the
coordinate-nearest postcode.
Detail fetching costs one extra HTTPS GET per listing, so it is gated behind
``OTM_FETCH_DETAILS`` and capped at ``OTM_MAX_DETAILS_PER_OUTCODE`` per outcode.
"""
import json
import logging
import random
import re
import time
import httpx
from constants import (
DELAY_BETWEEN_PAGES,
MAX_BEDROOMS,
MAX_RETRIES,
ONTHEMARKET_BASE,
RETRY_BASE_DELAY,
)
from spatial import PostcodeSpatialIndex
from transform import (
clean_listing_address,
extract_full_postcode,
extract_outcode,
fix_coords,
map_property_type,
normalize_sub_type,
parse_display_size,
resolve_listing_postcode,
)
# NOTE(review): the logger is named "rightmove" although this module scrapes
# OnTheMarket — presumably a logger shared across the project's scrapers;
# confirm before renaming.
log = logging.getLogger("rightmove")
# Detail-page postcode recovery (see module docstring). When enabled, each
# listing's detail page is fetched so its analytics dataLayer postcode — the
# property's own full unit postcode — can replace the coordinate-nearest guess.
# Bounded per outcode so a large outcode can't balloon into unbounded extra
# HTTPS GETs. Kept at parity with the Rightmove/Zoopla detail caps (400) so a
# typical outcode's listings all get their real postcode rather than a
# coordinate-nearest guess.
OTM_FETCH_DETAILS = True
OTM_MAX_DETAILS_PER_OUTCODE = 400
# Captures the JSON payload of the Next.js `__NEXT_DATA__` script tag. DOTALL
# lets the (non-greedy) capture span multiple lines.
_NEXT_DATA_RE = re.compile(
    r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
    re.DOTALL,
)
# Captures the first run of digits and commas in a string (e.g. the "450,000"
# in "£450,000").
_PRICE_RE = re.compile(r"([\d,]+)")
# Captures everything following a case-insensitive "tenure:" prefix.
_TENURE_RE = re.compile(r"tenure:\s*(.+)", re.IGNORECASE)
# Headers passed with every page GET issued by this module. No User-Agent is
# set here — presumably it is configured on the shared httpx client; confirm.
_HTML_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
}
# listingId -> recovered full postcode (or None). Failures are cached too so a
# broken or postcode-less detail page is not re-fetched within a run (the same
# listing can reappear across overlapping outcode searches).
_detail_postcode_cache: dict[str, str | None] = {}
def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None:
    """GET one search-results page and return the embedded __NEXT_DATA__ JSON.

    Transient failures (timeouts, network errors, protocol errors, and HTTP
    429/500/502/503/504) are retried up to ``MAX_RETRIES`` times with
    exponential backoff (``RETRY_BASE_DELAY * 2**attempt``) plus up to one
    second of random jitter.

    Args:
        client: httpx client used to issue the request.
        outcode: Outcode to search; lower-cased into the URL path.
        page_num: 1-based page number. Page 1 is requested without a ``page``
            query parameter.

    Returns:
        The decoded ``__NEXT_DATA__`` JSON, or ``None`` on permanent failure,
        exhausted retries, a missing or unparsable script tag, or a 3xx
        redirect (OnTheMarket redirects out-of-range pages, so a redirect =
        end of results).
    """
    url = f"{ONTHEMARKET_BASE}/for-sale/property/{outcode.lower()}/"
    params = {"page": str(page_num)} if page_num > 1 else None
    for attempt in range(MAX_RETRIES):
        try:
            resp = client.get(
                url,
                params=params,
                headers=_HTML_HEADERS,
                # Redirects are a signal (end of results), not something to follow.
                follow_redirects=False,
            )
        except (
            # Base classes rather than hand-picked leaves, so ConnectTimeout,
            # ReadError, RemoteProtocolError etc. are retried as well instead
            # of escaping to the caller.
            httpx.TimeoutException,
            httpx.NetworkError,
            httpx.RemoteProtocolError,
        ) as exc:
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning(
                "%s from %s, retry %d/%d in %.1fs",
                type(exc).__name__, url, attempt + 1, MAX_RETRIES, delay,
            )
            time.sleep(delay)
            continue

        if 300 <= resp.status_code < 400:
            log.debug(
                "OnTheMarket %s page %d redirected (%d) — end of results",
                outcode, page_num, resp.status_code,
            )
            return None

        if resp.status_code == 200:
            match = _NEXT_DATA_RE.search(resp.text)
            if not match:
                log.warning(
                    "No __NEXT_DATA__ in OnTheMarket %s page %d", outcode, page_num
                )
                return None
            try:
                return json.loads(match.group(1))
            except json.JSONDecodeError as exc:
                log.warning(
                    "Failed to parse __NEXT_DATA__ for %s page %d: %s",
                    outcode, page_num, exc,
                )
                return None

        if resp.status_code in (429, 500, 502, 503, 504):
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning(
                "HTTP %d from %s, retry %d/%d in %.1fs",
                resp.status_code, url, attempt + 1, MAX_RETRIES, delay,
            )
            time.sleep(delay)
            continue

        # Any other status (e.g. 403/404) will not improve on retry.
        log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
        return None

    log.error("All %d retries exhausted for %s page %d", MAX_RETRIES, outcode, page_num)
    return None
def parse_detail_postcode(html: str, listing_id: str | None = None) -> str | None:
    """Extract the property's own full postcode from an OnTheMarket detail page.

    Pure and network-free so it is unit-testable: the caller supplies the page
    HTML (e.g. the GET body) and this function only parses it.

    The value is read from the analytics dataLayer embedded in
    ``__NEXT_DATA__`` at
    ``props.initialReduxState.metadata.dataLayer.postcode``. That is the
    property's own unit postcode (e.g. "SE5 9AA"), deliberately NOT the agent's
    office postcode stored separately at ``…property.agent.postcode``.

    Args:
        html: Raw detail-page HTML. Empty/falsy input yields ``None``.
        listing_id: When given, a dataLayer ``property-id`` that is present
            but different from this id causes the postcode to be rejected. A
            dataLayer with no ``property-id`` at all is still accepted.

    Returns:
        Whatever ``extract_full_postcode`` yields for the dataLayer postcode
        (a normalized full postcode such as "SE5 9AA"), or ``None`` when the
        page has no usable property postcode. Trust (outcode-vs-coordinates
        agreement) is enforced later in ``transform_property``.
    """
    if not html:
        return None

    found = _NEXT_DATA_RE.search(html)
    if found is None:
        return None

    # A malformed payload and an unexpected JSON shape are handled alike:
    # there is no postcode to report.
    try:
        payload = json.loads(found.group(1))
        analytics = payload["props"]["initialReduxState"]["metadata"]["dataLayer"]
    except (json.JSONDecodeError, KeyError, TypeError):
        return None
    if not isinstance(analytics, dict):
        return None

    # Guard against reading a different listing's postcode: an id that is
    # present on the page must be this listing's id.
    if listing_id is not None:
        page_id = analytics.get("property-id")
        ids_disagree = page_id is not None and str(page_id) != str(listing_id)
        if ids_disagree:
            return None

    candidate = analytics.get("postcode")
    return extract_full_postcode(candidate) if isinstance(candidate, str) else None
def _fetch_detail_postcode(
    client: httpx.Client, details_url: str, listing_id: str
) -> str | None:
    """GET one listing's detail page and return its dataLayer postcode (or None).

    Results (including failures) are cached by listing id in
    ``_detail_postcode_cache`` so a listing that reappears across overlapping
    outcode searches is fetched at most once per run. Plain HTTPS GET —
    OnTheMarket detail pages have no Cloudflare challenge.

    Transient failures (timeouts, network errors, protocol errors, and HTTP
    429/500/502/503/504) are retried up to ``MAX_RETRIES`` times with
    exponential backoff plus jitter. Network and parse errors degrade to
    ``None`` so the caller falls back to the coordinate-nearest postcode.

    Args:
        client: httpx client used to issue the request.
        details_url: The listing's detail URL; a value not starting with
            "http" is treated as a path and prefixed with ``ONTHEMARKET_BASE``.
            An empty value skips the fetch.
        listing_id: Listing id, used as the cache key and passed to
            ``parse_detail_postcode`` for the property-id check.

    Returns:
        The postcode parsed from the detail page, or ``None``.
    """
    if listing_id in _detail_postcode_cache:
        return _detail_postcode_cache[listing_id]

    full_url = (
        ONTHEMARKET_BASE + details_url
        if details_url and not details_url.startswith("http")
        else details_url
    )
    result: str | None = None
    if full_url:
        for attempt in range(MAX_RETRIES):
            try:
                resp = client.get(
                    full_url, headers=_HTML_HEADERS, follow_redirects=True
                )
            except (
                # Base classes rather than hand-picked leaves, so
                # ConnectTimeout, ReadError, RemoteProtocolError etc. are
                # retried too instead of aborting the caller's whole search.
                httpx.TimeoutException,
                httpx.NetworkError,
                httpx.RemoteProtocolError,
            ) as exc:
                delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
                log.warning(
                    "%s from %s, retry %d/%d in %.1fs",
                    type(exc).__name__, full_url, attempt + 1, MAX_RETRIES, delay,
                )
                time.sleep(delay)
                continue
            except httpx.TooManyRedirects:
                # Possible because redirects are followed; retrying cannot
                # help, so give up on this listing's detail postcode.
                log.debug(
                    "OnTheMarket detail %s hit a redirect loop (no postcode)",
                    listing_id,
                )
                break

            if resp.status_code == 200:
                result = parse_detail_postcode(resp.text, listing_id)
                break

            if resp.status_code in (429, 500, 502, 503, 504):
                delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
                log.warning(
                    "HTTP %d from %s, retry %d/%d in %.1fs",
                    resp.status_code, full_url, attempt + 1, MAX_RETRIES, delay,
                )
                time.sleep(delay)
                continue

            log.debug(
                "OnTheMarket detail %s returned HTTP %d (no postcode)",
                listing_id, resp.status_code,
            )
            break
        else:
            # Loop ended without a break: every attempt was a retryable failure.
            log.warning(
                "All %d retries exhausted for OnTheMarket detail %s",
                MAX_RETRIES, listing_id,
            )

    # Cache failures as well so a broken page is not re-fetched within a run.
    _detail_postcode_cache[listing_id] = result
    return result
def _parse_price(price_value) -> int:
"""Parse a formatted price string like '£450,000' into an integer.
Returns 0 for POA/auction/null values."""
if price_value is None:
return 0
if isinstance(price_value, (int, float)):
return int(price_value)
match = _PRICE_RE.search(str(price_value))
if not match:
return 0
return int(match.group(1).replace(",", ""))
def _extract_tenure(features: list) -> str | None:
    """Return the canonical tenure found in a listing's features, if any.

    OnTheMarket encodes tenure as a feature string such as
    'Tenure: Leasehold (NN years remaining)'. Each string feature is checked
    in order; the first whose tenure text mentions "freehold" yields
    "Freehold" (so 'Share of freehold' is normalised to Freehold), otherwise
    "leasehold" yields "Leasehold". Tenure features mentioning neither are
    skipped.

    Returns:
        "Freehold", "Leasehold", or ``None`` when no feature states one.
    """
    for entry in features or ():
        if not isinstance(entry, str):
            continue
        found = _TENURE_RE.search(entry)
        if found is None:
            continue
        tenure_text = found.group(1).strip().lower()
        # Order matters: "freehold" is tested before "leasehold".
        for keyword, label in (("freehold", "Freehold"), ("leasehold", "Leasehold")):
            if keyword in tenure_text:
                return label
    return None
def _extract_floor_area(features: list) -> float | None:
    """Return the first floor area (in sqm) that any feature string yields.

    Every string feature is passed to ``parse_display_size`` in order; the
    first non-``None`` result wins. Non-string features are ignored.

    Returns:
        The parsed size, or ``None`` when there are no features or none of
        them parses to a size.
    """
    if not features:
        return None
    # Lazy generator: parsing stops at the first feature that yields a size.
    parsed_sizes = (
        parse_display_size(text) for text in features if isinstance(text, str)
    )
    return next((area for area in parsed_sizes if area is not None), None)
def transform_property(
    raw: dict,
    pc_index: PostcodeSpatialIndex,
    detail_postcode: str | None = None,
) -> dict | None:
    """Transform a raw OnTheMarket listing dict into our output schema.

    Args:
        raw: One listing dict from the search-results payload.
        pc_index: Spatial index used to find the postcode nearest to the
            listing's coordinates.
        detail_postcode: The property's own full postcode recovered from its
            detail page (see ``parse_detail_postcode`` /
            ``_fetch_detail_postcode``), or ``None`` when no detail fetch was
            done / no postcode was found. When present and trustworthy (its
            outcode agrees with the coordinate-nearest postcode) it supersedes
            the coordinate guess and is labelled ``"detail_address"``.

    Returns:
        The output-schema dict, or ``None`` when the listing has no id, no
        coordinates, coordinates outside the accepted bounding box, or no
        nearest postcode.
    """
    # Fail fast: a listing without an id is never emitted, so skip the
    # spatial lookup and everything after it.
    listing_id = str(raw.get("id") or "")
    if not listing_id:
        return None

    loc = raw.get("location") or {}
    raw_lat = loc.get("lat")
    raw_lng = loc.get("lon")
    if raw_lat is None or raw_lng is None:
        return None
    lat, lng = fix_coords(raw_lat, raw_lng)
    # Coarse lat/lon bounding box (presumably the covered part of the UK);
    # anything outside it is dropped.
    if not (49 <= lat <= 56 and -7 <= lng <= 2):
        return None
    inferred_postcode = pc_index.nearest(lat, lng)
    if not inferred_postcode:
        return None

    raw_address = raw.get("address", "") or ""
    extracted_postcode = extract_full_postcode(raw_address)

    # Prefer the property's own detail-page postcode when we have one and it is
    # trustworthy. The detail postcode is a full unit postcode (better than the
    # coordinate-nearest guess and than the usually outcode-only card address),
    # but a stale/mislabelled value would silently override the spatially
    # correct one, so apply the same outcode-agreement trust rule the address
    # postcode uses: keep it only when its outcode matches the
    # coordinate-nearest postcode's outcode.
    # Only normalize when there is something to normalize, so the helper is
    # never handed None in the common no-detail case.
    detail_postcode = (
        extract_full_postcode(detail_postcode) if detail_postcode else None
    )
    if detail_postcode and extract_outcode(detail_postcode) == extract_outcode(
        inferred_postcode
    ):
        postcode, postcode_source = detail_postcode, "detail_address"
    else:
        if detail_postcode:
            log.debug(
                "OnTheMarket %s: rejecting detail postcode %s "
                "(outcode mismatch with inferred %s)",
                listing_id, detail_postcode, inferred_postcode,
            )
        postcode, postcode_source = resolve_listing_postcode(
            extracted_postcode, inferred_postcode
        )

    raw_beds = raw.get("bedrooms") or 0
    raw_baths = raw.get("bathrooms") or 0
    # Out-of-range counts are zeroed; MAX_BEDROOMS also bounds bathrooms.
    bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
    # Warn whenever a value was zeroed, including negative counts.
    if bedrooms != raw_beds or bathrooms != raw_baths:
        log.warning(
            "OnTheMarket %s: implausible beds=%d baths=%d (capped to 0)",
            listing_id, raw_beds, raw_baths,
        )

    sub_type = raw.get("humanised-property-type") or ""
    features = raw.get("features") or []

    details_url = raw.get("details-url") or ""
    # Relative detail paths are made absolute against the site base URL.
    full_url = (
        ONTHEMARKET_BASE + details_url
        if details_url and not details_url.startswith("http")
        else details_url
    )

    return {
        "id": f"otm_{listing_id}",
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        # NOTE(review): computed as bedrooms + bathrooms although the key says
        # "living rooms" — presumably a schema convention shared with the other
        # scrapers; confirm.
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Postcode source": postcode_source,
        "Extracted postcode": extracted_postcode,
        "Inferred postcode": inferred_postcode,
        "Listing raw address": raw_address,
        "Address per Property Register": clean_listing_address(raw_address),
        # OnTheMarket search JSON exposes only a street-level address; no UPRN
        # or house number/name is available without a detail-page fetch.
        "UPRN": None,
        "Property number or name": None,
        "Leasehold/Freehold": _extract_tenure(features),
        "Property type": map_property_type(sub_type),
        "Property sub-type": normalize_sub_type(sub_type),
        "price": _parse_price(raw.get("price")),
        "price_frequency": "",
        "Price qualifier": raw.get("price-qualifier") or "",
        "Total floor area (sqm)": _extract_floor_area(features),
        "Listing URL": full_url,
        "Listing features": [f for f in features if isinstance(f, str)],
        "first_visible_date": "",
    }
def search_outcode(
    client: httpx.Client,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Paginate through OnTheMarket sale results for one outcode.

    When ``OTM_FETCH_DETAILS`` is enabled, up to
    ``OTM_MAX_DETAILS_PER_OUTCODE`` listings per outcode have their detail page
    fetched for the property's own postcode (see ``_fetch_detail_postcode``);
    the rest fall back to the coordinate-nearest postcode.

    Pagination stops when a page cannot be fetched, has an unexpected shape,
    contains no listings, or reports no ``paginationControls.next``.

    Args:
        client: httpx client used for all requests.
        outcode: Outcode to search.
        pc_index: Spatial index passed through to ``transform_property``.
        max_properties: Optional cap; the function returns as soon as this
            many listings have been collected.

    Returns:
        The transformed listings, de-duplicated by listing id.
    """
    properties: list[dict] = []
    seen_ids: set[str] = set()
    page_num = 1
    details_fetched = 0

    while True:
        data = _fetch_page_json(client, outcode, page_num)
        if data is None:
            break

        try:
            state = data["props"]["initialReduxState"]["results"]
        except (KeyError, TypeError):
            state = None
        # A present-but-non-dict value (e.g. `"results": null`) is just as
        # unusable as a missing key; treat both as an unexpected shape rather
        # than crashing on `.get` below.
        if not isinstance(state, dict):
            log.warning(
                "Unexpected __NEXT_DATA__ shape for %s page %d", outcode, page_num
            )
            break

        raw_listings = state.get("list") or []
        if not raw_listings:
            break

        for raw in raw_listings:
            # One malformed entry must not abort the whole outcode.
            if not isinstance(raw, dict):
                log.warning(
                    "OnTheMarket %s page %d: skipping non-dict listing entry",
                    outcode, page_num,
                )
                continue

            listing_id = str(raw.get("id") or "")
            if listing_id and listing_id in seen_ids:
                continue
            seen_ids.add(listing_id)

            detail_postcode = None
            if OTM_FETCH_DETAILS and listing_id:
                # Cached lookups are free; only fresh GETs count toward the cap
                # and incur the inter-request delay.
                cached = listing_id in _detail_postcode_cache
                if cached or details_fetched < OTM_MAX_DETAILS_PER_OUTCODE:
                    detail_postcode = _fetch_detail_postcode(
                        client, raw.get("details-url") or "", listing_id
                    )
                    if not cached:
                        details_fetched += 1
                        time.sleep(DELAY_BETWEEN_PAGES)

            # Deliberately broad: one bad listing is logged and skipped so the
            # rest of the outcode still gets processed.
            try:
                transformed = transform_property(raw, pc_index, detail_postcode)
            except Exception as exc:
                log.warning(
                    "OnTheMarket %s property %s failed to transform: %s",
                    outcode, listing_id or "?", exc,
                )
                continue

            if transformed:
                properties.append(transformed)
                if max_properties is not None and len(properties) >= max_properties:
                    return properties

        pagination = state.get("paginationControls")
        # A missing, non-dict, or next-less pagination block means last page.
        if not isinstance(pagination, dict) or not pagination.get("next"):
            break
        page_num += 1
        time.sleep(DELAY_BETWEEN_PAGES)

    return properties