perfect-postcode/finder/onthemarket.py
2026-05-28 21:48:35 +01:00

292 lines
9.5 KiB
Python

"""OnTheMarket (onthemarket.com) scraper — sale properties.
OnTheMarket serves a Next.js app with the full search-results payload embedded
as JSON in a `__NEXT_DATA__` script tag. No JS execution or browser needed:
plain HTTP with a Chrome-ish User-Agent is enough.
Each rendered page contains 30 listings under
`props.initialReduxState.results.list`, each with `location.{lat,lon}`,
`bedrooms`, `bathrooms`, `price` (formatted £-string), `address`,
`humanised-property-type`, `features` (a list where the first element is
typically `"Tenure: <value>"`), and `details-url`. Pagination is via
`?page=N`; the loop terminates when `paginationControls.next` is null.
"""
import json
import logging
import random
import re
import time
import httpx
from constants import (
DELAY_BETWEEN_PAGES,
MAX_BEDROOMS,
MAX_RETRIES,
ONTHEMARKET_BASE,
RETRY_BASE_DELAY,
)
from spatial import PostcodeSpatialIndex
from transform import (
clean_listing_address,
extract_full_postcode,
fix_coords,
map_property_type,
normalize_sub_type,
parse_display_size,
)
# Module logger.
# NOTE(review): the logger is named "rightmove" although this module scrapes
# OnTheMarket — presumably a shared project-wide logger; confirm it is intended.
log = logging.getLogger("rightmove")
# Captures the JSON payload inside the Next.js `__NEXT_DATA__` script tag.
# DOTALL lets `.` span newlines; the lazy `(.*?)` stops at the first `</script>`.
_NEXT_DATA_RE = re.compile(
    r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
    re.DOTALL,
)
# First run of digits and/or commas in a price string ("450,000" in "£450,000").
_PRICE_RE = re.compile(r"([\d,]+)")
# Captures everything after "tenure:" (case-insensitive) within a feature string.
_TENURE_RE = re.compile(r"tenure:\s*(.+)", re.IGNORECASE)
# Headers passed explicitly on every search-page request. No User-Agent is set
# here — presumably it is configured on the httpx client by the caller.
_HTML_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
}
def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None:
    """GET one search-results page and return the embedded __NEXT_DATA__ JSON.

    Transient failures — connection errors, any httpx timeout (connect, read,
    write or pool) and HTTP 429/500/502/503/504 — are retried up to
    MAX_RETRIES times with exponential backoff plus up to one second of
    random jitter.

    Args:
        client: httpx client used to issue the request.
        outcode: Outcode to search; lower-cased when building the URL.
        page_num: 1-based page number. Page 1 is requested without a
            ``page`` query parameter.

    Returns:
        The decoded ``__NEXT_DATA__`` payload, or None on permanent failure,
        missing script tag, undecodable JSON, exhausted retries, or a 3xx
        redirect (OnTheMarket redirects out-of-range pages, so a redirect is
        treated as the end of results).
    """
    url = f"{ONTHEMARKET_BASE}/for-sale/property/{outcode.lower()}/"
    params = {"page": str(page_num)} if page_num > 1 else None
    for attempt in range(MAX_RETRIES):
        try:
            resp = client.get(
                url,
                params=params,
                headers=_HTML_HEADERS,
                # Redirects must stay visible: they signal "past the last page".
                follow_redirects=False,
            )
        except (
            httpx.ConnectError,
            # Base class of ConnectTimeout, ReadTimeout, WriteTimeout and
            # PoolTimeout, so a connect timeout is retried like the others
            # instead of escaping the retry loop.
            httpx.TimeoutException,
        ) as exc:
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning(
                "%s from %s, retry %d/%d in %.1fs",
                type(exc).__name__, url, attempt + 1, MAX_RETRIES, delay,
            )
            time.sleep(delay)
            continue
        if 300 <= resp.status_code < 400:
            log.debug(
                "OnTheMarket %s page %d redirected (%d) — end of results",
                outcode, page_num, resp.status_code,
            )
            return None
        if resp.status_code == 200:
            match = _NEXT_DATA_RE.search(resp.text)
            if not match:
                log.warning(
                    "No __NEXT_DATA__ in OnTheMarket %s page %d", outcode, page_num
                )
                return None
            try:
                return json.loads(match.group(1))
            except json.JSONDecodeError as exc:
                log.warning(
                    "Failed to parse __NEXT_DATA__ for %s page %d: %s",
                    outcode, page_num, exc,
                )
                return None
        if resp.status_code in (429, 500, 502, 503, 504):
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning(
                "HTTP %d from %s, retry %d/%d in %.1fs",
                resp.status_code, url, attempt + 1, MAX_RETRIES, delay,
            )
            time.sleep(delay)
            continue
        # Any other status is treated as permanent; no retry.
        log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
        return None
    log.error("All %d retries exhausted for %s page %d", MAX_RETRIES, outcode, page_num)
    return None
def _parse_price(price_value) -> int:
    """Parse a formatted price such as '£450,000' into an integer.

    Args:
        price_value: The raw ``price`` field: None, a number, or a string.

    Returns:
        ``int(price_value)`` for numbers; for anything else, the first run of
        digits in its string form with commas removed. Returns 0 for None and
        for values containing no digits (POA, auction, etc.).
    """
    if price_value is None:
        return 0
    if isinstance(price_value, (int, float)):
        return int(price_value)
    # _PRICE_RE also matches runs made up only of commas (e.g. the comma in
    # "Guide price, £450,000"), which would turn into int("") and raise
    # ValueError. Walk the matches until one actually contains digits.
    for match in _PRICE_RE.finditer(str(price_value)):
        digits = match.group(1).replace(",", "")
        if digits:
            return int(digits)
    return 0
def _extract_tenure(features: list) -> str | None:
    """Return 'Freehold' or 'Leasehold' from the first feature that names one.

    Only string features matching 'Tenure: <value>' (case-insensitive) are
    considered, e.g. 'Tenure: Leasehold (NN years remaining)'. Any value that
    mentions 'freehold' — including 'Share of freehold' — maps to 'Freehold'.
    Returns None when features is empty/None or no feature yields a
    recognised tenure.
    """
    for entry in features or ():
        found = _TENURE_RE.search(entry) if isinstance(entry, str) else None
        if found is None:
            continue
        tenure_text = found.group(1).strip().lower()
        # "freehold" is checked first, so a value mentioning both words
        # resolves to Freehold. An unrecognised value falls through to the
        # next feature.
        for needle, label in (("freehold", "Freehold"), ("leasehold", "Leasehold")):
            if needle in tenure_text:
                return label
    return None
def _extract_floor_area(features: list) -> float | None:
    """Return the first size that parse_display_size extracts from features.

    String features are tried in order and non-string entries are skipped.
    The caller stores the result as the total floor area in sqm. Returns None
    when features is empty/None or no feature yields a size.
    """
    if not features:
        return None
    # Lazy generator: parsing stops at the first feature that yields a size.
    parsed_sizes = (
        parse_display_size(text) for text in features if isinstance(text, str)
    )
    return next((area for area in parsed_sizes if area is not None), None)
def transform_property(
    raw: dict, pc_index: PostcodeSpatialIndex
) -> dict | None:
    """Convert one raw OnTheMarket listing into the output-schema record.

    Returns None (listing dropped) when the listing has no lat/lon, when the
    corrected coordinates fall outside the 49..56 latitude / -7..2 longitude
    box, when the spatial index finds no nearest postcode, or when the
    listing has no id.

    The postcode parsed from the address is preferred; the postcode inferred
    from the coordinates is the fallback, and "Postcode source" records which
    one was used. Bedroom/bathroom counts outside 0..MAX_BEDROOMS are
    replaced by 0.
    """
    location = raw.get("location") or {}
    lat_in, lon_in = location.get("lat"), location.get("lon")
    if lat_in is None or lon_in is None:
        return None

    lat, lng = fix_coords(lat_in, lon_in)
    inside_bounds = 49 <= lat <= 56 and -7 <= lng <= 2
    if not inside_bounds:
        return None

    inferred_postcode = pc_index.nearest(lat, lng)
    if not inferred_postcode:
        return None

    raw_address = raw.get("address", "") or ""
    extracted_postcode = extract_full_postcode(raw_address)
    if extracted_postcode:
        postcode, postcode_source = extracted_postcode, "address"
    else:
        postcode, postcode_source = inferred_postcode, "coordinates"

    beds_in = raw.get("bedrooms") or 0
    baths_in = raw.get("bathrooms") or 0
    # The same ceiling (MAX_BEDROOMS) is applied to both counts.
    bedrooms = beds_in if 0 <= beds_in <= MAX_BEDROOMS else 0
    bathrooms = baths_in if 0 <= baths_in <= MAX_BEDROOMS else 0
    if beds_in > MAX_BEDROOMS or baths_in > MAX_BEDROOMS:
        log.warning(
            "OnTheMarket %s: implausible beds=%d baths=%d (capped to 0)",
            raw.get("id", "?"), beds_in, baths_in,
        )

    sub_type = raw.get("humanised-property-type") or ""
    features = raw.get("features") or []

    listing_id = str(raw.get("id") or "")
    if not listing_id:
        return None

    # Anything not already starting with "http" is prefixed with the site base.
    listing_url = raw.get("details-url") or ""
    if listing_url and not listing_url.startswith("http"):
        listing_url = ONTHEMARKET_BASE + listing_url

    record = {
        "id": f"otm_{listing_id}",
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        # NOTE(review): despite the key name this is bedrooms + bathrooms —
        # confirm that is what downstream consumers expect.
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Postcode source": postcode_source,
        "Extracted postcode": extracted_postcode,
        "Inferred postcode": inferred_postcode,
        "Listing raw address": raw_address,
        "Address per Property Register": clean_listing_address(raw_address),
        "Leasehold/Freehold": _extract_tenure(features),
        "Property type": map_property_type(sub_type),
        "Property sub-type": normalize_sub_type(sub_type),
        "price": _parse_price(raw.get("price")),
        "price_frequency": "",
        "Price qualifier": raw.get("price-qualifier") or "",
        "Total floor area (sqm)": _extract_floor_area(features),
        "Listing URL": listing_url,
        "Listing features": [item for item in features if isinstance(item, str)],
        "first_visible_date": "",
    }
    return record
def search_outcode(
    client: httpx.Client,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Paginate through OnTheMarket sale results for one outcode.

    Pages are fetched sequentially, sleeping DELAY_BETWEEN_PAGES between
    requests. Pagination stops when a page cannot be fetched, its payload has
    an unexpected shape, it contains no listings, ``paginationControls.next``
    is missing/falsy, or ``max_properties`` listings have been collected.

    Args:
        client: httpx client used for every page request.
        outcode: Outcode to search.
        pc_index: Spatial index passed through to transform_property.
        max_properties: Optional cap on the number of listings returned;
            None means no cap, and a cap of 0 or less returns an empty list
            without issuing any request.

    Returns:
        Transformed listings in page order, de-duplicated by listing id.
        Listings that transform_property rejects or fails on are skipped.
    """
    properties: list[dict] = []
    if max_properties is not None and max_properties <= 0:
        return properties
    seen_ids: set[str] = set()
    page_num = 1
    while True:
        data = _fetch_page_json(client, outcode, page_num)
        if data is None:
            break
        try:
            state = data["props"]["initialReduxState"]["results"]
        except (KeyError, TypeError):
            state = None
        # A null or non-object `results` passes the indexing above but would
        # raise AttributeError on .get() below, so it is treated the same as
        # a missing key.
        if not isinstance(state, dict):
            log.warning(
                "Unexpected __NEXT_DATA__ shape for %s page %d", outcode, page_num
            )
            break
        raw_listings = state.get("list") or []
        if not raw_listings:
            break
        for raw in raw_listings:
            if not isinstance(raw, dict):
                # Malformed entry: nothing to read an id or fields from.
                continue
            listing_id = str(raw.get("id") or "")
            if listing_id:
                if listing_id in seen_ids:
                    continue
                seen_ids.add(listing_id)
            try:
                transformed = transform_property(raw, pc_index)
            except Exception as exc:
                # Deliberate best-effort boundary: one bad listing must not
                # abort the whole outcode.
                log.warning(
                    "OnTheMarket %s property %s failed to transform: %s",
                    outcode, listing_id or "?", exc,
                )
                continue
            if transformed:
                properties.append(transformed)
                if max_properties is not None and len(properties) >= max_properties:
                    return properties
        pagination = state.get("paginationControls") or {}
        if not isinstance(pagination, dict) or not pagination.get("next"):
            break
        page_num += 1
        time.sleep(DELAY_BETWEEN_PAGES)
    return properties