"""OnTheMarket (onthemarket.com) scraper — sale properties.

OnTheMarket serves a Next.js app with the full search-results payload embedded
as JSON in a `__NEXT_DATA__` script tag. No JS execution or browser is needed:
plain HTTP with a Chrome-ish User-Agent is enough.

Each rendered page contains 30 listings under
`props.initialReduxState.results.list`, each with `location.{lat,lon}`,
`bedrooms`, `bathrooms`, `price` (formatted £-string), `address`,
`humanised-property-type`, `features` (a list where the first element is
typically `"Tenure: <value>"`), and `details-url`. Pagination is via
`?page=N`; the loop terminates when `paginationControls.next` is null.
"""
|
|
|
|
import json
|
|
import logging
|
|
import random
|
|
import re
|
|
import time
|
|
|
|
import httpx
|
|
|
|
from constants import (
|
|
DELAY_BETWEEN_PAGES,
|
|
MAX_BEDROOMS,
|
|
MAX_RETRIES,
|
|
ONTHEMARKET_BASE,
|
|
RETRY_BASE_DELAY,
|
|
)
|
|
from spatial import PostcodeSpatialIndex
|
|
from transform import (
|
|
clean_listing_address,
|
|
extract_full_postcode,
|
|
fix_coords,
|
|
map_property_type,
|
|
normalize_sub_type,
|
|
parse_display_size,
|
|
)
|
|
|
|
# NOTE(review): the logger is named "rightmove" although this module scrapes
# OnTheMarket — presumably it shares the project's logging configuration;
# confirm before renaming.
log = logging.getLogger("rightmove")

# Captures the JSON payload of the Next.js bootstrap <script> tag; the payload
# may span several lines, hence the dot-matches-newline flag.
_NEXT_DATA_RE = re.compile(
    r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
    flags=re.S,
)
# First run of digits/commas in a formatted price such as "£450,000".
_PRICE_RE = re.compile(r"([\d,]+)")
# Value part of a "Tenure: <value>" feature entry, matched case-insensitively.
_TENURE_RE = re.compile(r"tenure:\s*(.+)", flags=re.I)

# Content-negotiation headers sent with every search-page request.
_HTML_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
}
|
|
|
|
|
|
def _extract_next_data(html: str, outcode: str, page_num: int) -> dict | None:
    """Return the decoded __NEXT_DATA__ payload found in *html*, or None.

    A missing script tag or malformed JSON is logged as a warning.
    """
    match = _NEXT_DATA_RE.search(html)
    if not match:
        log.warning(
            "No __NEXT_DATA__ in OnTheMarket %s page %d", outcode, page_num
        )
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError as exc:
        log.warning(
            "Failed to parse __NEXT_DATA__ for %s page %d: %s",
            outcode, page_num, exc,
        )
        return None


def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None:
    """GET one search-results page and return the embedded __NEXT_DATA__ JSON.

    Transient transport faults (any timeout, network error or broken server
    response) and HTTP 429/500/502/503/504 are retried up to MAX_RETRIES
    times with exponential back-off plus jitter.

    Returns None on permanent failure, exhausted retries, a missing or
    unparseable script tag, or a 3xx redirect (OnTheMarket redirects
    out-of-range pages, so a redirect = end of results).
    """
    url = f"{ONTHEMARKET_BASE}/for-sale/property/{outcode.lower()}/"
    # Page 1 is the bare URL; only later pages carry ?page=N.
    params = {"page": str(page_num)} if page_num > 1 else None

    for attempt in range(MAX_RETRIES):
        try:
            resp = client.get(
                url,
                params=params,
                headers=_HTML_HEADERS,
                follow_redirects=False,
            )
        except (
            # TimeoutException covers connect/read/write/pool timeouts;
            # NetworkError covers connect/read/write/close errors.
            httpx.TimeoutException,
            httpx.NetworkError,
            httpx.RemoteProtocolError,
        ) as exc:
            failure = type(exc).__name__
        else:
            status = resp.status_code
            if 300 <= status < 400:
                log.debug(
                    "OnTheMarket %s page %d redirected (%d) — end of results",
                    outcode, page_num, status,
                )
                return None
            if status == 200:
                return _extract_next_data(resp.text, outcode, page_num)
            if status not in (429, 500, 502, 503, 504):
                log.error("HTTP %d from %s (non-retryable)", status, url)
                return None
            failure = f"HTTP {status}"

        # Do not waste a back-off sleep when no further attempt will follow.
        if attempt + 1 >= MAX_RETRIES:
            log.warning("%s from %s, no retries left", failure, url)
            break
        delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
        log.warning(
            "%s from %s, retry %d/%d in %.1fs",
            failure, url, attempt + 1, MAX_RETRIES, delay,
        )
        time.sleep(delay)

    log.error("All %d retries exhausted for %s page %d", MAX_RETRIES, outcode, page_num)
    return None
|
|
|
|
|
|
# Must start on a digit so that a stray comma (", POA") can never match alone.
_PRICE_DIGITS_RE = re.compile(r"\d[\d,]*")


def _parse_price(price_value) -> int:
    """Parse a formatted price such as '£450,000' into an integer.

    Numeric inputs are truncated to int. For strings, the first run of
    digits (with optional thousands commas) is used.

    Returns 0 for POA/auction/null values, for text without any digits and
    for non-finite floats (NaN, ±inf).
    """
    if price_value is None:
        return 0
    if isinstance(price_value, (int, float)):
        try:
            return int(price_value)
        except (ValueError, OverflowError):
            # int() rejects NaN (ValueError) and ±inf (OverflowError).
            return 0
    match = _PRICE_DIGITS_RE.search(str(price_value))
    if match is None:
        return 0
    return int(match.group().replace(",", ""))
|
|
|
|
|
|
def _extract_tenure(features: list) -> str | None:
    """Return 'Freehold' or 'Leasehold' from the listing features, else None.

    OnTheMarket encodes tenure as a feature string such as
    'Tenure: Leasehold (NN years remaining)'. Any tenure value mentioning
    'freehold' (including 'Share of freehold') maps to Freehold; that check
    runs before the leasehold one. Non-string entries and tenure values that
    mention neither word are skipped.
    """
    for entry in features or ():
        if not isinstance(entry, str):
            continue
        found = _TENURE_RE.search(entry)
        if found is None:
            continue
        tenure_text = found.group(1).strip().lower()
        # Order matters: "freehold" is tested first.
        for needle, label in (("freehold", "Freehold"), ("leasehold", "Leasehold")):
            if needle in tenure_text:
                return label
    return None
|
|
|
|
|
|
def _extract_floor_area(features: list) -> float | None:
    """Return the first floor area that parse_display_size finds in *features*.

    Only string entries are examined, in order; None is returned when the
    list is empty/missing or no entry yields a size.
    """
    if not features:
        return None
    # Lazy generator: parsing stops at the first entry that produces a size.
    sizes = (
        parse_display_size(item) for item in features if isinstance(item, str)
    )
    return next((size for size in sizes if size is not None), None)
|
|
|
|
|
|
def _coerce_count(value) -> int:
    """Best-effort conversion of a raw room count to int; 0 if missing or unparseable."""
    if not value:
        return 0
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return 0


def transform_property(
    raw: dict, pc_index: PostcodeSpatialIndex
) -> dict | None:
    """Transform a raw OnTheMarket listing dict into our output schema.

    Returns None when the listing cannot be used: it has no id, has no
    coordinates, falls outside the accepted lat/lon box, or the spatial
    index finds no postcode for it.

    The postcode is taken from the address text when one can be extracted,
    otherwise from the nearest postcode to the coordinates; "Postcode source"
    records which of the two was used.
    """
    # Cheapest check first: avoids a spatial lookup for unusable listings.
    listing_id = str(raw.get("id") or "")
    if not listing_id:
        return None

    loc = raw.get("location") or {}
    raw_lat = loc.get("lat")
    raw_lng = loc.get("lon")
    if raw_lat is None or raw_lng is None:
        return None

    lat, lng = fix_coords(raw_lat, raw_lng)
    # NOTE(review): the box stops at 56°N / 7°W, which looks like a deliberate
    # restriction of coverage — confirm before widening.
    if not (49 <= lat <= 56 and -7 <= lng <= 2):
        return None

    inferred_postcode = pc_index.nearest(lat, lng)
    if not inferred_postcode:
        return None
    raw_address = raw.get("address", "") or ""
    extracted_postcode = extract_full_postcode(raw_address)
    postcode = extracted_postcode or inferred_postcode
    postcode_source = "address" if extracted_postcode else "coordinates"

    # Coerce first so that string or otherwise odd values cannot raise in the
    # range comparison below.
    raw_beds = _coerce_count(raw.get("bedrooms"))
    raw_baths = _coerce_count(raw.get("bathrooms"))
    bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
    # Warn whenever a value was reset, including negative counts.
    if bedrooms != raw_beds or bathrooms != raw_baths:
        log.warning(
            "OnTheMarket %s: implausible beds=%d baths=%d (capped to 0)",
            listing_id, raw_beds, raw_baths,
        )

    sub_type = raw.get("humanised-property-type") or ""
    features = raw.get("features") or []

    # Relative detail links are made absolute against the site base.
    details_url = raw.get("details-url") or ""
    full_url = (
        ONTHEMARKET_BASE + details_url
        if details_url and not details_url.startswith("http")
        else details_url
    )

    return {
        "id": f"otm_{listing_id}",
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        # NOTE(review): the key says "living rooms" but the value adds
        # bathrooms — kept as-is; confirm the intended definition.
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Postcode source": postcode_source,
        "Extracted postcode": extracted_postcode,
        "Inferred postcode": inferred_postcode,
        "Listing raw address": raw_address,
        "Address per Property Register": clean_listing_address(raw_address),
        "Leasehold/Freehold": _extract_tenure(features),
        "Property type": map_property_type(sub_type),
        "Property sub-type": normalize_sub_type(sub_type),
        "price": _parse_price(raw.get("price")),
        "price_frequency": "",
        "Price qualifier": raw.get("price-qualifier") or "",
        "Total floor area (sqm)": _extract_floor_area(features),
        "Listing URL": full_url,
        "Listing features": [f for f in features if isinstance(f, str)],
        "first_visible_date": "",
    }
|
|
|
|
|
|
def search_outcode(
    client: httpx.Client,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Paginate through OnTheMarket sale results for one outcode.

    Pages are fetched in order, sleeping DELAY_BETWEEN_PAGES between them.
    Listings are de-duplicated by id across pages, and a listing whose
    transform raises is logged and skipped.

    Pagination stops when a page cannot be fetched, has an unexpected shape,
    has no listings, contributes no previously unseen listing id (a guard
    against a looping pagination link), or has no `paginationControls.next`.

    Returns the transformed listings, at most *max_properties* of them when
    that limit is given (a limit of 0 or less yields an empty list).
    """
    properties: list[dict] = []
    if max_properties is not None and max_properties <= 0:
        return properties

    seen_ids: set[str] = set()
    page_num = 1

    while True:
        data = _fetch_page_json(client, outcode, page_num)
        if data is None:
            break

        try:
            state = data["props"]["initialReduxState"]["results"]
        except (KeyError, TypeError):
            state = None
        # A non-dict node (e.g. null) would otherwise fail on .get() below.
        if not isinstance(state, dict):
            log.warning(
                "Unexpected __NEXT_DATA__ shape for %s page %d", outcode, page_num
            )
            break

        raw_listings = state.get("list") or []
        if not raw_listings:
            break

        new_ids = 0
        for raw in raw_listings:
            if not isinstance(raw, dict):
                continue
            listing_id = str(raw.get("id") or "")
            if listing_id:
                if listing_id in seen_ids:
                    continue
                seen_ids.add(listing_id)
                new_ids += 1
            try:
                transformed = transform_property(raw, pc_index)
            except Exception as exc:
                # Deliberate best-effort: one bad listing must not abort the
                # whole outcode.
                log.warning(
                    "OnTheMarket %s property %s failed to transform: %s",
                    outcode, listing_id or "?", exc,
                )
                continue
            if transformed:
                properties.append(transformed)
                if max_properties is not None and len(properties) >= max_properties:
                    return properties

        if new_ids == 0:
            log.debug(
                "OnTheMarket %s page %d had no new listings — stopping",
                outcode, page_num,
            )
            break

        pagination = state.get("paginationControls") or {}
        if not pagination.get("next"):
            break

        page_num += 1
        time.sleep(DELAY_BETWEEN_PAGES)

    return properties
|