"""OnTheMarket (onthemarket.com) scraper — sale properties. OnTheMarket serves a Next.js app with the full search-results payload embedded as JSON in a `__NEXT_DATA__` script tag. No JS execution or browser needed: plain HTTP with a Chrome-ish User-Agent is enough. Each rendered page contains 30 listings under `props.initialReduxState.results.list`, each with `location.{lat,lon}`, `bedrooms`, `bathrooms`, `price` (formatted £-string), `address`, `humanised-property-type`, `features` (a list where the first element is typically `"Tenure: "`), and `details-url`. Pagination is via `?page=N`; the loop terminates when `paginationControls.next` is null. """ import json import logging import random import re import time import httpx from constants import ( DELAY_BETWEEN_PAGES, MAX_BEDROOMS, MAX_RETRIES, ONTHEMARKET_BASE, RETRY_BASE_DELAY, ) from spatial import PostcodeSpatialIndex from transform import ( fix_coords, map_property_type, normalize_sub_type, parse_display_size, ) log = logging.getLogger("rightmove") _NEXT_DATA_RE = re.compile( r'', re.DOTALL, ) _PRICE_RE = re.compile(r"([\d,]+)") _TENURE_RE = re.compile(r"tenure:\s*(.+)", re.IGNORECASE) _HTML_HEADERS = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-GB,en;q=0.9", } def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None: """GET one search-results page and return the embedded __NEXT_DATA__ JSON. Returns None on permanent failure, missing script, or a 3xx redirect (OnTheMarket redirects out-of-range pages, so a redirect = end of results). """ url = f"{ONTHEMARKET_BASE}/for-sale/property/{outcode.lower()}/" params = {"page": str(page_num)} if page_num > 1 else None for attempt in range(MAX_RETRIES): try: resp = client.get( url, params=params, headers=_HTML_HEADERS, follow_redirects=False, ) except ( httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout, httpx.PoolTimeout, ) as exc: delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) log.warning( "%s from %s, retry %d/%d in %.1fs", type(exc).__name__, url, attempt + 1, MAX_RETRIES, delay, ) time.sleep(delay) continue if 300 <= resp.status_code < 400: log.debug( "OnTheMarket %s page %d redirected (%d) — end of results", outcode, page_num, resp.status_code, ) return None if resp.status_code == 200: match = _NEXT_DATA_RE.search(resp.text) if not match: log.warning( "No __NEXT_DATA__ in OnTheMarket %s page %d", outcode, page_num ) return None try: return json.loads(match.group(1)) except json.JSONDecodeError as exc: log.warning( "Failed to parse __NEXT_DATA__ for %s page %d: %s", outcode, page_num, exc, ) return None if resp.status_code in (429, 500, 502, 503, 504): delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) log.warning( "HTTP %d from %s, retry %d/%d in %.1fs", resp.status_code, url, attempt + 1, MAX_RETRIES, delay, ) time.sleep(delay) continue log.error("HTTP %d from %s (non-retryable)", resp.status_code, url) return None log.error("All %d retries exhausted for %s page %d", MAX_RETRIES, outcode, page_num) return None def _parse_price(price_value) -> int: """Parse a formatted price string like '£450,000' into an integer. Returns 0 for POA/auction/null values.""" if price_value is None: return 0 if isinstance(price_value, (int, float)): return int(price_value) match = _PRICE_RE.search(str(price_value)) if not match: return 0 return int(match.group(1).replace(",", "")) def _extract_tenure(features: list) -> str | None: """Pull canonical Freehold/Leasehold out of the features list. OnTheMarket encodes tenure as 'Tenure: Leasehold (NN years remaining)' etc. 'Share of freehold' is normalised to Freehold.""" if not features: return None for feature in features: if not isinstance(feature, str): continue match = _TENURE_RE.search(feature) if not match: continue value = match.group(1).strip().lower() if "freehold" in value: return "Freehold" if "leasehold" in value: return "Leasehold" return None def _extract_floor_area(features: list) -> float | None: """Search features for a sq ft / sq m mention and return sqm.""" if not features: return None for feature in features: if not isinstance(feature, str): continue sqm = parse_display_size(feature) if sqm is not None: return sqm return None def transform_property( raw: dict, pc_index: PostcodeSpatialIndex ) -> dict | None: """Transform a raw OnTheMarket listing dict into our output schema.""" loc = raw.get("location") or {} raw_lat = loc.get("lat") raw_lng = loc.get("lon") if raw_lat is None or raw_lng is None: return None lat, lng = fix_coords(raw_lat, raw_lng) if not (49 <= lat <= 56 and -7 <= lng <= 2): return None postcode = pc_index.nearest(lat, lng) if not postcode: return None raw_beds = raw.get("bedrooms") or 0 raw_baths = raw.get("bathrooms") or 0 bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0 bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0 if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: log.warning( "OnTheMarket %s: implausible beds=%d baths=%d (capped to 0)", raw.get("id", "?"), raw_beds, raw_baths, ) sub_type = raw.get("humanised-property-type") or "" features = raw.get("features") or [] listing_id = str(raw.get("id") or "") if not listing_id: return None details_url = raw.get("details-url") or "" full_url = ( ONTHEMARKET_BASE + details_url if details_url and not details_url.startswith("http") else details_url ) return { "id": f"otm_{listing_id}", "Bedrooms": bedrooms, "Bathrooms": bathrooms, "Number of bedrooms & living rooms": bedrooms + bathrooms, "lon": lng, "lat": lat, "Postcode": postcode, "Address per Property Register": raw.get("address", ""), "Leasehold/Freehold": _extract_tenure(features), "Property type": map_property_type(sub_type), "Property sub-type": normalize_sub_type(sub_type), "price": _parse_price(raw.get("price")), "price_frequency": "", "Price qualifier": raw.get("price-qualifier") or "", "Total floor area (sqm)": _extract_floor_area(features), "Listing URL": full_url, "Listing features": [f for f in features if isinstance(f, str)], "first_visible_date": "", } def search_outcode( client: httpx.Client, outcode: str, pc_index: PostcodeSpatialIndex, max_properties: int | None = None, ) -> list[dict]: """Paginate through OnTheMarket sale results for one outcode.""" properties: list[dict] = [] seen_ids: set[str] = set() page_num = 1 while True: data = _fetch_page_json(client, outcode, page_num) if data is None: break try: state = data["props"]["initialReduxState"]["results"] except (KeyError, TypeError): log.warning( "Unexpected __NEXT_DATA__ shape for %s page %d", outcode, page_num ) break raw_listings = state.get("list") or [] if not raw_listings: break for raw in raw_listings: listing_id = str(raw.get("id") or "") if listing_id and listing_id in seen_ids: continue seen_ids.add(listing_id) try: transformed = transform_property(raw, pc_index) except Exception as exc: log.warning( "OnTheMarket %s property %s failed to transform: %s", outcode, listing_id or "?", exc, ) continue if transformed: properties.append(transformed) if max_properties is not None and len(properties) >= max_properties: return properties pagination = state.get("paginationControls") or {} if not pagination.get("next"): break page_num += 1 time.sleep(DELAY_BETWEEN_PAGES) return properties