"""OnTheMarket (onthemarket.com) scraper — sale properties. OnTheMarket serves a Next.js app with the full search-results payload embedded as JSON in a `__NEXT_DATA__` script tag. No JS execution or browser needed: plain HTTP with a Chrome-ish User-Agent is enough. Each rendered page contains 30 listings under `props.initialReduxState.results.list`, each with `location.{lat,lon}`, `bedrooms`, `bathrooms`, `price` (formatted £-string), `address`, `humanised-property-type`, `features` (a list where the first element is typically `"Tenure: "`), and `details-url`. Pagination is via `?page=N`; the loop terminates when `paginationControls.next` is null. Postcodes --------- The search card exposes only an *outcode*-level address (e.g. "Padfield Road, London, SE5") and a map pin, so the old behaviour derived the postcode from the nearest postcode to that pin — a guess that frequently lands on a neighbouring unit (the pin can sit on the wrong side of a street boundary). Each *detail* page (`/details/{id}/`) is a plain HTTPS GET whose `__NEXT_DATA__` embeds the property's analytics dataLayer at `props.initialReduxState.metadata.dataLayer`, which carries the property's own `postcode` (full unit postcode, e.g. "SE5 9AA") keyed to this listing by `property-id`. Crucially this is NOT the agent's office postcode — that lives separately at `…property.agent.postcode` ("SE5 8RS" for the same listing) and is the classic trap when blindly scanning the page for a postcode. We read the dataLayer postcode, verify `property-id` matches the listing, and accept it only when its outcode agrees with the coordinate-nearest postcode (via ``resolve_listing_postcode``) — exactly the trust rule the other scrapers use. Measured over a sample of real listings this yields a trustworthy, usually exact-unit postcode for ~11/12 listings; the rest safely fall back to the coordinate-nearest postcode. Detail fetching costs one extra HTTPS GET per listing, so it is gated behind ``OTM_FETCH_DETAILS`` and capped at ``OTM_MAX_DETAILS_PER_OUTCODE`` per outcode. """ import json import logging import random import re import time import httpx from constants import ( DELAY_BETWEEN_PAGES, MAX_BEDROOMS, MAX_RETRIES, ONTHEMARKET_BASE, RETRY_BASE_DELAY, ) from spatial import PostcodeSpatialIndex from transform import ( clean_listing_address, extract_full_postcode, extract_outcode, fix_coords, map_property_type, normalize_sub_type, parse_display_size, resolve_listing_postcode, ) log = logging.getLogger("rightmove") # Detail-page postcode recovery (see module docstring). When enabled, each # listing's detail page is fetched so its analytics dataLayer postcode — the # property's own full unit postcode — can replace the coordinate-nearest guess. # Bounded per outcode so a large outcode can't balloon into unbounded extra # HTTPS GETs. Kept at parity with the Rightmove/Zoopla detail caps (400) so a # typical outcode's listings all get their real postcode rather than a # coordinate-nearest guess. OTM_FETCH_DETAILS = True OTM_MAX_DETAILS_PER_OUTCODE = 400 _NEXT_DATA_RE = re.compile( r'', re.DOTALL, ) _PRICE_RE = re.compile(r"([\d,]+)") _TENURE_RE = re.compile(r"tenure:\s*(.+)", re.IGNORECASE) _HTML_HEADERS = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-GB,en;q=0.9", } # listingId -> recovered full postcode (or None). Failures are cached too so a # broken or postcode-less detail page is not re-fetched within a run (the same # listing can reappear across overlapping outcode searches). _detail_postcode_cache: dict[str, str | None] = {} def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None: """GET one search-results page and return the embedded __NEXT_DATA__ JSON. Returns None on permanent failure, missing script, or a 3xx redirect (OnTheMarket redirects out-of-range pages, so a redirect = end of results). """ url = f"{ONTHEMARKET_BASE}/for-sale/property/{outcode.lower()}/" params = {"page": str(page_num)} if page_num > 1 else None for attempt in range(MAX_RETRIES): try: resp = client.get( url, params=params, headers=_HTML_HEADERS, follow_redirects=False, ) except ( httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout, httpx.PoolTimeout, ) as exc: delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) log.warning( "%s from %s, retry %d/%d in %.1fs", type(exc).__name__, url, attempt + 1, MAX_RETRIES, delay, ) time.sleep(delay) continue if 300 <= resp.status_code < 400: log.debug( "OnTheMarket %s page %d redirected (%d) — end of results", outcode, page_num, resp.status_code, ) return None if resp.status_code == 200: match = _NEXT_DATA_RE.search(resp.text) if not match: log.warning( "No __NEXT_DATA__ in OnTheMarket %s page %d", outcode, page_num ) return None try: return json.loads(match.group(1)) except json.JSONDecodeError as exc: log.warning( "Failed to parse __NEXT_DATA__ for %s page %d: %s", outcode, page_num, exc, ) return None if resp.status_code in (429, 500, 502, 503, 504): delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) log.warning( "HTTP %d from %s, retry %d/%d in %.1fs", resp.status_code, url, attempt + 1, MAX_RETRIES, delay, ) time.sleep(delay) continue log.error("HTTP %d from %s (non-retryable)", resp.status_code, url) return None log.error("All %d retries exhausted for %s page %d", MAX_RETRIES, outcode, page_num) return None def parse_detail_postcode(html: str, listing_id: str | None = None) -> str | None: """Extract the property's own full postcode from an OnTheMarket detail page. Pure and network-free so it is unit-testable: callers pass `page.content()` / the GET body and this does the parsing. The postcode lives in the analytics dataLayer embedded in `__NEXT_DATA__` at ``props.initialReduxState.metadata.dataLayer.postcode`` and is the property's own unit postcode (e.g. "SE5 9AA"). It is deliberately NOT the agent's office postcode, which sits separately at ``…property.agent.postcode`` — the trap when scanning a detail page for "a" postcode. When ``listing_id`` is given, the dataLayer's ``property-id`` must match it, guaranteeing we read this listing's postcode and not a stray one. Returns a normalized full postcode (e.g. "SE5 9AA") or ``None`` when the page has no usable property postcode. Trust (outcode-vs-coordinates agreement) is enforced later in ``transform_property``. """ if not html: return None match = _NEXT_DATA_RE.search(html) if not match: return None try: data = json.loads(match.group(1)) except json.JSONDecodeError: return None try: data_layer = data["props"]["initialReduxState"]["metadata"]["dataLayer"] except (KeyError, TypeError): return None if not isinstance(data_layer, dict): return None # Guard against reading a different listing's postcode: the dataLayer is the # property's own analytics payload, so its property-id must match. if listing_id is not None: page_id = data_layer.get("property-id") if page_id is not None and str(page_id) != str(listing_id): return None raw_postcode = data_layer.get("postcode") if not isinstance(raw_postcode, str): return None return extract_full_postcode(raw_postcode) def _fetch_detail_postcode( client: httpx.Client, details_url: str, listing_id: str ) -> str | None: """GET one listing's detail page and return its dataLayer postcode (or None). Results (including failures) are cached by listing id so a listing that reappears across overlapping outcode searches is fetched at most once. Plain HTTPS GET — OnTheMarket detail pages have no Cloudflare challenge. Network / parse errors degrade gracefully to None so the caller falls back to the coordinate-nearest postcode. """ if listing_id in _detail_postcode_cache: return _detail_postcode_cache[listing_id] full_url = ( ONTHEMARKET_BASE + details_url if details_url and not details_url.startswith("http") else details_url ) result: str | None = None if full_url: for attempt in range(MAX_RETRIES): try: resp = client.get( full_url, headers=_HTML_HEADERS, follow_redirects=True ) except ( httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout, httpx.PoolTimeout, ) as exc: delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) log.warning( "%s from %s, retry %d/%d in %.1fs", type(exc).__name__, full_url, attempt + 1, MAX_RETRIES, delay, ) time.sleep(delay) continue if resp.status_code == 200: result = parse_detail_postcode(resp.text, listing_id) break if resp.status_code in (429, 500, 502, 503, 504): delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1) log.warning( "HTTP %d from %s, retry %d/%d in %.1fs", resp.status_code, full_url, attempt + 1, MAX_RETRIES, delay, ) time.sleep(delay) continue log.debug( "OnTheMarket detail %s returned HTTP %d (no postcode)", listing_id, resp.status_code, ) break _detail_postcode_cache[listing_id] = result return result def _parse_price(price_value) -> int: """Parse a formatted price string like '£450,000' into an integer. Returns 0 for POA/auction/null values.""" if price_value is None: return 0 if isinstance(price_value, (int, float)): return int(price_value) match = _PRICE_RE.search(str(price_value)) if not match: return 0 return int(match.group(1).replace(",", "")) def _extract_tenure(features: list) -> str | None: """Pull canonical Freehold/Leasehold out of the features list. OnTheMarket encodes tenure as 'Tenure: Leasehold (NN years remaining)' etc. 'Share of freehold' is normalised to Freehold.""" if not features: return None for feature in features: if not isinstance(feature, str): continue match = _TENURE_RE.search(feature) if not match: continue value = match.group(1).strip().lower() if "freehold" in value: return "Freehold" if "leasehold" in value: return "Leasehold" return None def _extract_floor_area(features: list) -> float | None: """Search features for a sq ft / sq m mention and return sqm.""" if not features: return None for feature in features: if not isinstance(feature, str): continue sqm = parse_display_size(feature) if sqm is not None: return sqm return None def transform_property( raw: dict, pc_index: PostcodeSpatialIndex, detail_postcode: str | None = None, ) -> dict | None: """Transform a raw OnTheMarket listing dict into our output schema. ``detail_postcode`` is the property's own full postcode recovered from its detail page (see ``parse_detail_postcode`` / ``_fetch_detail_postcode``), or ``None`` when no detail fetch was done / no postcode was found. When present and trustworthy (its outcode agrees with the coordinate-nearest postcode) it supersedes the coordinate guess and is labelled ``"detail_address"``. """ loc = raw.get("location") or {} raw_lat = loc.get("lat") raw_lng = loc.get("lon") if raw_lat is None or raw_lng is None: return None lat, lng = fix_coords(raw_lat, raw_lng) if not (49 <= lat <= 56 and -7 <= lng <= 2): return None inferred_postcode = pc_index.nearest(lat, lng) if not inferred_postcode: return None raw_address = raw.get("address", "") or "" extracted_postcode = extract_full_postcode(raw_address) # Prefer the property's own detail-page postcode when we have one and it is # trustworthy. The detail postcode is a full unit postcode (better than the # coordinate-nearest guess and than the usually outcode-only card address), # but a stale/mislabelled value would silently override the spatially # correct one, so apply the same outcode-agreement trust rule the address # postcode uses: keep it only when its outcode matches the # coordinate-nearest postcode's outcode. detail_postcode = extract_full_postcode(detail_postcode) if detail_postcode and extract_outcode(detail_postcode) == extract_outcode( inferred_postcode ): postcode, postcode_source = detail_postcode, "detail_address" else: if detail_postcode: log.debug( "OnTheMarket %s: rejecting detail postcode %s " "(outcode mismatch with inferred %s)", raw.get("id", "?"), detail_postcode, inferred_postcode, ) postcode, postcode_source = resolve_listing_postcode( extracted_postcode, inferred_postcode ) raw_beds = raw.get("bedrooms") or 0 raw_baths = raw.get("bathrooms") or 0 bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0 bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0 if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS: log.warning( "OnTheMarket %s: implausible beds=%d baths=%d (capped to 0)", raw.get("id", "?"), raw_beds, raw_baths, ) sub_type = raw.get("humanised-property-type") or "" features = raw.get("features") or [] listing_id = str(raw.get("id") or "") if not listing_id: return None details_url = raw.get("details-url") or "" full_url = ( ONTHEMARKET_BASE + details_url if details_url and not details_url.startswith("http") else details_url ) return { "id": f"otm_{listing_id}", "Bedrooms": bedrooms, "Bathrooms": bathrooms, "Number of bedrooms & living rooms": bedrooms + bathrooms, "lon": lng, "lat": lat, "Postcode": postcode, "Postcode source": postcode_source, "Extracted postcode": extracted_postcode, "Inferred postcode": inferred_postcode, "Listing raw address": raw_address, "Address per Property Register": clean_listing_address(raw_address), # OnTheMarket search JSON exposes only a street-level address; no UPRN # or house number/name is available without a detail-page fetch. "UPRN": None, "Property number or name": None, "Leasehold/Freehold": _extract_tenure(features), "Property type": map_property_type(sub_type), "Property sub-type": normalize_sub_type(sub_type), "price": _parse_price(raw.get("price")), "price_frequency": "", "Price qualifier": raw.get("price-qualifier") or "", "Total floor area (sqm)": _extract_floor_area(features), "Listing URL": full_url, "Listing features": [f for f in features if isinstance(f, str)], "first_visible_date": "", } def search_outcode( client: httpx.Client, outcode: str, pc_index: PostcodeSpatialIndex, max_properties: int | None = None, ) -> list[dict]: """Paginate through OnTheMarket sale results for one outcode. When ``OTM_FETCH_DETAILS`` is enabled, up to ``OTM_MAX_DETAILS_PER_OUTCODE`` listings per outcode have their detail page fetched for the property's own postcode (see ``_fetch_detail_postcode``); the rest fall back to the coordinate-nearest postcode. """ properties: list[dict] = [] seen_ids: set[str] = set() page_num = 1 details_fetched = 0 while True: data = _fetch_page_json(client, outcode, page_num) if data is None: break try: state = data["props"]["initialReduxState"]["results"] except (KeyError, TypeError): log.warning( "Unexpected __NEXT_DATA__ shape for %s page %d", outcode, page_num ) break raw_listings = state.get("list") or [] if not raw_listings: break for raw in raw_listings: listing_id = str(raw.get("id") or "") if listing_id and listing_id in seen_ids: continue seen_ids.add(listing_id) detail_postcode = None if OTM_FETCH_DETAILS and listing_id: # Cached lookups are free; only fresh GETs count toward the cap # and incur the inter-request delay. cached = listing_id in _detail_postcode_cache if cached or details_fetched < OTM_MAX_DETAILS_PER_OUTCODE: detail_postcode = _fetch_detail_postcode( client, raw.get("details-url") or "", listing_id ) if not cached: details_fetched += 1 time.sleep(DELAY_BETWEEN_PAGES) try: transformed = transform_property(raw, pc_index, detail_postcode) except Exception as exc: log.warning( "OnTheMarket %s property %s failed to transform: %s", outcode, listing_id or "?", exc, ) continue if transformed: properties.append(transformed) if max_properties is not None and len(properties) >= max_properties: return properties pagination = state.get("paginationControls") or {} if not pagination.get("next"): break page_num += 1 time.sleep(DELAY_BETWEEN_PAGES) return properties