has issues
This commit is contained in:
parent
2e112d7398
commit
c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions
282
finder/onthemarket.py
Normal file
282
finder/onthemarket.py
Normal file
|
|
@ -0,0 +1,282 @@
|
|||
"""OnTheMarket (onthemarket.com) scraper — sale properties.
|
||||
|
||||
OnTheMarket serves a Next.js app with the full search-results payload embedded
|
||||
as JSON in a `__NEXT_DATA__` script tag. No JS execution or browser needed:
|
||||
plain HTTP with a Chrome-ish User-Agent is enough.
|
||||
|
||||
Each rendered page contains 30 listings under
|
||||
`props.initialReduxState.results.list`, each with `location.{lat,lon}`,
|
||||
`bedrooms`, `bathrooms`, `price` (formatted £-string), `address`,
|
||||
`humanised-property-type`, `features` (a list where the first element is
|
||||
typically `"Tenure: <value>"`), and `details-url`. Pagination is via
|
||||
`?page=N`; the loop terminates when `paginationControls.next` is null.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
|
||||
import httpx
|
||||
|
||||
from constants import (
|
||||
DELAY_BETWEEN_PAGES,
|
||||
MAX_BEDROOMS,
|
||||
MAX_RETRIES,
|
||||
ONTHEMARKET_BASE,
|
||||
RETRY_BASE_DELAY,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import (
|
||||
fix_coords,
|
||||
map_property_type,
|
||||
normalize_sub_type,
|
||||
parse_display_size,
|
||||
)
|
||||
|
||||
# NOTE(review): the logger is named "rightmove" although this module scrapes
# OnTheMarket — presumably a logger shared with a sibling scraper; confirm
# before renaming, since logging configuration may be keyed on this name.
log = logging.getLogger("rightmove")

# Captures the JSON body of the `__NEXT_DATA__` script tag. Extra attributes
# (e.g. a nonce) and either attribute order are tolerated so a minor markup
# change does not silently yield "no data".
_NEXT_DATA_RE = re.compile(
    r'<script\b[^>]*\sid="__NEXT_DATA__"[^>]*>(.*?)</script>',
    re.DOTALL,
)
# A run of digits with optional thousands separators. It must START with a
# digit so that a stray comma in prose ("POA, ...") is never captured on its
# own, which would later make int("") raise.
_PRICE_RE = re.compile(r"(\d[\d,]*)")
# Captures everything after a case-insensitive "tenure:" prefix.
_TENURE_RE = re.compile(r"tenure:\s*(.+)", re.IGNORECASE)

# Headers sent with every search-page request.
_HTML_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
}
|
||||
|
||||
|
||||
def _fetch_page_json(client: httpx.Client, outcode: str, page_num: int) -> dict | None:
    """GET one search-results page and return the embedded __NEXT_DATA__ JSON.

    Args:
        client: HTTP client used to issue the request.
        outcode: Outcode to search; lower-cased when building the URL.
        page_num: 1-based page number. Page 1 is requested without a
            ``page`` query parameter.

    Returns:
        The decoded ``__NEXT_DATA__`` payload, or None on a 3xx redirect
        (OnTheMarket redirects out-of-range pages, so a redirect = end of
        results), a missing or unparseable script tag, a non-retryable HTTP
        status, or once all ``MAX_RETRIES`` attempts have failed.
    """
    url = f"{ONTHEMARKET_BASE}/for-sale/property/{outcode.lower()}/"
    params = {"page": str(page_num)} if page_num > 1 else None

    for attempt in range(MAX_RETRIES):
        try:
            resp = client.get(
                url,
                params=params,
                headers=_HTML_HEADERS,
                # Redirects are a signal here, not something to follow.
                follow_redirects=False,
            )
        except (httpx.ConnectError, httpx.TimeoutException) as exc:
            # TimeoutException is the common base of the connect/read/write/
            # pool timeouts, so a slow TCP connect is retried like the rest
            # instead of escaping the retry loop.
            failure = type(exc).__name__
        else:
            status = resp.status_code

            if 300 <= status < 400:
                log.debug(
                    "OnTheMarket %s page %d redirected (%d) — end of results",
                    outcode, page_num, status,
                )
                return None

            if status == 200:
                match = _NEXT_DATA_RE.search(resp.text)
                if not match:
                    log.warning(
                        "No __NEXT_DATA__ in OnTheMarket %s page %d", outcode, page_num
                    )
                    return None
                try:
                    return json.loads(match.group(1))
                except json.JSONDecodeError as exc:
                    log.warning(
                        "Failed to parse __NEXT_DATA__ for %s page %d: %s",
                        outcode, page_num, exc,
                    )
                    return None

            if status not in (429, 500, 502, 503, 504):
                log.error("HTTP %d from %s (non-retryable)", status, url)
                return None

            failure = f"HTTP {status}"

        # Shared retry path: exponential backoff plus up to 1s of jitter.
        delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
        log.warning(
            "%s from %s, retry %d/%d in %.1fs",
            failure, url, attempt + 1, MAX_RETRIES, delay,
        )
        time.sleep(delay)

    log.error("All %d retries exhausted for %s page %d", MAX_RETRIES, outcode, page_num)
    return None
|
||||
|
||||
|
||||
def _parse_price(price_value) -> int:
    """Parse a formatted price such as '£450,000' into an integer.

    Args:
        price_value: None, a number, or any value whose ``str()`` may contain
            a formatted price.

    Returns:
        The price as an int. Numbers are truncated with ``int()``. Returns 0
        for None and for text with no digits (POA/auction/null values).
    """
    if price_value is None:
        return 0
    if isinstance(price_value, (int, float)):
        return int(price_value)

    # Use the first token that actually contains digits. A token made only of
    # separators (e.g. the comma in "POA, £450,000") is skipped rather than
    # being passed to int(""), which would raise ValueError.
    for match in _PRICE_RE.finditer(str(price_value)):
        digits = match.group(1).replace(",", "")
        if digits:
            return int(digits)
    return 0
|
||||
|
||||
|
||||
def _extract_tenure(features: list) -> str | None:
|
||||
"""Pull canonical Freehold/Leasehold out of the features list.
|
||||
OnTheMarket encodes tenure as 'Tenure: Leasehold (NN years remaining)' etc.
|
||||
'Share of freehold' is normalised to Freehold."""
|
||||
if not features:
|
||||
return None
|
||||
for feature in features:
|
||||
if not isinstance(feature, str):
|
||||
continue
|
||||
match = _TENURE_RE.search(feature)
|
||||
if not match:
|
||||
continue
|
||||
value = match.group(1).strip().lower()
|
||||
if "freehold" in value:
|
||||
return "Freehold"
|
||||
if "leasehold" in value:
|
||||
return "Leasehold"
|
||||
return None
|
||||
|
||||
|
||||
def _extract_floor_area(features: list) -> float | None:
|
||||
"""Search features for a sq ft / sq m mention and return sqm."""
|
||||
if not features:
|
||||
return None
|
||||
for feature in features:
|
||||
if not isinstance(feature, str):
|
||||
continue
|
||||
sqm = parse_display_size(feature)
|
||||
if sqm is not None:
|
||||
return sqm
|
||||
return None
|
||||
|
||||
|
||||
def transform_property(
    raw: dict, pc_index: PostcodeSpatialIndex
) -> dict | None:
    """Transform a raw OnTheMarket listing dict into our output schema.

    Args:
        raw: One listing entry from the search-results payload.
        pc_index: Spatial index used to resolve the listing's coordinates to
            a postcode via ``nearest(lat, lng)``.

    Returns:
        The output record, or None when the listing has no coordinates, no
        id, coordinates outside the accepted bounding box, or no postcode
        can be resolved.
    """
    loc = raw.get("location") or {}
    raw_lat = loc.get("lat")
    raw_lng = loc.get("lon")
    if raw_lat is None or raw_lng is None:
        return None

    # Cheap validation first: a listing without an id is unusable, so reject
    # it before paying for coordinate fixing and the spatial lookup.
    listing_id = str(raw.get("id") or "")
    if not listing_id:
        return None

    lat, lng = fix_coords(raw_lat, raw_lng)
    # Bounding box: latitude 49..56, longitude -7..2.
    if not (49 <= lat <= 56 and -7 <= lng <= 2):
        return None

    postcode = pc_index.nearest(lat, lng)
    if not postcode:
        return None

    raw_beds = raw.get("bedrooms") or 0
    raw_baths = raw.get("bathrooms") or 0
    # Out-of-range counts are zeroed; MAX_BEDROOMS is the cap for both fields.
    bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
    if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
        log.warning(
            "OnTheMarket %s: implausible beds=%d baths=%d (capped to 0)",
            raw.get("id", "?"), raw_beds, raw_baths,
        )

    sub_type = raw.get("humanised-property-type") or ""
    features = raw.get("features") or []

    # Relative detail URLs are made absolute; absolute or empty ones are
    # passed through unchanged.
    details_url = raw.get("details-url") or ""
    full_url = (
        ONTHEMARKET_BASE + details_url
        if details_url and not details_url.startswith("http")
        else details_url
    )

    return {
        "id": f"otm_{listing_id}",
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        # NOTE(review): this sums bedrooms and bathrooms although the key
        # says "living rooms" — confirm this is the intended value.
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        # `or ""` also covers an explicit null address in the payload.
        "Address per Property Register": raw.get("address") or "",
        "Leasehold/Freehold": _extract_tenure(features),
        "Property type": map_property_type(sub_type),
        "Property sub-type": normalize_sub_type(sub_type),
        "price": _parse_price(raw.get("price")),
        "price_frequency": "",
        "Price qualifier": raw.get("price-qualifier") or "",
        "Total floor area (sqm)": _extract_floor_area(features),
        "Listing URL": full_url,
        "Listing features": [f for f in features if isinstance(f, str)],
        "first_visible_date": "",
    }
|
||||
|
||||
|
||||
def search_outcode(
    client: httpx.Client,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Paginate through OnTheMarket sale results for one outcode.

    Args:
        client: HTTP client passed through to the page fetcher.
        outcode: Outcode to search.
        pc_index: Spatial index passed through to ``transform_property``.
        max_properties: Optional cap; collection stops as soon as this many
            listings have been transformed successfully.

    Returns:
        Transformed listings in page order, de-duplicated by listing id.
        Pagination stops on a failed fetch, an unexpected payload shape, an
        empty page, or when the payload reports no next page.
    """
    properties: list[dict] = []
    seen_ids: set[str] = set()
    page_num = 1

    while True:
        data = _fetch_page_json(client, outcode, page_num)
        if data is None:
            break

        try:
            state = data["props"]["initialReduxState"]["results"]
        except (KeyError, TypeError):
            state = None
        # A null or non-dict "results" is as unusable as a missing key;
        # treat it the same instead of failing on `.get` below.
        if not isinstance(state, dict):
            log.warning(
                "Unexpected __NEXT_DATA__ shape for %s page %d", outcode, page_num
            )
            break

        raw_listings = state.get("list") or []
        if not raw_listings:
            break

        for raw in raw_listings:
            if not isinstance(raw, dict):
                # One malformed entry must not abort the whole outcode.
                log.warning(
                    "OnTheMarket %s page %d: skipping malformed listing entry",
                    outcode, page_num,
                )
                continue

            listing_id = str(raw.get("id") or "")
            if listing_id:
                if listing_id in seen_ids:
                    continue
                seen_ids.add(listing_id)

            try:
                transformed = transform_property(raw, pc_index)
            except Exception as exc:
                # Per-listing boundary: log and move on so one bad record
                # does not lose the rest of the page.
                log.warning(
                    "OnTheMarket %s property %s failed to transform: %s",
                    outcode, listing_id or "?", exc,
                )
                continue

            if transformed:
                properties.append(transformed)
                if max_properties is not None and len(properties) >= max_properties:
                    return properties

        pagination = state.get("paginationControls") or {}
        if not isinstance(pagination, dict) or not pagination.get("next"):
            break

        page_num += 1
        time.sleep(DELAY_BETWEEN_PAGES)

    return properties
|
||||
Loading…
Add table
Add a link
Reference in a new issue