Fix OpenRent scraping

This commit is contained in:
Andras Schmelczer 2026-03-15 20:15:40 +00:00
parent 96a4934b0c
commit ea8389ef40

View file

@ -6,11 +6,20 @@ impersonation to make requests with those cookies.
OpenRent is a rental-only platform, so this scraper only handles RENT channel. OpenRent is a rental-only platform, so this scraper only handles RENT channel.
HTML parsing notes: HTML structure (as of 2026-03):
OpenRent server-renders property cards in the search results page. Search results page renders property cards as <a class="pli search-property-card">.
Property cards use class "pli" (property list item). Each card contains Each card contains:
a link to the property detail page, price, bedrooms, and address info. - Monthly price in <div class="pim"> with <span class="text-primary">£X,XXX</span>
The CSS selectors below may need updating if OpenRent changes their markup. - Weekly price in <div class="piw"> (hidden by Alpine.js)
- Title in <div class="fw-medium text-primary fs-3">N Bed Type, Location, OUTCODE</div>
- Features in <ul> with <li> items like "1 Bed", "1 Bath", "Furnished"
- Listing ID in data-listing-id on the .or-swiper div
- Description snippet in <div class="line-clamp-2">
Detail page has:
- <h1> with property title including outcode
- <div id="map" data-lat="..." data-lng="..."> for coordinates
- Tables with deposit, rent, furnishing, tenant preferences
""" """
import logging import logging
@ -70,7 +79,7 @@ def solve_waf() -> tuple[dict[str, str], str] | None:
if "AwsWafIntegration" in content: if "AwsWafIntegration" in content:
log.info("Got WAF challenge page, waiting for resolution...") log.info("Got WAF challenge page, waiting for resolution...")
page.wait_for_selector( page.wait_for_selector(
"a.pli, .pli, [class*=propertyListing]", timeout=30000, "a.pli, .pli, .search-property-card", timeout=30000,
) )
raw_cookies = context.cookies() raw_cookies = context.cookies()
@ -195,6 +204,17 @@ def fetch_page(
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _extract_price_from_element(el) -> tuple[int, str] | None:
"""Extract price integer from a price element's text like '£2,100'."""
if not el:
return None
text = el.get_text(strip=True)
match = re.search(r"£([\d,]+)", text)
if not match:
return None
return int(match.group(1).replace(",", ""))
def _extract_price(text: str) -> tuple[int, str] | None: def _extract_price(text: str) -> tuple[int, str] | None:
"""Extract price and frequency from text like '£1,500 pcm' or '£350 pw'. """Extract price and frequency from text like '£1,500 pcm' or '£350 pw'.
Returns (price_int, frequency) or None. Returns (price_int, frequency) or None.
@ -227,6 +247,25 @@ def _extract_bedrooms_from_title(title: str) -> int | None:
return None return None
def _extract_beds_baths_from_features(feature_items: list) -> tuple[int | None, int | None]:
"""Extract bedrooms and bathrooms from feature list items.
OpenRent search cards have <ul> with items like:
<li>1 Bed</li> <li>1 Bath</li> <li>Furnished</li>
"""
bedrooms = None
bathrooms = None
for li in feature_items:
text = li.get_text(strip=True).lower()
bed_match = re.search(r"(\d+)\s*bed", text)
if bed_match:
bedrooms = int(bed_match.group(1))
bath_match = re.search(r"(\d+)\s*bath", text)
if bath_match:
bathrooms = int(bath_match.group(1))
return bedrooms, bathrooms
def _extract_postcode(text: str) -> str | None: def _extract_postcode(text: str) -> str | None:
"""Extract full UK postcode from text like '2 Bed Flat, Pimlico, SW1V 2AA'.""" """Extract full UK postcode from text like '2 Bed Flat, Pimlico, SW1V 2AA'."""
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE) match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
@ -235,23 +274,69 @@ def _extract_postcode(text: str) -> str | None:
return None return None
def _extract_outcode(text: str) -> str | None:
"""Extract UK outcode from text like '1 Bed Flat, Bank Chambers, SW1Y'.
Looks for an outcode pattern (e.g., SW1Y, E1, EC2A) at the end of the text
or after the last comma."""
# Try after last comma first (most reliable position in OpenRent titles)
parts = text.split(",")
if len(parts) > 1:
last_part = parts[-1].strip()
match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last_part, re.IGNORECASE)
if match:
return match.group(1).upper()
# Fall back to searching anywhere in text
match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\b", text, re.IGNORECASE)
if match:
candidate = match.group(1).upper()
# Avoid matching things like "1 Bed" → "1B"
if len(candidate) >= 2 and not candidate[0].isdigit():
return candidate
return None
def _infer_property_type(title: str) -> str:
"""Infer property type from title text.
Order matters: "Room in a Shared Flat" should be "Room" not "Flat",
so check "room" before "flat"."""
lower = title.lower()
if "room in" in lower or "room " in lower:
return "Room"
if "studio" in lower:
return "Studio"
if "flat" in lower or "apartment" in lower:
return "Flat"
if "maisonette" in lower:
return "Maisonette"
if "house" in lower:
return "House"
if "bungalow" in lower:
return "Bungalow"
return ""
def parse_search_results(html: str) -> list[dict]: def parse_search_results(html: str) -> list[dict]:
"""Parse property data from OpenRent search results HTML. """Parse property data from OpenRent search results HTML.
Returns list of raw property dicts extracted from property cards. Returns list of raw property dicts extracted from property cards.
Uses multiple fallback selectors for resilience against markup changes.
Current OpenRent card structure (2026-03):
<a class="pli search-property-card" href="/property-to-rent/.../ID">
<div class="or-swiper" data-listing-id="ID">
<div class="pim"><span class="text-primary">£2,100</span> per month</div>
<div class="piw"><span class="text-primary">£485</span> per week</div>
<div class="fw-medium text-primary fs-3">1 Bed Flat, Location, SW1Y</div>
<ul>...<li>1 Bed</li><li>1 Bath</li><li>Furnished</li>...</ul>
""" """
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
properties = [] properties = []
# Try known selectors for property cards (most specific first) # Property cards: <a class="pli search-property-card">
cards = soup.select("a.pli") cards = soup.select("a.pli")
if not cards: if not cards:
cards = soup.select(".pli.clearfix")
if not cards:
cards = soup.select("[class*='propertyListing']")
if not cards:
# Last resort: look for links that match property URL pattern
cards = soup.find_all("a", href=re.compile(r"/property-to-rent/")) cards = soup.find_all("a", href=re.compile(r"/property-to-rent/"))
if not cards: if not cards:
@ -275,36 +360,86 @@ def parse_search_results(html: str) -> list[dict]:
if id_match: if id_match:
prop["id"] = id_match.group(1) prop["id"] = id_match.group(1)
else: else:
continue # can't use a property without an ID # Try data-listing-id on the swiper element
swiper = card.select_one("[data-listing-id]")
if swiper:
prop["id"] = swiper["data-listing-id"]
else:
continue # can't use a property without an ID
# Extract card text for parsing # --- Price ---
card_text = card.get_text(" ", strip=True) # Prefer structured price elements over free-text parsing.
# Monthly price is in <div class="pim"><span class="text-primary">£X</span>
pim = card.select_one(".pim .text-primary, .pim span")
piw = card.select_one(".piw .text-primary, .piw span")
# Price monthly_price = _extract_price_from_element(pim)
price_result = _extract_price(card_text) weekly_price = _extract_price_from_element(piw)
if price_result:
prop["price"], prop["frequency"] = price_result
# Title / address — try specific elements first, fall back to card text if monthly_price:
title_el = card.select_one( prop["price"] = monthly_price
".listing-title, .banda, h2, h3, [class*='title']" prop["frequency"] = "monthly"
) elif weekly_price:
prop["title"] = ( prop["price"] = weekly_price
title_el.get_text(strip=True) if title_el prop["frequency"] = "weekly"
else card_text[:200] else:
# Fall back to parsing card text
card_text = card.get_text(" ", strip=True)
price_result = _extract_price(card_text)
if price_result:
prop["price"], prop["frequency"] = price_result
# --- Title / Address ---
# The property title is in a div with classes "fw-medium text-primary fs-3"
# e.g., "1 Bed Flat, Bank Chambers, SW1Y"
title_el = card.select_one("div.fw-medium.fs-3")
if not title_el:
# Fallback: try image alt text which also has the title
img = card.select_one("img.propertyPic")
if img and img.get("alt"):
prop["title"] = img["alt"]
else:
# Last resort: extract from card text, excluding price/nav noise
prop["title"] = ""
else:
prop["title"] = title_el.get_text(strip=True)
# --- Bedrooms / Bathrooms from feature list ---
feature_list = card.select("ul li")
beds_from_features, baths_from_features = _extract_beds_baths_from_features(
feature_list,
) )
# Bedrooms from title text # Bedrooms: prefer feature list, fall back to title parsing
beds = _extract_bedrooms_from_title(prop["title"]) if beds_from_features is not None:
if beds is not None: prop["bedrooms"] = beds_from_features
prop["bedrooms"] = beds else:
beds = _extract_bedrooms_from_title(prop.get("title", ""))
if beds is not None:
prop["bedrooms"] = beds
# Postcode from title if baths_from_features is not None:
postcode = _extract_postcode(prop["title"]) prop["bathrooms"] = baths_from_features
# --- Property type from title ---
title = prop.get("title", "")
prop["property_type"] = _infer_property_type(title)
# --- Postcode / outcode from title ---
postcode = _extract_postcode(title)
if postcode: if postcode:
prop["postcode"] = postcode prop["postcode"] = postcode
else:
outcode = _extract_outcode(title)
if outcode:
prop["outcode"] = outcode
# Coordinates from data attributes (if present on card or child elements) # --- Description snippet ---
desc_el = card.select_one(".line-clamp-2")
if desc_el:
prop["description"] = desc_el.get_text(strip=True)
# --- Coordinates from data attributes (may not be present on cards) ---
for el in [card] + card.select("[data-lat], [data-latitude]"): for el in [card] + card.select("[data-lat], [data-latitude]"):
lat = el.get("data-lat") or el.get("data-latitude") lat = el.get("data-lat") or el.get("data-latitude")
lng = ( lng = (
@ -329,62 +464,35 @@ def parse_search_results(html: str) -> list[dict]:
def parse_property_detail(html: str) -> dict: def parse_property_detail(html: str) -> dict:
"""Parse a single property detail page for additional data. """Parse a single property detail page for additional data.
Extracts: bedrooms, bathrooms, price, property_type, postcode, Current detail page structure (2026-03):
lat/lng (from map data), description (for floor area). - <h1> has the full title (e.g., "Room in a Shared House, Lime Tree Court, AL2")
- <div id="map" data-lat="..." data-lng="..."> has coordinates
- Tables have "Rent PCM", "Deposit", "Bills Included", etc. (NOT bedrooms)
- Description in elements with class containing "description"
""" """
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
details: dict = {} details: dict = {}
# Parse structured data tables (class "table table-striped") # --- Title from h1 ---
for table in soup.select("table.table-striped, table.table"): h1 = soup.select_one("h1")
for row in table.select("tr"): if h1:
cells = row.select("td, th") title_text = h1.get_text(strip=True)
if len(cells) < 2: # Validate it's not a nav/modal element (e.g. "Log in")
continue if len(title_text) > 10 and "log in" not in title_text.lower():
label = cells[0].get_text(strip=True).lower() details["title"] = title_text
value = cells[1].get_text(strip=True) postcode = _extract_postcode(title_text)
if postcode:
details["postcode"] = postcode
if "bedroom" in label: # --- Coordinates from map element ---
match = re.search(r"(\d+)", value) # The map div has id="map" with data-lat and data-lng
if match: map_el = soup.select_one("#map[data-lat]")
details["bedrooms"] = int(match.group(1)) if not map_el:
elif "bathroom" in label: # Fallback: any element with data-lat (but prefer #map)
match = re.search(r"(\d+)", value) map_el = soup.select_one("[data-lat]")
if match:
details["bathrooms"] = int(match.group(1))
elif "rent" in label or "price" in label:
match = re.search(r"£([\d,]+)", value)
if match:
details["price"] = int(match.group(1).replace(",", ""))
elif "type" in label:
details["property_type"] = value
elif "available" in label or "move" in label:
details["available_date"] = value
elif "furnish" in label:
details["furnished"] = value
# Extract postcode from page title / address heading.
# Prefer h1 (the actual property title) over generic [class*='title']
# which may match nav elements like "Log in".
title_tag = soup.select_one("h1")
if not title_tag:
title_tag = soup.select_one(".property-title, [class*='title']")
if title_tag:
title_text = title_tag.get_text(strip=True)
details["title"] = title_text
postcode = _extract_postcode(title_text)
if postcode:
details["postcode"] = postcode
# Extract coordinates from map element data attributes
map_el = soup.select_one("[data-lat], [data-latitude]")
if map_el: if map_el:
lat = map_el.get("data-lat") or map_el.get("data-latitude") lat = map_el.get("data-lat")
lng = ( lng = map_el.get("data-lng") or map_el.get("data-lon")
map_el.get("data-lng")
or map_el.get("data-longitude")
or map_el.get("data-lon")
)
if lat and lng: if lat and lng:
try: try:
details["lat"] = float(lat) details["lat"] = float(lat)
@ -392,7 +500,35 @@ def parse_property_detail(html: str) -> dict:
except ValueError: except ValueError:
pass pass
# Also check for coordinates in JSON-LD or inline JavaScript # --- Parse tables for rent and property details ---
for table in soup.select("table"):
for row in table.select("tr"):
cells = row.select("td")
if len(cells) < 2:
continue
label = cells[0].get_text(strip=True).lower()
value = cells[1].get_text(strip=True)
if "rent" in label and "pcm" in label:
match = re.search(r"£([\d,]+)", value)
if match:
details["price"] = int(match.group(1).replace(",", ""))
elif "bedroom" in label:
match = re.search(r"(\d+)", value)
if match:
details["bedrooms"] = int(match.group(1))
elif "bathroom" in label:
match = re.search(r"(\d+)", value)
if match:
details["bathrooms"] = int(match.group(1))
elif "type" in label and "property" in label:
details["property_type"] = value
elif "available" in label or "move" in label:
details["available_date"] = value
elif "furnish" in label:
details["furnished"] = value
# --- Coordinates from inline JavaScript (last resort) ---
if "lat" not in details: if "lat" not in details:
for script in soup.select("script"): for script in soup.select("script"):
text = script.string or "" text = script.string or ""
@ -406,7 +542,7 @@ def parse_property_detail(html: str) -> dict:
pass pass
break break
# Extract description for floor area parsing # --- Description for floor area ---
desc_el = soup.select_one( desc_el = soup.select_one(
".description, [class*='description'], #description" ".description, [class*='description'], #description"
) )
@ -464,6 +600,16 @@ def parse_floor_area(description: str | None) -> float | None:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _resolve_outcode_postcodes(
outcode: str,
pc_coords: dict[str, tuple[float, float]],
) -> list[str]:
"""Get all postcodes for an outcode from the postcode coordinates lookup."""
prefix = outcode + " "
# Also try without space for non-standard format (e.g. "SW1Y" matches "SW1Y 4AA")
return [pcd for pcd in pc_coords if pcd.startswith(prefix)]
def transform_property( def transform_property(
search_data: dict, search_data: dict,
detail_data: dict | None, detail_data: dict | None,
@ -473,7 +619,7 @@ def transform_property(
"""Transform OpenRent property data into our output schema. """Transform OpenRent property data into our output schema.
Merges data from the search results page and (optionally) the detail page. Merges data from the search results page and (optionally) the detail page.
Uses pc_coords (postcode lat/lng) as a fallback when coordinates are Uses pc_coords (postcode -> lat/lng) as a fallback when coordinates are
missing but a postcode is available. missing but a postcode is available.
""" """
detail = detail_data or {} detail = detail_data or {}
@ -487,7 +633,7 @@ def transform_property(
frequency = search_data.get("frequency", "monthly") frequency = search_data.get("frequency", "monthly")
# Get postcode: detail page > search card > spatial index # Get postcode: detail page > search card
postcode = detail.get("postcode") or search_data.get("postcode") postcode = detail.get("postcode") or search_data.get("postcode")
if lat is not None and lng is not None: if lat is not None and lng is not None:
@ -496,7 +642,15 @@ def transform_property(
log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng) log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
return None return None
if not postcode: if not postcode:
postcode = pc_index.nearest(lat, lng) if pc_index:
postcode = pc_index.nearest(lat, lng)
elif search_data.get("outcode"):
# No spatial index — try outcode lookup as fallback
outcode_pcs = _resolve_outcode_postcodes(
search_data["outcode"], pc_coords,
)
if outcode_pcs:
postcode = outcode_pcs[0]
elif postcode: elif postcode:
# Have postcode but no coordinates — look up centroid from arcgis data # Have postcode but no coordinates — look up centroid from arcgis data
coords = pc_coords.get(postcode) coords = pc_coords.get(postcode)
@ -505,6 +659,17 @@ def transform_property(
else: else:
log.debug("Postcode %s not in arcgis data — skipping", postcode) log.debug("Postcode %s not in arcgis data — skipping", postcode)
return None return None
elif search_data.get("outcode"):
# Have only outcode — find postcodes in that outcode and use centroid
outcode = search_data["outcode"]
outcode_postcodes = _resolve_outcode_postcodes(outcode, pc_coords)
if outcode_postcodes:
# Use the first postcode as a rough approximation
postcode = outcode_postcodes[0]
lat, lng = pc_coords[postcode]
else:
log.debug("No postcodes found for outcode %s — skipping", outcode)
return None
else: else:
return None return None
@ -513,37 +678,45 @@ def transform_property(
return None return None
bedrooms = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0 bedrooms = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
bathrooms = detail.get("bathrooms", 0) or 0 bathrooms = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
# Title: prefer detail page (has h1 with full title)
title = detail.get("title") or search_data.get("title", "") title = detail.get("title") or search_data.get("title", "")
address = title.split(",")[0].strip() if title else ""
property_type = detail.get("property_type", "") # Address: take the middle part of the title (skip the "N Bed Type" prefix
# Infer from title if not found in detail page # and the outcode suffix). E.g., "1 Bed Flat, Bank Chambers, SW1Y" -> "Bank Chambers"
address = ""
if title:
parts = [p.strip() for p in title.split(",")]
if len(parts) >= 3:
# Skip first (type) and last (outcode), join the middle
address = ", ".join(parts[1:-1])
elif len(parts) == 2:
# Could be "Location, OUTCODE" or "Type, Location"
# If last part looks like an outcode, use the first part
if re.match(r"^[A-Z]{1,2}\d", parts[-1].strip()):
address = parts[0]
else:
address = parts[1]
else:
address = title
# Property type: prefer detail, then search card, then infer from title
property_type = detail.get("property_type") or search_data.get("property_type", "")
if not property_type and title: if not property_type and title:
lower = title.lower() property_type = _infer_property_type(title)
if "flat" in lower or "apartment" in lower:
property_type = "Flat"
elif "studio" in lower:
property_type = "Studio"
elif "maisonette" in lower:
property_type = "Maisonette"
elif "house" in lower:
property_type = "House"
elif "room" in lower:
property_type = "Room"
prop_id = search_data.get("id", "") prop_id = search_data.get("id", "")
listing_url = search_data.get( listing_url = search_data.get(
"url", f"{OPENRENT_BASE}/{prop_id}" if prop_id else "", "url", f"{OPENRENT_BASE}/{prop_id}" if prop_id else "",
) )
description = detail.get("description", "") description = detail.get("description") or search_data.get("description", "")
return { return {
"id": f"or_{prop_id}", "id": f"or_{prop_id}",
"Bedrooms": bedrooms, "Bedrooms": bedrooms,
"Bathrooms": bathrooms, "Bathrooms": bathrooms,
"Number of bedrooms & living rooms": bedrooms + bathrooms, "Number of bedrooms & living rooms": bedrooms,
"lon": lng, "lon": lng,
"lat": lat, "lat": lat,
"Postcode": postcode, "Postcode": postcode,
@ -571,14 +744,14 @@ def search_outcode(
"""Search OpenRent for rental properties in one outcode. """Search OpenRent for rental properties in one outcode.
1. Fetches the search results page for the outcode 1. Fetches the search results page for the outcode
2. Parses property cards from the HTML 2. Parses property cards from the HTML (title, price, beds, baths)
3. Optionally fetches each property's detail page for full data 3. Fetches each property's detail page for coordinates
4. Transforms to common output schema 4. Transforms to common output schema
Args: The search card provides most data (price, bedrooms, bathrooms, title,
fetch_details: If True, visits each property's detail page for property type). Detail pages are needed primarily for precise coordinates
coordinates and extra data. Slower but more complete. and full postcodes. When detail pages fail, we fall back to outcode-level
If False, relies only on search card data + postcode lookup. coordinates from the postcode lookup.
""" """
search_url = f"{OPENRENT_BASE}/properties-to-rent/?term={outcode}&isLive=true" search_url = f"{OPENRENT_BASE}/properties-to-rent/?term={outcode}&isLive=true"