Add back finder
This commit is contained in:
parent
5e5d9f9a1c
commit
48c13fbcdd
23 changed files with 57347 additions and 0 deletions
869
finder/openrent.py
Normal file
869
finder/openrent.py
Normal file
|
|
@ -0,0 +1,869 @@
|
|||
"""OpenRent (openrent.co.uk) scraper — rental properties only.
|
||||
|
||||
OpenRent is behind AWS WAF, so we use Playwright (headless Chromium) to solve
|
||||
the challenge and get valid cookies. Then we use curl_cffi with Chrome TLS
|
||||
impersonation to make requests with those cookies.
|
||||
|
||||
OpenRent is a rental-only platform, so this scraper only handles RENT channel.
|
||||
|
||||
HTML structure (as of 2026-03):
|
||||
Search results page renders property cards as <a class="pli search-property-card">.
|
||||
Each card contains:
|
||||
- Monthly price in <div class="pim"> with <span class="text-primary">£X,XXX</span>
|
||||
- Weekly price in <div class="piw"> (hidden by Alpine.js)
|
||||
- Title in <div class="fw-medium text-primary fs-3">N Bed Type, Location, OUTCODE</div>
|
||||
- Features in <ul> with <li> items like "1 Bed", "1 Bath", "Furnished"
|
||||
- Listing ID in data-listing-id on the .or-swiper div
|
||||
- Description snippet in <div class="line-clamp-2">
|
||||
|
||||
Detail page has:
|
||||
- <h1> with property title including outcode
|
||||
- <div id="map" data-lat="..." data-lng="..."> for coordinates
|
||||
- Tables with deposit, rent, furnishing, tenant preferences
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from curl_cffi.requests import Session
|
||||
from curl_cffi.requests.errors import RequestsError
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
from constants import (
|
||||
DELAY_BETWEEN_PAGES,
|
||||
MAX_BEDROOMS,
|
||||
OPENRENT_BASE,
|
||||
PROPERTY_TYPE_MAP,
|
||||
RETRY_BASE_DELAY,
|
||||
)
|
||||
from metrics import (
|
||||
flaresolverr_attempts_total,
|
||||
openrent_errors_total,
|
||||
openrent_properties_scraped,
|
||||
openrent_requests_total,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
|
||||
|
||||
log = logging.getLogger("openrent")
|
||||
|
||||
|
||||
class WafChallengeError(Exception):
    """Signal that OpenRent answered with an AWS WAF challenge.

    Seeing this error means the current cookies need to be refreshed before
    further requests are attempted.
    """
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cookie / session management via Playwright
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def solve_waf() -> tuple[dict[str, str], str] | None:
    """Obtain AWS WAF cookies by loading OpenRent in headless Chromium.

    Opens a London search page with Playwright, waits for the property cards
    to appear if the WAF challenge page is served first, then collects the
    browser context's cookies and the browser's user-agent string.

    Returns:
        ``(cookies, user_agent)`` when an ``aws-waf-token`` cookie was
        obtained, otherwise ``None`` (missing token or any exception).
        Each outcome is recorded on ``flaresolverr_attempts_total``.
    """
    log.info("Solving AWS WAF challenge via Playwright")
    try:
        with sync_playwright() as playwright:
            chromium = playwright.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-blink-features=AutomationControlled"],
            )
            browser_ctx = chromium.new_context()
            tab = browser_ctx.new_page()

            landing_url = f"{OPENRENT_BASE}/properties-to-rent/?term=london&isLive=true"
            log.info("Navigating to %s", landing_url)
            tab.goto(landing_url, wait_until="domcontentloaded", timeout=60000)

            # The challenge page embeds "AwsWafIntegration"; once it resolves
            # the real search results (property cards) are rendered.
            if "AwsWafIntegration" in tab.content():
                log.info("Got WAF challenge page, waiting for resolution...")
                tab.wait_for_selector(
                    "a.pli, .pli, .search-property-card",
                    timeout=30000,
                )

            jar = browser_ctx.cookies()
            agent = tab.evaluate("navigator.userAgent")
            chromium.close()

        cookie_map = {}
        for entry in jar:
            cookie_map[entry["name"]] = entry["value"]

        if "aws-waf-token" in cookie_map:
            log.info(
                "AWS WAF solved — got %d cookies, UA: %s",
                len(cookie_map),
                agent[:60],
            )
            flaresolverr_attempts_total.labels(result="success").inc()
            return cookie_map, agent

        log.error("Playwright solved page but no aws-waf-token cookie found")
        flaresolverr_attempts_total.labels(result="no_cookies").inc()
        return None

    except Exception as exc:
        # Best-effort: any browser/navigation failure is reported as "no result".
        log.error("Playwright WAF solve failed: %s", exc)
        flaresolverr_attempts_total.labels(result="error").inc()
        return None
|
||||
|
||||
|
||||
def load_cookies() -> tuple[dict[str, str], str] | None:
    """Return OpenRent cookies and a matching user-agent, if obtainable.

    The Playwright solver is tried first. If it yields nothing, the
    ``OPENRENT_WAF_TOKEN`` environment variable is used as the
    ``aws-waf-token`` cookie, paired with ``OPENRENT_USER_AGENT`` (or a
    built-in Chrome user-agent string when that variable is unset).

    Returns:
        ``(cookies, user_agent)`` or ``None`` when neither source works.
    """
    solved = solve_waf()
    if solved:
        return solved

    # Fall back to environment variables.
    token = os.environ.get("OPENRENT_WAF_TOKEN", "")
    if token:
        default_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/145.0.0.0 Safari/537.36"
        )
        agent = os.environ.get("OPENRENT_USER_AGENT", default_agent)
        return {"aws-waf-token": token}, agent
    return None
|
||||
|
||||
|
||||
def make_client(cookies: dict[str, str], user_agent: str) -> Session:
    """Build a curl_cffi ``Session`` preloaded for OpenRent requests.

    The session impersonates Chrome, sends the given user-agent together with
    browser-like ``Accept`` headers, and carries every supplied cookie scoped
    to the ``openrent.co.uk`` domain.
    """
    browser_headers = {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-GB,en;q=0.9",
    }
    client = Session(impersonate="chrome")
    client.headers.update(browser_headers)
    for cookie_name in cookies:
        client.cookies.set(cookie_name, cookies[cookie_name], domain="openrent.co.uk")
    return client
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTTP fetch with retry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _status_label(code: int) -> str:
|
||||
if code >= 500:
|
||||
return "5xx"
|
||||
return str(code)
|
||||
|
||||
|
||||
def fetch_page(
    client: Session,
    url: str,
    max_retries: int = 3,
) -> str | None:
    """GET a page as HTML, retrying transient failures with exponential back-off.

    Args:
        client: Session used to issue the request.
        url: Absolute URL to fetch.
        max_retries: Total number of attempts before giving up.

    Returns:
        The response body on HTTP 200, or ``None`` on a non-retryable status
        or once every attempt has failed.

    Raises:
        WafChallengeError: On HTTP 202/403, or on a 200 response whose body
            contains the AWS WAF challenge markers.
    """
    retryable_statuses = (429, 500, 502, 503, 504)

    for attempt in range(max_retries):
        backoff = RETRY_BASE_DELAY * (2**attempt)
        try:
            resp = client.get(url, timeout=30)
            status = resp.status_code
            openrent_requests_total.labels(status=_status_label(status)).inc()

            if status == 200:
                html = resp.text
                # Detect WAF challenge page masquerading as 200
                if "AwsWafIntegration" in html and "challenge.js" in html:
                    raise WafChallengeError(
                        "Got AWS WAF challenge page — cookies expired"
                    )
                return html

            if status in (202, 403):
                raise WafChallengeError(
                    f"HTTP {status} — cookies likely expired"
                )

            if status not in retryable_statuses:
                log.error("HTTP %d from %s (non-retryable)", status, url)
                return None

            log.warning(
                "HTTP %d from %s, retry %d/%d in %.1fs",
                status,
                url,
                attempt + 1,
                max_retries,
                backoff,
            )
        except RequestsError as e:
            # WafChallengeError is not a RequestsError, so it propagates
            # to the caller untouched.
            openrent_errors_total.labels(type=type(e).__name__).inc()
            log.warning(
                "%s from %s, retry %d/%d in %.1fs",
                type(e).__name__,
                url,
                attempt + 1,
                max_retries,
                backoff,
            )

        # Only back off when another attempt will actually follow; sleeping
        # after the final failure just delays the caller.
        if attempt + 1 < max_retries:
            time.sleep(backoff)

    openrent_errors_total.labels(type="retry_exhausted").inc()
    log.error("All %d retries exhausted for %s", max_retries, url)
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTML parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _extract_price_from_element(el) -> tuple[int, str] | None:
|
||||
"""Extract price integer from a price element's text like '£2,100'."""
|
||||
if not el:
|
||||
return None
|
||||
text = el.get_text(strip=True)
|
||||
match = re.search(r"£([\d,]+)", text)
|
||||
if not match:
|
||||
return None
|
||||
return int(match.group(1).replace(",", ""))
|
||||
|
||||
|
||||
def _extract_price(text: str) -> tuple[int, str] | None:
|
||||
"""Extract price and frequency from text like '£1,500 pcm' or '£350 pw'.
|
||||
Returns (price_int, frequency) or None.
|
||||
|
||||
OpenRent card text shows both monthly and weekly prices (e.g.
|
||||
'£2,800 per month £646 per week'), so check monthly *before* weekly
|
||||
to match the first (monthly) price that the regex captures."""
|
||||
match = re.search(r"£([\d,]+)", text)
|
||||
if not match:
|
||||
return None
|
||||
price = int(match.group(1).replace(",", ""))
|
||||
lower = text.lower()
|
||||
if "pcm" in lower or "per month" in lower or "/m" in lower:
|
||||
return price, "monthly"
|
||||
if "pw" in lower or "per week" in lower or "/w" in lower:
|
||||
return price, "weekly"
|
||||
if "pa" in lower or "per annum" in lower or "/y" in lower:
|
||||
return price, "yearly"
|
||||
# OpenRent defaults to pcm (per calendar month)
|
||||
return price, "monthly"
|
||||
|
||||
|
||||
def _extract_bedrooms_from_title(title: str) -> int | None:
|
||||
"""Extract bedroom count from title like '2 Bed Flat, Pimlico'."""
|
||||
match = re.search(r"(\d+)\s*bed", title, re.IGNORECASE)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
if re.search(r"\bstudio\b", title, re.IGNORECASE):
|
||||
return 0
|
||||
return None
|
||||
|
||||
|
||||
def _extract_beds_baths_from_features(
|
||||
feature_items: list,
|
||||
) -> tuple[int | None, int | None]:
|
||||
"""Extract bedrooms and bathrooms from feature list items.
|
||||
|
||||
OpenRent search cards have <ul> with items like:
|
||||
<li>1 Bed</li> <li>1 Bath</li> <li>Furnished</li>
|
||||
"""
|
||||
bedrooms = None
|
||||
bathrooms = None
|
||||
for li in feature_items:
|
||||
text = li.get_text(strip=True).lower()
|
||||
bed_match = re.search(r"(\d+)\s*bed", text)
|
||||
if bed_match:
|
||||
bedrooms = int(bed_match.group(1))
|
||||
bath_match = re.search(r"(\d+)\s*bath", text)
|
||||
if bath_match:
|
||||
bathrooms = int(bath_match.group(1))
|
||||
return bedrooms, bathrooms
|
||||
|
||||
|
||||
def _extract_postcode(text: str) -> str | None:
|
||||
"""Extract full UK postcode from text like '2 Bed Flat, Pimlico, SW1V 2AA'.
|
||||
Normalizes to include a space before the 3-char incode."""
|
||||
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
|
||||
if match:
|
||||
raw = match.group(1).upper().strip()
|
||||
# Ensure space before incode (last 3 chars): "IP265AT" → "IP26 5AT"
|
||||
if " " not in raw and len(raw) >= 5:
|
||||
return raw[:-3] + " " + raw[-3:]
|
||||
return raw
|
||||
return None
|
||||
|
||||
|
||||
def _extract_outcode(text: str) -> str | None:
|
||||
"""Extract UK outcode from text like '1 Bed Flat, Bank Chambers, SW1Y'.
|
||||
|
||||
Looks for an outcode pattern (e.g., SW1Y, E1, EC2A) at the end of the text
|
||||
or after the last comma."""
|
||||
# Try after last comma first (most reliable position in OpenRent titles)
|
||||
parts = text.split(",")
|
||||
if len(parts) > 1:
|
||||
last_part = parts[-1].strip()
|
||||
match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last_part, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1).upper()
|
||||
|
||||
# Fall back to searching anywhere in text
|
||||
match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\b", text, re.IGNORECASE)
|
||||
if match:
|
||||
candidate = match.group(1).upper()
|
||||
# Avoid matching things like "1 Bed" → "1B"
|
||||
if len(candidate) >= 2 and not candidate[0].isdigit():
|
||||
return candidate
|
||||
return None
|
||||
|
||||
|
||||
def _infer_property_type(title: str) -> str:
|
||||
"""Infer property type from title text.
|
||||
|
||||
Order matters: "Room in a Shared Flat" should be "Room" not "Flat",
|
||||
so check "room" before "flat"."""
|
||||
lower = title.lower()
|
||||
if "room in" in lower or "room " in lower:
|
||||
return "Room"
|
||||
if "studio" in lower:
|
||||
return "Studio"
|
||||
if "flat" in lower or "apartment" in lower:
|
||||
return "Flat"
|
||||
if "maisonette" in lower:
|
||||
return "Maisonette"
|
||||
if "house" in lower:
|
||||
return "House"
|
||||
if "bungalow" in lower:
|
||||
return "Bungalow"
|
||||
return ""
|
||||
|
||||
|
||||
def parse_search_results(html: str) -> list[dict]:
    """Parse property data from OpenRent search results HTML.

    Current OpenRent card structure (2026-03):
      <a class="pli search-property-card" href="/property-to-rent/.../ID">
        <div class="or-swiper" data-listing-id="ID">
        <div class="pim"><span class="text-primary">£2,100</span> per month</div>
        <div class="piw"><span class="text-primary">£485</span> per week</div>
        <div class="fw-medium text-primary fs-3">1 Bed Flat, Location, SW1Y</div>
        <ul>...<li>1 Bed</li><li>1 Bath</li><li>Furnished</li>...</ul>

    Returns:
        One raw dict per usable card. ``url``, ``id``, ``title`` and
        ``property_type`` are always present. ``price``/``frequency``,
        ``bedrooms``, ``bathrooms``, ``postcode`` or ``outcode``,
        ``description`` and ``lat``/``lng`` are added when found. ``lat`` and
        ``lng`` are only ever set together. Cards without an href or an ID
        are skipped; an empty list is returned when no cards are found.
    """
    soup = BeautifulSoup(html, "lxml")
    properties = []

    # Property cards: <a class="pli search-property-card">; otherwise fall
    # back to any anchor pointing at a property page.
    cards = soup.select("a.pli")
    if not cards:
        cards = soup.find_all("a", href=re.compile(r"/property-to-rent/"))

    if not cards:
        log.warning(
            "No property cards found in search HTML (%d bytes). "
            "CSS selectors may need updating.",
            len(html),
        )
        return []

    for card in cards:
        prop: dict = {}

        # Extract property URL and ID from href
        href = card.get("href", "")
        if not href:
            continue

        prop["url"] = href if href.startswith("http") else OPENRENT_BASE + href
        id_match = re.search(r"/(\d+)(?:\?|$|#)", href)
        if id_match:
            prop["id"] = id_match.group(1)
        else:
            # Try data-listing-id on the swiper element
            swiper = card.select_one("[data-listing-id]")
            if not swiper:
                continue  # can't use a property without an ID
            prop["id"] = swiper["data-listing-id"]

        # --- Price ---
        # Prefer structured price elements over free-text parsing.
        # Monthly price is in <div class="pim"><span class="text-primary">£X</span>
        pim = card.select_one(".pim .text-primary, .pim span")
        piw = card.select_one(".piw .text-primary, .piw span")

        monthly_price = _extract_price_from_element(pim)
        weekly_price = _extract_price_from_element(piw)

        if monthly_price:
            prop["price"] = monthly_price
            prop["frequency"] = "monthly"
        elif weekly_price:
            prop["price"] = weekly_price
            prop["frequency"] = "weekly"
        else:
            # Fall back to parsing card text
            card_text = card.get_text(" ", strip=True)
            price_result = _extract_price(card_text)
            if price_result:
                prop["price"], prop["frequency"] = price_result

        # --- Title / Address ---
        # The property title is in a div with classes "fw-medium text-primary fs-3"
        # e.g., "1 Bed Flat, Bank Chambers, SW1Y"
        title_el = card.select_one("div.fw-medium.fs-3")
        if title_el:
            prop["title"] = title_el.get_text(strip=True)
        else:
            # Fallback: the image alt text also carries the title; otherwise
            # leave it empty rather than guess from noisy card text.
            img = card.select_one("img.propertyPic")
            prop["title"] = img["alt"] if img and img.get("alt") else ""

        # --- Bedrooms / Bathrooms from feature list ---
        feature_list = card.select("ul li")
        beds_from_features, baths_from_features = _extract_beds_baths_from_features(
            feature_list,
        )

        # Bedrooms: prefer feature list, fall back to title parsing
        if beds_from_features is not None:
            prop["bedrooms"] = beds_from_features
        else:
            beds = _extract_bedrooms_from_title(prop.get("title", ""))
            if beds is not None:
                prop["bedrooms"] = beds

        if baths_from_features is not None:
            prop["bathrooms"] = baths_from_features

        # --- Property type from title ---
        title = prop.get("title", "")
        prop["property_type"] = _infer_property_type(title)

        # --- Postcode / outcode from title ---
        postcode = _extract_postcode(title)
        if postcode:
            prop["postcode"] = postcode
        else:
            outcode = _extract_outcode(title)
            if outcode:
                prop["outcode"] = outcode

        # --- Description snippet ---
        desc_el = card.select_one(".line-clamp-2")
        if desc_el:
            prop["description"] = desc_el.get_text(strip=True)

        # --- Coordinates from data attributes (may not be present on cards) ---
        for el in [card] + card.select("[data-lat], [data-latitude]"):
            lat = el.get("data-lat") or el.get("data-latitude")
            lng = el.get("data-lng") or el.get("data-longitude") or el.get("data-lon")
            if not (lat and lng):
                continue
            try:
                # Convert both before storing so a bad longitude can never
                # leave a latitude behind on its own.
                coords = float(lat), float(lng)
            except ValueError:
                continue  # malformed here — another element may still be usable
            prop["lat"], prop["lng"] = coords
            break

        properties.append(prop)

    log.debug("Parsed %d property cards from search HTML", len(properties))
    return properties
|
||||
|
||||
|
||||
def parse_property_detail(html: str) -> dict:
    """Parse a single property detail page for additional data.

    Current detail page structure (2026-03):
      - <h1> has the full title (e.g., "Room in a Shared House, Lime Tree Court, AL2")
      - <div id="map" data-lat="..." data-lng="..."> has coordinates
      - Tables have "Rent PCM", "Deposit", "Bills Included", etc. (NOT bedrooms)
      - Description in elements with class containing "description"

    Returns:
        A dict holding whichever of these keys could be extracted: ``title``,
        ``postcode``, ``lat``/``lng`` (always set together), ``price``
        (the "Rent PCM" figure), ``bedrooms``, ``bathrooms``,
        ``property_type``, ``available_date``, ``furnished``, ``description``.
    """
    soup = BeautifulSoup(html, "lxml")
    details: dict = {}

    # --- Title from h1 ---
    h1 = soup.select_one("h1")
    if h1:
        title_text = h1.get_text(strip=True)
        # Validate it's not a nav/modal element (e.g. "Log in")
        if len(title_text) > 10 and "log in" not in title_text.lower():
            details["title"] = title_text
            postcode = _extract_postcode(title_text)
            if postcode:
                details["postcode"] = postcode

    # --- Coordinates from map element ---
    # The map div has id="map" with data-lat and data-lng; fall back to any
    # element carrying data-lat.
    map_el = soup.select_one("#map[data-lat]") or soup.select_one("[data-lat]")
    if map_el:
        lat = map_el.get("data-lat")
        lng = map_el.get("data-lng") or map_el.get("data-lon")
        if lat and lng:
            try:
                # Convert both first so lat is never stored without lng.
                coords = float(lat), float(lng)
            except ValueError:
                pass
            else:
                details["lat"], details["lng"] = coords

    # --- Parse tables for rent and property details ---
    for table in soup.select("table"):
        for row in table.select("tr"):
            cells = row.select("td")
            if len(cells) < 2:
                continue
            label = cells[0].get_text(strip=True).lower()
            value = cells[1].get_text(strip=True)

            if "rent" in label and "pcm" in label:
                match = re.search(r"£([\d,]+)", value)
                if match:
                    details["price"] = int(match.group(1).replace(",", ""))
            elif "bedroom" in label:
                match = re.search(r"(\d+)", value)
                if match:
                    details["bedrooms"] = int(match.group(1))
            elif "bathroom" in label:
                match = re.search(r"(\d+)", value)
                if match:
                    details["bathrooms"] = int(match.group(1))
            elif "type" in label and "property" in label:
                details["property_type"] = value
            elif "available" in label or "move" in label:
                details["available_date"] = value
            elif "furnish" in label:
                details["furnished"] = value

    # --- Coordinates from inline JavaScript (last resort) ---
    if "lat" not in details:
        for script in soup.select("script"):
            text = script.string or ""
            lat_match = re.search(r'"latitude"\s*:\s*([\d.-]+)', text)
            lng_match = re.search(r'"longitude"\s*:\s*([\d.-]+)', text)
            if not (lat_match and lng_match):
                continue
            try:
                coords = float(lat_match.group(1)), float(lng_match.group(1))
            except ValueError:
                # The loose pattern can capture junk like "-" or "1.2.3";
                # keep looking in the remaining scripts.
                continue
            details["lat"], details["lng"] = coords
            break

    # --- Description for floor area ---
    desc_el = soup.select_one(".description, [class*='description'], #description")
    if desc_el:
        details["description"] = desc_el.get_text(strip=True)

    return details
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property type mapping & floor area
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def map_property_type(raw_type: str | None) -> str:
|
||||
"""Map OpenRent property type to canonical type."""
|
||||
if not raw_type:
|
||||
return "Other"
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type)
|
||||
if canonical:
|
||||
return canonical
|
||||
lower = raw_type.lower()
|
||||
if "room" in lower or "shared" in lower:
|
||||
return "Other"
|
||||
if (
|
||||
"flat" in lower
|
||||
or "apartment" in lower
|
||||
or "maisonette" in lower
|
||||
or "studio" in lower
|
||||
):
|
||||
return "Flats/Maisonettes"
|
||||
if "detached" in lower and "semi" not in lower:
|
||||
return "Detached"
|
||||
if "semi" in lower:
|
||||
return "Semi-Detached"
|
||||
if "terrace" in lower or "mews" in lower:
|
||||
return "Terraced"
|
||||
if "house" in lower:
|
||||
return "Detached"
|
||||
log.debug("Unknown property type: %r — mapping to Other", raw_type)
|
||||
return "Other"
|
||||
|
||||
|
||||
def parse_floor_area(description: str | None) -> float | None:
|
||||
"""Try to extract floor area from description text."""
|
||||
if not description:
|
||||
return None
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
|
||||
if m:
|
||||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Transform & search
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _resolve_outcode_postcodes(
|
||||
outcode: str,
|
||||
pc_coords: dict[str, tuple[float, float]],
|
||||
) -> list[str]:
|
||||
"""Get all postcodes for an outcode from the postcode coordinates lookup."""
|
||||
# ONSPD 7-char format: 4-char outcodes have no space before incode
|
||||
# (e.g., "BH191AB"), while shorter outcodes do (e.g., "E14 5AB").
|
||||
prefix = outcode + " "
|
||||
results = [pcd for pcd in pc_coords if pcd.startswith(prefix)]
|
||||
if not results and len(outcode) >= 4:
|
||||
results = [pcd for pcd in pc_coords if pcd.startswith(outcode) and len(pcd) > len(outcode)]
|
||||
return results
|
||||
|
||||
|
||||
def _parse_or_date(date_str: str) -> str:
|
||||
"""Parse OpenRent date strings to ISO format (YYYY-MM-DD).
|
||||
Handles 'Today', 'Tomorrow', and 'DD Month, YYYY' formats."""
|
||||
if not date_str:
|
||||
return ""
|
||||
stripped = date_str.strip()
|
||||
lower = stripped.lower()
|
||||
if lower == "today":
|
||||
from datetime import datetime
|
||||
return datetime.now().strftime("%Y-%m-%d")
|
||||
if lower == "tomorrow":
|
||||
from datetime import datetime, timedelta
|
||||
return (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d")
|
||||
# Try "DD Month, YYYY" format (e.g., "01 April, 2026")
|
||||
from datetime import datetime
|
||||
for fmt in ("%d %B, %Y", "%d %B %Y"):
|
||||
try:
|
||||
return datetime.strptime(stripped, fmt).strftime("%Y-%m-%d")
|
||||
except ValueError:
|
||||
continue
|
||||
return date_str # Return as-is if unparseable
|
||||
|
||||
|
||||
def transform_property(
    search_data: dict,
    detail_data: dict | None,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
) -> dict | None:
    """Transform OpenRent property data into our output schema.

    Merges data from the search results page and (optionally) the detail
    page, with detail-page values taking precedence. Uses ``pc_coords``
    (postcode -> lat/lng) as a fallback when coordinates are missing but a
    postcode or outcode is available.

    Args:
        search_data: Raw dict produced from a search-result card.
        detail_data: Raw dict from the detail page, or ``None``.
        pc_index: Spatial index used to find the nearest postcode for known
            coordinates; may be falsy, in which case the outcode is used.
        pc_coords: Mapping of postcode to ``(lat, lng)``.

    Returns:
        The output record, or ``None`` when the listing has no positive
        price, lies outside the accepted coordinate box, or cannot be tied
        to both a postcode and coordinates.
    """
    detail = detail_data or {}

    # Merge: detail page data takes precedence. Compare against None rather
    # than relying on truthiness so a legitimate 0.0 (e.g. a longitude on the
    # Greenwich meridian) is not mistaken for "missing".
    lat = detail.get("lat")
    if lat is None:
        lat = search_data.get("lat")
    lng = detail.get("lng")
    if lng is None:
        lng = search_data.get("lng")

    detail_price = detail.get("price")
    price = detail_price or search_data.get("price")
    if not price or int(price) <= 0:
        return None

    # The detail-page price is read from the "Rent PCM" row, so it is always
    # monthly; the card's frequency only applies to the card's own price.
    frequency = "monthly" if detail_price else search_data.get("frequency", "monthly")

    # Get postcode: detail page > search card
    postcode = detail.get("postcode") or search_data.get("postcode")

    if lat is not None and lng is not None:
        # Validate coordinates are in England
        if not (49 <= lat <= 56 and -7 <= lng <= 2):
            log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
            return None
        if not postcode:
            if pc_index:
                postcode = pc_index.nearest(lat, lng)
            elif search_data.get("outcode"):
                # No spatial index — try outcode lookup as fallback
                outcode_pcs = _resolve_outcode_postcodes(
                    search_data["outcode"],
                    pc_coords,
                )
                if outcode_pcs:
                    postcode = outcode_pcs[0]
    elif postcode:
        # Have postcode but no coordinates — look up centroid from arcgis data
        coords = pc_coords.get(postcode)
        if coords:
            lat, lng = coords
        else:
            log.debug("Postcode %s not in arcgis data — skipping", postcode)
            return None
    elif search_data.get("outcode"):
        # Have only outcode — find postcodes in that outcode and use centroid
        outcode = search_data["outcode"]
        outcode_postcodes = _resolve_outcode_postcodes(outcode, pc_coords)
        if outcode_postcodes:
            # Use the first postcode as a rough approximation
            postcode = outcode_postcodes[0]
            lat, lng = pc_coords[postcode]
        else:
            log.debug("No postcodes found for outcode %s — skipping", outcode)
            return None
    else:
        return None

    if not postcode:
        log.debug("No postcode for property — skipping")
        return None

    raw_beds = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
    raw_baths = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
    # Implausibly large counts are treated as parse errors and zeroed.
    bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
    if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
        log.warning(
            "OpenRent %s: implausible beds=%d baths=%d (capped to 0)",
            search_data.get("id", "?"), raw_beds, raw_baths,
        )

    # Title: prefer detail page (has h1 with full title)
    title = detail.get("title") or search_data.get("title", "")

    # Address: take the middle part of the title (skip the "N Bed Type" prefix
    # and the outcode suffix). E.g., "1 Bed Flat, Bank Chambers, SW1Y" -> "Bank Chambers"
    address = ""
    if title:
        parts = [p.strip() for p in title.split(",")]
        if len(parts) >= 3:
            # Skip first (type) and last (outcode), join the middle
            address = ", ".join(parts[1:-1])
        elif len(parts) == 2:
            # Could be "Location, OUTCODE" or "Type, Location"
            # If last part looks like an outcode, use the first part
            if re.match(r"^[A-Z]{1,2}\d", parts[-1].strip()):
                address = parts[0]
            else:
                address = parts[1]
        else:
            address = title

    # Property type: prefer detail, then search card, then infer from title
    property_type = detail.get("property_type") or search_data.get("property_type", "")
    if not property_type and title:
        property_type = _infer_property_type(title)

    prop_id = search_data.get("id", "")
    listing_url = search_data.get(
        "url",
        f"{OPENRENT_BASE}/{prop_id}" if prop_id else "",
    )
    description = detail.get("description") or search_data.get("description", "")

    return {
        "id": f"or_{prop_id}",
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        "Number of bedrooms & living rooms": bedrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": normalize_postcode(postcode),
        "Address per Property Register": address,
        # OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a
        # property ownership concept that doesn't apply to rental listings. The
        # landlord's tenure is not shown on OpenRent listing pages.
        "Leasehold/Freehold": None,
        "Property type": map_property_type(property_type),
        "Property sub-type": normalize_sub_type(property_type),
        "price": int(price),
        "price_frequency": frequency,
        "Price qualifier": "",
        "Total floor area (sqm)": parse_floor_area(description),
        "Listing URL": listing_url,
        "Listing features": [],
        "first_visible_date": _parse_or_date(detail.get("available_date", "")),
    }
|
||||
|
||||
|
||||
def search_outcode(
    client: Session,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    fetch_details: bool = True,
) -> list[dict]:
    """Scrape OpenRent rental listings for a single outcode.

    Steps:
      1. Fetch the search results page for the outcode.
      2. Parse the property cards (title, price, beds, baths).
      3. Fetch a card's detail page only when ``fetch_details`` is set, the
         card has a URL, and the card carries neither coordinates nor a
         postcode present in ``pc_coords``.
      4. Transform each listing to the common output schema.

    Returns:
        The transformed records. Listings the transform rejects are dropped,
        and an empty list is returned when the search page cannot be fetched
        or holds no cards. Each kept record increments
        ``openrent_properties_scraped``.
    """
    listing_html = fetch_page(
        client,
        f"{OPENRENT_BASE}/properties-to-rent/?term={outcode}&isLive=true",
    )
    if not listing_html:
        return []

    cards = parse_search_results(listing_html)
    if not cards:
        return []

    results = []
    for card in cards:
        extra = None

        # Skip detail page if we already have coordinates or a resolvable postcode
        coords_known = card.get("lat") is not None and card.get("lng") is not None
        postcode_known = (
            card.get("postcode")
            and pc_coords
            and card["postcode"] in pc_coords
        )

        if fetch_details and card.get("url") and not coords_known and not postcode_known:
            page_html = fetch_page(client, card["url"])
            if page_html:
                extra = parse_property_detail(page_html)
            # Shorter delay for detail pages (within same outcode)
            time.sleep(0.15)

        row = transform_property(card, extra, pc_index, pc_coords)
        if row:
            results.append(row)
            openrent_properties_scraped.labels(channel="rent").inc()

    return results
|
||||
Loading…
Add table
Add a link
Reference in a new issue