Fix openrent scraping
This commit is contained in:
parent
96a4934b0c
commit
ea8389ef40
1 changed files with 285 additions and 112 deletions
|
|
@ -6,11 +6,20 @@ impersonation to make requests with those cookies.
|
||||||
|
|
||||||
OpenRent is a rental-only platform, so this scraper only handles RENT channel.
|
OpenRent is a rental-only platform, so this scraper only handles RENT channel.
|
||||||
|
|
||||||
HTML parsing notes:
|
HTML structure (as of 2026-03):
|
||||||
OpenRent server-renders property cards in the search results page.
|
Search results page renders property cards as <a class="pli search-property-card">.
|
||||||
Property cards use class "pli" (property list item). Each card contains
|
Each card contains:
|
||||||
a link to the property detail page, price, bedrooms, and address info.
|
- Monthly price in <div class="pim"> with <span class="text-primary">£X,XXX</span>
|
||||||
The CSS selectors below may need updating if OpenRent changes their markup.
|
- Weekly price in <div class="piw"> (hidden by Alpine.js)
|
||||||
|
- Title in <div class="fw-medium text-primary fs-3">N Bed Type, Location, OUTCODE</div>
|
||||||
|
- Features in <ul> with <li> items like "1 Bed", "1 Bath", "Furnished"
|
||||||
|
- Listing ID in data-listing-id on the .or-swiper div
|
||||||
|
- Description snippet in <div class="line-clamp-2">
|
||||||
|
|
||||||
|
Detail page has:
|
||||||
|
- <h1> with property title including outcode
|
||||||
|
- <div id="map" data-lat="..." data-lng="..."> for coordinates
|
||||||
|
- Tables with deposit, rent, furnishing, tenant preferences
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -70,7 +79,7 @@ def solve_waf() -> tuple[dict[str, str], str] | None:
|
||||||
if "AwsWafIntegration" in content:
|
if "AwsWafIntegration" in content:
|
||||||
log.info("Got WAF challenge page, waiting for resolution...")
|
log.info("Got WAF challenge page, waiting for resolution...")
|
||||||
page.wait_for_selector(
|
page.wait_for_selector(
|
||||||
"a.pli, .pli, [class*=propertyListing]", timeout=30000,
|
"a.pli, .pli, .search-property-card", timeout=30000,
|
||||||
)
|
)
|
||||||
|
|
||||||
raw_cookies = context.cookies()
|
raw_cookies = context.cookies()
|
||||||
|
|
@ -195,6 +204,17 @@ def fetch_page(
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_price_from_element(el) -> tuple[int, str] | None:
|
||||||
|
"""Extract price integer from a price element's text like '£2,100'."""
|
||||||
|
if not el:
|
||||||
|
return None
|
||||||
|
text = el.get_text(strip=True)
|
||||||
|
match = re.search(r"£([\d,]+)", text)
|
||||||
|
if not match:
|
||||||
|
return None
|
||||||
|
return int(match.group(1).replace(",", ""))
|
||||||
|
|
||||||
|
|
||||||
def _extract_price(text: str) -> tuple[int, str] | None:
|
def _extract_price(text: str) -> tuple[int, str] | None:
|
||||||
"""Extract price and frequency from text like '£1,500 pcm' or '£350 pw'.
|
"""Extract price and frequency from text like '£1,500 pcm' or '£350 pw'.
|
||||||
Returns (price_int, frequency) or None.
|
Returns (price_int, frequency) or None.
|
||||||
|
|
@ -227,6 +247,25 @@ def _extract_bedrooms_from_title(title: str) -> int | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_beds_baths_from_features(feature_items: list) -> tuple[int | None, int | None]:
|
||||||
|
"""Extract bedrooms and bathrooms from feature list items.
|
||||||
|
|
||||||
|
OpenRent search cards have <ul> with items like:
|
||||||
|
<li>1 Bed</li> <li>1 Bath</li> <li>Furnished</li>
|
||||||
|
"""
|
||||||
|
bedrooms = None
|
||||||
|
bathrooms = None
|
||||||
|
for li in feature_items:
|
||||||
|
text = li.get_text(strip=True).lower()
|
||||||
|
bed_match = re.search(r"(\d+)\s*bed", text)
|
||||||
|
if bed_match:
|
||||||
|
bedrooms = int(bed_match.group(1))
|
||||||
|
bath_match = re.search(r"(\d+)\s*bath", text)
|
||||||
|
if bath_match:
|
||||||
|
bathrooms = int(bath_match.group(1))
|
||||||
|
return bedrooms, bathrooms
|
||||||
|
|
||||||
|
|
||||||
def _extract_postcode(text: str) -> str | None:
|
def _extract_postcode(text: str) -> str | None:
|
||||||
"""Extract full UK postcode from text like '2 Bed Flat, Pimlico, SW1V 2AA'."""
|
"""Extract full UK postcode from text like '2 Bed Flat, Pimlico, SW1V 2AA'."""
|
||||||
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
|
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
|
||||||
|
|
@ -235,23 +274,69 @@ def _extract_postcode(text: str) -> str | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_outcode(text: str) -> str | None:
|
||||||
|
"""Extract UK outcode from text like '1 Bed Flat, Bank Chambers, SW1Y'.
|
||||||
|
|
||||||
|
Looks for an outcode pattern (e.g., SW1Y, E1, EC2A) at the end of the text
|
||||||
|
or after the last comma."""
|
||||||
|
# Try after last comma first (most reliable position in OpenRent titles)
|
||||||
|
parts = text.split(",")
|
||||||
|
if len(parts) > 1:
|
||||||
|
last_part = parts[-1].strip()
|
||||||
|
match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last_part, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
return match.group(1).upper()
|
||||||
|
|
||||||
|
# Fall back to searching anywhere in text
|
||||||
|
match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\b", text, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
candidate = match.group(1).upper()
|
||||||
|
# Avoid matching things like "1 Bed" → "1B"
|
||||||
|
if len(candidate) >= 2 and not candidate[0].isdigit():
|
||||||
|
return candidate
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_property_type(title: str) -> str:
    """Infer the property type from title text.

    Order matters: "Room in a Shared Flat" must be "Room", not "Flat", so the
    room check runs before the others.

    Args:
        title: Listing title, e.g. "1 Bed Flat, Bank Chambers, SW1Y".

    Returns:
        One of "Room", "Studio", "Flat", "Maisonette", "House", "Bungalow",
        or "" when nothing in the title matches.
    """
    lower = (title or "").lower()

    # Match "room" as a whole word. A plain substring test on "room " also
    # hits "bedroom ", which misclassified "2 Bedroom Flat" as a Room.
    if re.search(r"\broom\b", lower):
        return "Room"

    # (keywords, label) pairs, checked in priority order.
    keyword_table = (
        (("studio",), "Studio"),
        (("flat", "apartment"), "Flat"),
        (("maisonette",), "Maisonette"),
        (("house",), "House"),
        (("bungalow",), "Bungalow"),
    )
    for keywords, label in keyword_table:
        if any(word in lower for word in keywords):
            return label
    return ""
|
||||||
|
|
||||||
|
|
||||||
def parse_search_results(html: str) -> list[dict]:
|
def parse_search_results(html: str) -> list[dict]:
|
||||||
"""Parse property data from OpenRent search results HTML.
|
"""Parse property data from OpenRent search results HTML.
|
||||||
|
|
||||||
Returns list of raw property dicts extracted from property cards.
|
Returns list of raw property dicts extracted from property cards.
|
||||||
Uses multiple fallback selectors for resilience against markup changes.
|
|
||||||
|
Current OpenRent card structure (2026-03):
|
||||||
|
<a class="pli search-property-card" href="/property-to-rent/.../ID">
|
||||||
|
<div class="or-swiper" data-listing-id="ID">
|
||||||
|
<div class="pim"><span class="text-primary">£2,100</span> per month</div>
|
||||||
|
<div class="piw"><span class="text-primary">£485</span> per week</div>
|
||||||
|
<div class="fw-medium text-primary fs-3">1 Bed Flat, Location, SW1Y</div>
|
||||||
|
<ul>...<li>1 Bed</li><li>1 Bath</li><li>Furnished</li>...</ul>
|
||||||
"""
|
"""
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
properties = []
|
properties = []
|
||||||
|
|
||||||
# Try known selectors for property cards (most specific first)
|
# Property cards: <a class="pli search-property-card">
|
||||||
cards = soup.select("a.pli")
|
cards = soup.select("a.pli")
|
||||||
if not cards:
|
if not cards:
|
||||||
cards = soup.select(".pli.clearfix")
|
|
||||||
if not cards:
|
|
||||||
cards = soup.select("[class*='propertyListing']")
|
|
||||||
if not cards:
|
|
||||||
# Last resort: look for links that match property URL pattern
|
|
||||||
cards = soup.find_all("a", href=re.compile(r"/property-to-rent/"))
|
cards = soup.find_all("a", href=re.compile(r"/property-to-rent/"))
|
||||||
|
|
||||||
if not cards:
|
if not cards:
|
||||||
|
|
@ -275,36 +360,86 @@ def parse_search_results(html: str) -> list[dict]:
|
||||||
if id_match:
|
if id_match:
|
||||||
prop["id"] = id_match.group(1)
|
prop["id"] = id_match.group(1)
|
||||||
else:
|
else:
|
||||||
continue # can't use a property without an ID
|
# Try data-listing-id on the swiper element
|
||||||
|
swiper = card.select_one("[data-listing-id]")
|
||||||
|
if swiper:
|
||||||
|
prop["id"] = swiper["data-listing-id"]
|
||||||
|
else:
|
||||||
|
continue # can't use a property without an ID
|
||||||
|
|
||||||
# Extract card text for parsing
|
# --- Price ---
|
||||||
card_text = card.get_text(" ", strip=True)
|
# Prefer structured price elements over free-text parsing.
|
||||||
|
# Monthly price is in <div class="pim"><span class="text-primary">£X</span>
|
||||||
|
pim = card.select_one(".pim .text-primary, .pim span")
|
||||||
|
piw = card.select_one(".piw .text-primary, .piw span")
|
||||||
|
|
||||||
# Price
|
monthly_price = _extract_price_from_element(pim)
|
||||||
price_result = _extract_price(card_text)
|
weekly_price = _extract_price_from_element(piw)
|
||||||
if price_result:
|
|
||||||
prop["price"], prop["frequency"] = price_result
|
|
||||||
|
|
||||||
# Title / address — try specific elements first, fall back to card text
|
if monthly_price:
|
||||||
title_el = card.select_one(
|
prop["price"] = monthly_price
|
||||||
".listing-title, .banda, h2, h3, [class*='title']"
|
prop["frequency"] = "monthly"
|
||||||
)
|
elif weekly_price:
|
||||||
prop["title"] = (
|
prop["price"] = weekly_price
|
||||||
title_el.get_text(strip=True) if title_el
|
prop["frequency"] = "weekly"
|
||||||
else card_text[:200]
|
else:
|
||||||
|
# Fall back to parsing card text
|
||||||
|
card_text = card.get_text(" ", strip=True)
|
||||||
|
price_result = _extract_price(card_text)
|
||||||
|
if price_result:
|
||||||
|
prop["price"], prop["frequency"] = price_result
|
||||||
|
|
||||||
|
# --- Title / Address ---
|
||||||
|
# The property title is in a div with classes "fw-medium text-primary fs-3"
|
||||||
|
# e.g., "1 Bed Flat, Bank Chambers, SW1Y"
|
||||||
|
title_el = card.select_one("div.fw-medium.fs-3")
|
||||||
|
if not title_el:
|
||||||
|
# Fallback: try image alt text which also has the title
|
||||||
|
img = card.select_one("img.propertyPic")
|
||||||
|
if img and img.get("alt"):
|
||||||
|
prop["title"] = img["alt"]
|
||||||
|
else:
|
||||||
|
# Last resort: extract from card text, excluding price/nav noise
|
||||||
|
prop["title"] = ""
|
||||||
|
else:
|
||||||
|
prop["title"] = title_el.get_text(strip=True)
|
||||||
|
|
||||||
|
# --- Bedrooms / Bathrooms from feature list ---
|
||||||
|
feature_list = card.select("ul li")
|
||||||
|
beds_from_features, baths_from_features = _extract_beds_baths_from_features(
|
||||||
|
feature_list,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Bedrooms from title text
|
# Bedrooms: prefer feature list, fall back to title parsing
|
||||||
beds = _extract_bedrooms_from_title(prop["title"])
|
if beds_from_features is not None:
|
||||||
if beds is not None:
|
prop["bedrooms"] = beds_from_features
|
||||||
prop["bedrooms"] = beds
|
else:
|
||||||
|
beds = _extract_bedrooms_from_title(prop.get("title", ""))
|
||||||
|
if beds is not None:
|
||||||
|
prop["bedrooms"] = beds
|
||||||
|
|
||||||
# Postcode from title
|
if baths_from_features is not None:
|
||||||
postcode = _extract_postcode(prop["title"])
|
prop["bathrooms"] = baths_from_features
|
||||||
|
|
||||||
|
# --- Property type from title ---
|
||||||
|
title = prop.get("title", "")
|
||||||
|
prop["property_type"] = _infer_property_type(title)
|
||||||
|
|
||||||
|
# --- Postcode / outcode from title ---
|
||||||
|
postcode = _extract_postcode(title)
|
||||||
if postcode:
|
if postcode:
|
||||||
prop["postcode"] = postcode
|
prop["postcode"] = postcode
|
||||||
|
else:
|
||||||
|
outcode = _extract_outcode(title)
|
||||||
|
if outcode:
|
||||||
|
prop["outcode"] = outcode
|
||||||
|
|
||||||
# Coordinates from data attributes (if present on card or child elements)
|
# --- Description snippet ---
|
||||||
|
desc_el = card.select_one(".line-clamp-2")
|
||||||
|
if desc_el:
|
||||||
|
prop["description"] = desc_el.get_text(strip=True)
|
||||||
|
|
||||||
|
# --- Coordinates from data attributes (may not be present on cards) ---
|
||||||
for el in [card] + card.select("[data-lat], [data-latitude]"):
|
for el in [card] + card.select("[data-lat], [data-latitude]"):
|
||||||
lat = el.get("data-lat") or el.get("data-latitude")
|
lat = el.get("data-lat") or el.get("data-latitude")
|
||||||
lng = (
|
lng = (
|
||||||
|
|
@ -329,62 +464,35 @@ def parse_search_results(html: str) -> list[dict]:
|
||||||
def parse_property_detail(html: str) -> dict:
|
def parse_property_detail(html: str) -> dict:
|
||||||
"""Parse a single property detail page for additional data.
|
"""Parse a single property detail page for additional data.
|
||||||
|
|
||||||
Extracts: bedrooms, bathrooms, price, property_type, postcode,
|
Current detail page structure (2026-03):
|
||||||
lat/lng (from map data), description (for floor area).
|
- <h1> has the full title (e.g., "Room in a Shared House, Lime Tree Court, AL2")
|
||||||
|
- <div id="map" data-lat="..." data-lng="..."> has coordinates
|
||||||
|
- Tables have "Rent PCM", "Deposit", "Bills Included", etc. (NOT bedrooms)
|
||||||
|
- Description in elements with class containing "description"
|
||||||
"""
|
"""
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
details: dict = {}
|
details: dict = {}
|
||||||
|
|
||||||
# Parse structured data tables (class "table table-striped")
|
# --- Title from h1 ---
|
||||||
for table in soup.select("table.table-striped, table.table"):
|
h1 = soup.select_one("h1")
|
||||||
for row in table.select("tr"):
|
if h1:
|
||||||
cells = row.select("td, th")
|
title_text = h1.get_text(strip=True)
|
||||||
if len(cells) < 2:
|
# Validate it's not a nav/modal element (e.g. "Log in")
|
||||||
continue
|
if len(title_text) > 10 and "log in" not in title_text.lower():
|
||||||
label = cells[0].get_text(strip=True).lower()
|
details["title"] = title_text
|
||||||
value = cells[1].get_text(strip=True)
|
postcode = _extract_postcode(title_text)
|
||||||
|
if postcode:
|
||||||
|
details["postcode"] = postcode
|
||||||
|
|
||||||
if "bedroom" in label:
|
# --- Coordinates from map element ---
|
||||||
match = re.search(r"(\d+)", value)
|
# The map div has id="map" with data-lat and data-lng
|
||||||
if match:
|
map_el = soup.select_one("#map[data-lat]")
|
||||||
details["bedrooms"] = int(match.group(1))
|
if not map_el:
|
||||||
elif "bathroom" in label:
|
# Fallback: any element with data-lat (but prefer #map)
|
||||||
match = re.search(r"(\d+)", value)
|
map_el = soup.select_one("[data-lat]")
|
||||||
if match:
|
|
||||||
details["bathrooms"] = int(match.group(1))
|
|
||||||
elif "rent" in label or "price" in label:
|
|
||||||
match = re.search(r"£([\d,]+)", value)
|
|
||||||
if match:
|
|
||||||
details["price"] = int(match.group(1).replace(",", ""))
|
|
||||||
elif "type" in label:
|
|
||||||
details["property_type"] = value
|
|
||||||
elif "available" in label or "move" in label:
|
|
||||||
details["available_date"] = value
|
|
||||||
elif "furnish" in label:
|
|
||||||
details["furnished"] = value
|
|
||||||
|
|
||||||
# Extract postcode from page title / address heading.
|
|
||||||
# Prefer h1 (the actual property title) over generic [class*='title']
|
|
||||||
# which may match nav elements like "Log in".
|
|
||||||
title_tag = soup.select_one("h1")
|
|
||||||
if not title_tag:
|
|
||||||
title_tag = soup.select_one(".property-title, [class*='title']")
|
|
||||||
if title_tag:
|
|
||||||
title_text = title_tag.get_text(strip=True)
|
|
||||||
details["title"] = title_text
|
|
||||||
postcode = _extract_postcode(title_text)
|
|
||||||
if postcode:
|
|
||||||
details["postcode"] = postcode
|
|
||||||
|
|
||||||
# Extract coordinates from map element data attributes
|
|
||||||
map_el = soup.select_one("[data-lat], [data-latitude]")
|
|
||||||
if map_el:
|
if map_el:
|
||||||
lat = map_el.get("data-lat") or map_el.get("data-latitude")
|
lat = map_el.get("data-lat")
|
||||||
lng = (
|
lng = map_el.get("data-lng") or map_el.get("data-lon")
|
||||||
map_el.get("data-lng")
|
|
||||||
or map_el.get("data-longitude")
|
|
||||||
or map_el.get("data-lon")
|
|
||||||
)
|
|
||||||
if lat and lng:
|
if lat and lng:
|
||||||
try:
|
try:
|
||||||
details["lat"] = float(lat)
|
details["lat"] = float(lat)
|
||||||
|
|
@ -392,7 +500,35 @@ def parse_property_detail(html: str) -> dict:
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Also check for coordinates in JSON-LD or inline JavaScript
|
# --- Parse tables for rent and property details ---
|
||||||
|
for table in soup.select("table"):
|
||||||
|
for row in table.select("tr"):
|
||||||
|
cells = row.select("td")
|
||||||
|
if len(cells) < 2:
|
||||||
|
continue
|
||||||
|
label = cells[0].get_text(strip=True).lower()
|
||||||
|
value = cells[1].get_text(strip=True)
|
||||||
|
|
||||||
|
if "rent" in label and "pcm" in label:
|
||||||
|
match = re.search(r"£([\d,]+)", value)
|
||||||
|
if match:
|
||||||
|
details["price"] = int(match.group(1).replace(",", ""))
|
||||||
|
elif "bedroom" in label:
|
||||||
|
match = re.search(r"(\d+)", value)
|
||||||
|
if match:
|
||||||
|
details["bedrooms"] = int(match.group(1))
|
||||||
|
elif "bathroom" in label:
|
||||||
|
match = re.search(r"(\d+)", value)
|
||||||
|
if match:
|
||||||
|
details["bathrooms"] = int(match.group(1))
|
||||||
|
elif "type" in label and "property" in label:
|
||||||
|
details["property_type"] = value
|
||||||
|
elif "available" in label or "move" in label:
|
||||||
|
details["available_date"] = value
|
||||||
|
elif "furnish" in label:
|
||||||
|
details["furnished"] = value
|
||||||
|
|
||||||
|
# --- Coordinates from inline JavaScript (last resort) ---
|
||||||
if "lat" not in details:
|
if "lat" not in details:
|
||||||
for script in soup.select("script"):
|
for script in soup.select("script"):
|
||||||
text = script.string or ""
|
text = script.string or ""
|
||||||
|
|
@ -406,7 +542,7 @@ def parse_property_detail(html: str) -> dict:
|
||||||
pass
|
pass
|
||||||
break
|
break
|
||||||
|
|
||||||
# Extract description for floor area parsing
|
# --- Description for floor area ---
|
||||||
desc_el = soup.select_one(
|
desc_el = soup.select_one(
|
||||||
".description, [class*='description'], #description"
|
".description, [class*='description'], #description"
|
||||||
)
|
)
|
||||||
|
|
@ -464,6 +600,16 @@ def parse_floor_area(description: str | None) -> float | None:
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_outcode_postcodes(
    outcode: str,
    pc_coords: dict[str, tuple[float, float]],
) -> list[str]:
    """Get all postcodes for an outcode from the postcode coordinates lookup.

    A key belongs to the outcode when it is either in the spaced form
    ("SW1Y 4AA") or in the compact form ("SW1Y4AA"). The incode is always the
    final three characters, so a compact key matches when everything before
    those three characters equals the outcode.

    Args:
        outcode: Outward code such as "SW1Y" or "E1".
        pc_coords: Mapping of postcode to (lat, lng); only keys are inspected.

    Returns:
        Matching postcode keys in the mapping's iteration order, or an empty
        list when ``outcode`` is empty or nothing matches.
    """
    if not outcode:
        return []

    # The trailing space stops "E1" from also matching "E16 1AA".
    prefix = outcode + " "
    return [
        pcd
        for pcd in pc_coords
        if pcd.startswith(prefix)
        or (" " not in pcd and len(pcd) > 3 and pcd[:-3] == outcode)
    ]
|
||||||
|
|
||||||
|
|
||||||
def transform_property(
|
def transform_property(
|
||||||
search_data: dict,
|
search_data: dict,
|
||||||
detail_data: dict | None,
|
detail_data: dict | None,
|
||||||
|
|
@ -473,7 +619,7 @@ def transform_property(
|
||||||
"""Transform OpenRent property data into our output schema.
|
"""Transform OpenRent property data into our output schema.
|
||||||
|
|
||||||
Merges data from the search results page and (optionally) the detail page.
|
Merges data from the search results page and (optionally) the detail page.
|
||||||
Uses pc_coords (postcode → lat/lng) as a fallback when coordinates are
|
Uses pc_coords (postcode -> lat/lng) as a fallback when coordinates are
|
||||||
missing but a postcode is available.
|
missing but a postcode is available.
|
||||||
"""
|
"""
|
||||||
detail = detail_data or {}
|
detail = detail_data or {}
|
||||||
|
|
@ -487,7 +633,7 @@ def transform_property(
|
||||||
|
|
||||||
frequency = search_data.get("frequency", "monthly")
|
frequency = search_data.get("frequency", "monthly")
|
||||||
|
|
||||||
# Get postcode: detail page > search card > spatial index
|
# Get postcode: detail page > search card
|
||||||
postcode = detail.get("postcode") or search_data.get("postcode")
|
postcode = detail.get("postcode") or search_data.get("postcode")
|
||||||
|
|
||||||
if lat is not None and lng is not None:
|
if lat is not None and lng is not None:
|
||||||
|
|
@ -496,7 +642,15 @@ def transform_property(
|
||||||
log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
|
log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
|
||||||
return None
|
return None
|
||||||
if not postcode:
|
if not postcode:
|
||||||
postcode = pc_index.nearest(lat, lng)
|
if pc_index:
|
||||||
|
postcode = pc_index.nearest(lat, lng)
|
||||||
|
elif search_data.get("outcode"):
|
||||||
|
# No spatial index — try outcode lookup as fallback
|
||||||
|
outcode_pcs = _resolve_outcode_postcodes(
|
||||||
|
search_data["outcode"], pc_coords,
|
||||||
|
)
|
||||||
|
if outcode_pcs:
|
||||||
|
postcode = outcode_pcs[0]
|
||||||
elif postcode:
|
elif postcode:
|
||||||
# Have postcode but no coordinates — look up centroid from arcgis data
|
# Have postcode but no coordinates — look up centroid from arcgis data
|
||||||
coords = pc_coords.get(postcode)
|
coords = pc_coords.get(postcode)
|
||||||
|
|
@ -505,6 +659,17 @@ def transform_property(
|
||||||
else:
|
else:
|
||||||
log.debug("Postcode %s not in arcgis data — skipping", postcode)
|
log.debug("Postcode %s not in arcgis data — skipping", postcode)
|
||||||
return None
|
return None
|
||||||
|
elif search_data.get("outcode"):
|
||||||
|
# Have only outcode — find postcodes in that outcode and use centroid
|
||||||
|
outcode = search_data["outcode"]
|
||||||
|
outcode_postcodes = _resolve_outcode_postcodes(outcode, pc_coords)
|
||||||
|
if outcode_postcodes:
|
||||||
|
# Use the first postcode as a rough approximation
|
||||||
|
postcode = outcode_postcodes[0]
|
||||||
|
lat, lng = pc_coords[postcode]
|
||||||
|
else:
|
||||||
|
log.debug("No postcodes found for outcode %s — skipping", outcode)
|
||||||
|
return None
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -513,37 +678,45 @@ def transform_property(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
bedrooms = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
|
bedrooms = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
|
||||||
bathrooms = detail.get("bathrooms", 0) or 0
|
bathrooms = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
|
||||||
|
|
||||||
|
# Title: prefer detail page (has h1 with full title)
|
||||||
title = detail.get("title") or search_data.get("title", "")
|
title = detail.get("title") or search_data.get("title", "")
|
||||||
address = title.split(",")[0].strip() if title else ""
|
|
||||||
|
|
||||||
property_type = detail.get("property_type", "")
|
# Address: take the middle part of the title (skip the "N Bed Type" prefix
|
||||||
# Infer from title if not found in detail page
|
# and the outcode suffix). E.g., "1 Bed Flat, Bank Chambers, SW1Y" -> "Bank Chambers"
|
||||||
|
address = ""
|
||||||
|
if title:
|
||||||
|
parts = [p.strip() for p in title.split(",")]
|
||||||
|
if len(parts) >= 3:
|
||||||
|
# Skip first (type) and last (outcode), join the middle
|
||||||
|
address = ", ".join(parts[1:-1])
|
||||||
|
elif len(parts) == 2:
|
||||||
|
# Could be "Location, OUTCODE" or "Type, Location"
|
||||||
|
# If last part looks like an outcode, use the first part
|
||||||
|
if re.match(r"^[A-Z]{1,2}\d", parts[-1].strip()):
|
||||||
|
address = parts[0]
|
||||||
|
else:
|
||||||
|
address = parts[1]
|
||||||
|
else:
|
||||||
|
address = title
|
||||||
|
|
||||||
|
# Property type: prefer detail, then search card, then infer from title
|
||||||
|
property_type = detail.get("property_type") or search_data.get("property_type", "")
|
||||||
if not property_type and title:
|
if not property_type and title:
|
||||||
lower = title.lower()
|
property_type = _infer_property_type(title)
|
||||||
if "flat" in lower or "apartment" in lower:
|
|
||||||
property_type = "Flat"
|
|
||||||
elif "studio" in lower:
|
|
||||||
property_type = "Studio"
|
|
||||||
elif "maisonette" in lower:
|
|
||||||
property_type = "Maisonette"
|
|
||||||
elif "house" in lower:
|
|
||||||
property_type = "House"
|
|
||||||
elif "room" in lower:
|
|
||||||
property_type = "Room"
|
|
||||||
|
|
||||||
prop_id = search_data.get("id", "")
|
prop_id = search_data.get("id", "")
|
||||||
listing_url = search_data.get(
|
listing_url = search_data.get(
|
||||||
"url", f"{OPENRENT_BASE}/{prop_id}" if prop_id else "",
|
"url", f"{OPENRENT_BASE}/{prop_id}" if prop_id else "",
|
||||||
)
|
)
|
||||||
description = detail.get("description", "")
|
description = detail.get("description") or search_data.get("description", "")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"id": f"or_{prop_id}",
|
"id": f"or_{prop_id}",
|
||||||
"Bedrooms": bedrooms,
|
"Bedrooms": bedrooms,
|
||||||
"Bathrooms": bathrooms,
|
"Bathrooms": bathrooms,
|
||||||
"Number of bedrooms & living rooms": bedrooms + bathrooms,
|
"Number of bedrooms & living rooms": bedrooms,
|
||||||
"lon": lng,
|
"lon": lng,
|
||||||
"lat": lat,
|
"lat": lat,
|
||||||
"Postcode": postcode,
|
"Postcode": postcode,
|
||||||
|
|
@ -571,14 +744,14 @@ def search_outcode(
|
||||||
"""Search OpenRent for rental properties in one outcode.
|
"""Search OpenRent for rental properties in one outcode.
|
||||||
|
|
||||||
1. Fetches the search results page for the outcode
|
1. Fetches the search results page for the outcode
|
||||||
2. Parses property cards from the HTML
|
2. Parses property cards from the HTML (title, price, beds, baths)
|
||||||
3. Optionally fetches each property's detail page for full data
|
3. Fetches each property's detail page for coordinates
|
||||||
4. Transforms to common output schema
|
4. Transforms to common output schema
|
||||||
|
|
||||||
Args:
|
The search card provides most data (price, bedrooms, bathrooms, title,
|
||||||
fetch_details: If True, visits each property's detail page for
|
property type). Detail pages are needed primarily for precise coordinates
|
||||||
coordinates and extra data. Slower but more complete.
|
and full postcodes. When detail pages fail, we fall back to outcode-level
|
||||||
If False, relies only on search card data + postcode lookup.
|
coordinates from the postcode lookup.
|
||||||
"""
|
"""
|
||||||
search_url = f"{OPENRENT_BASE}/properties-to-rent/?term={outcode}&isLive=true"
|
search_url = f"{OPENRENT_BASE}/properties-to-rent/?term={outcode}&isLive=true"
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue