This commit is contained in:
Andras Schmelczer 2026-03-15 21:22:28 +00:00
parent 479ef92236
commit c38d654ac7
44 changed files with 2526 additions and 701 deletions

View file

@@ -79,7 +79,8 @@ def solve_waf() -> tuple[dict[str, str], str] | None:
if "AwsWafIntegration" in content:
log.info("Got WAF challenge page, waiting for resolution...")
page.wait_for_selector(
"a.pli, .pli, .search-property-card", timeout=30000,
"a.pli, .pli, .search-property-card",
timeout=30000,
)
raw_cookies = context.cookies()
@@ -94,7 +95,8 @@ def solve_waf() -> tuple[dict[str, str], str] | None:
log.info(
"AWS WAF solved — got %d cookies, UA: %s",
len(cookies), user_agent[:60],
len(cookies),
user_agent[:60],
)
flaresolverr_attempts_total.labels(result="success").inc()
return cookies, user_agent
@@ -130,11 +132,13 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session:
"""Create a curl_cffi Session configured for OpenRent.
Uses Chrome TLS impersonation so AWS WAF cookies remain valid."""
session = Session(impersonate="chrome")
session.headers.update({
"User-Agent": user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-GB,en;q=0.9",
})
session.headers.update(
{
"User-Agent": user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-GB,en;q=0.9",
}
)
for name, value in cookies.items():
session.cookies.set(name, value, domain="openrent.co.uk")
return session
@@ -152,7 +156,9 @@ def _status_label(code: int) -> str:
def fetch_page(
client: Session, url: str, max_retries: int = 3,
client: Session,
url: str,
max_retries: int = 3,
) -> str | None:
"""GET HTML with retries on 429/5xx. Returns None on permanent failure.
WAF challenge (202 or 403 with challenge JS) raises WafChallengeError."""
@@ -165,17 +171,25 @@ def fetch_page(
html = resp.text
# Detect WAF challenge page masquerading as 200
if "AwsWafIntegration" in html and "challenge.js" in html:
raise WafChallengeError("Got AWS WAF challenge page — cookies expired")
raise WafChallengeError(
"Got AWS WAF challenge page — cookies expired"
)
return html
if resp.status_code in (202, 403):
raise WafChallengeError(f"HTTP {resp.status_code} — cookies likely expired")
raise WafChallengeError(
f"HTTP {resp.status_code} — cookies likely expired"
)
if resp.status_code in (429, 500, 502, 503, 504):
delay = RETRY_BASE_DELAY * (2 ** attempt)
delay = RETRY_BASE_DELAY * (2**attempt)
log.warning(
"HTTP %d from %s, retry %d/%d in %.1fs",
resp.status_code, url, attempt + 1, max_retries, delay,
resp.status_code,
url,
attempt + 1,
max_retries,
delay,
)
time.sleep(delay)
continue
@@ -187,10 +201,14 @@ def fetch_page(
raise
except RequestsError as e:
openrent_errors_total.labels(type=type(e).__name__).inc()
delay = RETRY_BASE_DELAY * (2 ** attempt)
delay = RETRY_BASE_DELAY * (2**attempt)
log.warning(
"%s from %s, retry %d/%d in %.1fs",
type(e).__name__, url, attempt + 1, max_retries, delay,
type(e).__name__,
url,
attempt + 1,
max_retries,
delay,
)
time.sleep(delay)
@@ -247,7 +265,9 @@ def _extract_bedrooms_from_title(title: str) -> int | None:
return None
def _extract_beds_baths_from_features(feature_items: list) -> tuple[int | None, int | None]:
def _extract_beds_baths_from_features(
feature_items: list,
) -> tuple[int | None, int | None]:
"""Extract bedrooms and bathrooms from feature list items.
OpenRent search cards have <ul> with items like:
@@ -442,11 +462,7 @@ def parse_search_results(html: str) -> list[dict]:
# --- Coordinates from data attributes (may not be present on cards) ---
for el in [card] + card.select("[data-lat], [data-latitude]"):
lat = el.get("data-lat") or el.get("data-latitude")
lng = (
el.get("data-lng")
or el.get("data-longitude")
or el.get("data-lon")
)
lng = el.get("data-lng") or el.get("data-longitude") or el.get("data-lon")
if lat and lng:
try:
prop["lat"] = float(lat)
@@ -543,9 +559,7 @@ def parse_property_detail(html: str) -> dict:
break
# --- Description for floor area ---
desc_el = soup.select_one(
".description, [class*='description'], #description"
)
desc_el = soup.select_one(".description, [class*='description'], #description")
if desc_el:
details["description"] = desc_el.get_text(strip=True)
@@ -567,7 +581,12 @@ def map_property_type(raw_type: str | None) -> str:
lower = raw_type.lower()
if "room" in lower or "shared" in lower:
return "Other"
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
if (
"flat" in lower
or "apartment" in lower
or "maisonette" in lower
or "studio" in lower
):
return "Flats/Maisonettes"
if "detached" in lower and "semi" not in lower:
return "Detached"
@@ -647,7 +666,8 @@ def transform_property(
elif search_data.get("outcode"):
# No spatial index — try outcode lookup as fallback
outcode_pcs = _resolve_outcode_postcodes(
search_data["outcode"], pc_coords,
search_data["outcode"],
pc_coords,
)
if outcode_pcs:
postcode = outcode_pcs[0]
@@ -708,7 +728,8 @@ def transform_property(
prop_id = search_data.get("id", "")
listing_url = search_data.get(
"url", f"{OPENRENT_BASE}/{prop_id}" if prop_id else "",
"url",
f"{OPENRENT_BASE}/{prop_id}" if prop_id else "",
)
description = detail.get("description") or search_data.get("description", "")
@@ -775,7 +796,10 @@ def search_outcode(
time.sleep(DELAY_BETWEEN_PAGES * 0.5)
transformed = transform_property(
search_data, detail_data, pc_index, pc_coords,
search_data,
detail_data,
pc_index,
pc_coords,
)
if transformed:
properties.append(transformed)