Fix openrent
This commit is contained in:
parent
b3a7ab40c8
commit
4d08f5d08d
5 changed files with 135 additions and 53 deletions
|
|
@ -1,8 +1,8 @@
|
|||
"""OpenRent (openrent.co.uk) scraper — rental properties only.
|
||||
|
||||
OpenRent is behind AWS WAF, so we use FlareSolverr to solve the challenge
|
||||
and get valid cookies. Then we use curl_cffi with Chrome TLS impersonation
|
||||
to make requests with those cookies (same pattern as homecouk.py).
|
||||
OpenRent is behind AWS WAF, so we use Playwright (headless Chromium) to solve
|
||||
the challenge and get valid cookies. Then we use curl_cffi with Chrome TLS
|
||||
impersonation to make requests with those cookies.
|
||||
|
||||
OpenRent is a rental-only platform, so this scraper only handles RENT channel.
|
||||
|
||||
|
|
@ -18,10 +18,10 @@ import os
|
|||
import re
|
||||
import time
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from curl_cffi.requests import Session
|
||||
from curl_cffi.requests.errors import RequestsError
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
from constants import (
|
||||
DELAY_BETWEEN_PAGES,
|
||||
|
|
@ -44,72 +44,61 @@ class WafChallengeError(Exception):
|
|||
"""Raised when OpenRent returns a WAF challenge, indicating cookies need refresh."""
|
||||
|
||||
|
||||
FLARESOLVERR_URL = os.environ.get("FLARESOLVERR_URL", "http://flaresolverr:8191")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cookie / session management (mirrors homecouk.py pattern)
|
||||
# Cookie / session management via Playwright
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def solve_waf() -> tuple[dict[str, str], str] | None:
|
||||
"""Use FlareSolverr to solve the AWS WAF challenge.
|
||||
"""Use Playwright (headless Chromium) to solve the AWS WAF challenge.
|
||||
Returns (cookies_dict, user_agent) or None on failure."""
|
||||
log.info("Solving AWS WAF challenge via FlareSolverr at %s", FLARESOLVERR_URL)
|
||||
log.info("Solving AWS WAF challenge via Playwright")
|
||||
try:
|
||||
with httpx.Client(timeout=120) as client:
|
||||
resp = client.post(
|
||||
f"{FLARESOLVERR_URL}/v1",
|
||||
json={
|
||||
"cmd": "request.get",
|
||||
"url": f"{OPENRENT_BASE}/properties-to-rent/?term=london&isLive=true",
|
||||
"maxTimeout": 60000,
|
||||
},
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch(
|
||||
headless=True,
|
||||
args=["--no-sandbox", "--disable-blink-features=AutomationControlled"],
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
log.error("FlareSolverr returned HTTP %d", resp.status_code)
|
||||
return None
|
||||
context = browser.new_context()
|
||||
page = context.new_page()
|
||||
|
||||
data = resp.json()
|
||||
if data.get("status") != "ok":
|
||||
log.error("FlareSolverr error: %s", data.get("message", "unknown"))
|
||||
return None
|
||||
url = f"{OPENRENT_BASE}/properties-to-rent/?term=london&isLive=true"
|
||||
log.info("Navigating to %s", url)
|
||||
page.goto(url, wait_until="domcontentloaded", timeout=60000)
|
||||
|
||||
solution = data["solution"]
|
||||
raw_cookies = solution.get("cookies", [])
|
||||
user_agent = solution.get("userAgent", "")
|
||||
content = page.content()
|
||||
if "AwsWafIntegration" in content:
|
||||
log.info("Got WAF challenge page, waiting for resolution...")
|
||||
page.wait_for_selector(
|
||||
"a.pli, .pli, [class*=propertyListing]", timeout=30000,
|
||||
)
|
||||
|
||||
cookies = {}
|
||||
for c in raw_cookies:
|
||||
name = c.get("name", "")
|
||||
if name:
|
||||
cookies[name] = c["value"]
|
||||
raw_cookies = context.cookies()
|
||||
user_agent = page.evaluate("navigator.userAgent")
|
||||
browser.close()
|
||||
|
||||
if not cookies:
|
||||
log.error("FlareSolverr solved but returned no cookies")
|
||||
flaresolverr_attempts_total.labels(result="no_cookies").inc()
|
||||
return None
|
||||
cookies = {c["name"]: c["value"] for c in raw_cookies}
|
||||
if "aws-waf-token" not in cookies:
|
||||
log.error("Playwright solved page but no aws-waf-token cookie found")
|
||||
flaresolverr_attempts_total.labels(result="no_cookies").inc()
|
||||
return None
|
||||
|
||||
log.info(
|
||||
"AWS WAF solved — got %d cookies, UA: %s",
|
||||
len(cookies), user_agent[:60],
|
||||
)
|
||||
flaresolverr_attempts_total.labels(result="success").inc()
|
||||
return cookies, user_agent
|
||||
log.info(
|
||||
"AWS WAF solved — got %d cookies, UA: %s",
|
||||
len(cookies), user_agent[:60],
|
||||
)
|
||||
flaresolverr_attempts_total.labels(result="success").inc()
|
||||
return cookies, user_agent
|
||||
|
||||
except (httpx.ConnectError, httpx.ReadTimeout) as e:
|
||||
log.warning("FlareSolverr not available: %s", e)
|
||||
flaresolverr_attempts_total.labels(result="unavailable").inc()
|
||||
return None
|
||||
except Exception as e:
|
||||
log.error("FlareSolverr error: %s", e)
|
||||
log.error("Playwright WAF solve failed: %s", e)
|
||||
flaresolverr_attempts_total.labels(result="error").inc()
|
||||
return None
|
||||
|
||||
|
||||
def load_cookies() -> tuple[dict[str, str], str] | None:
|
||||
"""Get OpenRent cookies + user-agent.
|
||||
Tries FlareSolverr first, then falls back to environment variables."""
|
||||
Tries Playwright first, then falls back to environment variables."""
|
||||
result = solve_waf()
|
||||
if result:
|
||||
return result
|
||||
|
|
@ -208,12 +197,18 @@ def fetch_page(
|
|||
|
||||
def _extract_price(text: str) -> tuple[int, str] | None:
|
||||
"""Extract price and frequency from text like '£1,500 pcm' or '£350 pw'.
|
||||
Returns (price_int, frequency) or None."""
|
||||
Returns (price_int, frequency) or None.
|
||||
|
||||
OpenRent card text shows both monthly and weekly prices (e.g.
|
||||
'£2,800 per month £646 per week'), so check monthly *before* weekly
|
||||
to match the first (monthly) price that the regex captures."""
|
||||
match = re.search(r"£([\d,]+)", text)
|
||||
if not match:
|
||||
return None
|
||||
price = int(match.group(1).replace(",", ""))
|
||||
lower = text.lower()
|
||||
if "pcm" in lower or "per month" in lower or "/m" in lower:
|
||||
return price, "monthly"
|
||||
if "pw" in lower or "per week" in lower or "/w" in lower:
|
||||
return price, "weekly"
|
||||
if "pa" in lower or "per annum" in lower or "/y" in lower:
|
||||
|
|
@ -368,8 +363,12 @@ def parse_property_detail(html: str) -> dict:
|
|||
elif "furnish" in label:
|
||||
details["furnished"] = value
|
||||
|
||||
# Extract postcode from page title / address heading
|
||||
title_tag = soup.select_one("h1, .property-title, [class*='title']")
|
||||
# Extract postcode from page title / address heading.
|
||||
# Prefer h1 (the actual property title) over generic [class*='title']
|
||||
# which may match nav elements like "Log in".
|
||||
title_tag = soup.select_one("h1")
|
||||
if not title_tag:
|
||||
title_tag = soup.select_one(".property-title, [class*='title']")
|
||||
if title_tag:
|
||||
title_text = title_tag.get_text(strip=True)
|
||||
details["title"] = title_text
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue