all good
This commit is contained in:
parent
47d89f6fad
commit
017902b8e6
82 changed files with 331466 additions and 54841 deletions
1
finder/.gitignore
vendored
Normal file
1
finder/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
data/
|
||||
|
|
@ -1,18 +0,0 @@
|
|||
FROM python:3.12-slim
|
||||
|
||||
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||
|
||||
WORKDIR /app
|
||||
COPY pyproject.toml ./
|
||||
RUN uv pip install --system -r pyproject.toml
|
||||
RUN playwright install-deps firefox
|
||||
RUN camoufox fetch \
|
||||
&& python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)"
|
||||
|
||||
COPY *.py ./
|
||||
COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet
|
||||
|
||||
HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
|
||||
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:1234/health')"
|
||||
|
||||
CMD ["python3", "main.py"]
|
||||
|
|
@ -1,8 +1,13 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
|
||||
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
|
||||
DATA_DIR = Path("/app/data")
|
||||
FINDER_DIR = Path(__file__).resolve().parent
|
||||
REPO_DIR = FINDER_DIR.parent
|
||||
|
||||
DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data")))
|
||||
ARCGIS_PATH = Path(
|
||||
os.environ.get("ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet"))
|
||||
)
|
||||
PAGE_SIZE = 24
|
||||
DELAY_BETWEEN_PAGES = 0.3
|
||||
DELAY_BETWEEN_OUTCODES = 0.5
|
||||
|
|
@ -10,42 +15,6 @@ MAX_RETRIES = 3
|
|||
RETRY_BASE_DELAY = 2.0
|
||||
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
|
||||
MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsing errors
|
||||
# Rent sanity bounds (monthly). Rents outside this range are nulled out — they are
|
||||
# almost always total-stay pricing (e.g. "Golf Open 2026" short lets), annual rents
|
||||
# mislabelled as monthly, or data errors.
|
||||
MIN_RENT_MONTHLY = 50 # below £50/month is implausible for any UK property
|
||||
MAX_RENT_MONTHLY = 25_000 # above £25k/month covers ultra-prime London; higher is suspect
|
||||
SEED = 42
|
||||
CHECKPOINT_INTERVAL = int(os.environ.get("CHECKPOINT_INTERVAL", "900")) # seconds
|
||||
|
||||
# Schedule: hour of day (UTC) to auto-run scrape. Set to -1 to disable.
|
||||
SCHEDULE_HOUR = int(os.environ.get("SCHEDULE_HOUR", "3"))
|
||||
# Whether to run a scrape immediately on startup
|
||||
RUN_ON_STARTUP = os.environ.get("RUN_ON_STARTUP", "").lower() in ("1", "true", "yes")
|
||||
# Enable/disable individual sources
|
||||
SCRAPE_RIGHTMOVE = os.environ.get("SCRAPE_RIGHTMOVE", "true").lower() in (
|
||||
"1",
|
||||
"true",
|
||||
"yes",
|
||||
)
|
||||
SCRAPE_HOMECOUK = os.environ.get("SCRAPE_HOMECOUK", "true").lower() in (
|
||||
"1",
|
||||
"true",
|
||||
"yes",
|
||||
)
|
||||
SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in (
|
||||
"1",
|
||||
"true",
|
||||
"yes",
|
||||
)
|
||||
SCRAPE_ZOOPLA = os.environ.get("SCRAPE_ZOOPLA", "true").lower() in (
|
||||
"1",
|
||||
"true",
|
||||
"yes",
|
||||
)
|
||||
|
||||
# URL to trigger server data reload after scrape (e.g. http://server:8001/api/reload)
|
||||
RELOAD_URL = os.environ.get("RELOAD_URL", "")
|
||||
|
||||
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
||||
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
|
||||
|
|
@ -55,14 +24,36 @@ RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
|||
HOMECOUK_BASE = "https://home.co.uk"
|
||||
HOMECOUK_API_BASE = f"{HOMECOUK_BASE}/api"
|
||||
HOMECOUK_PER_PAGE = 30 # max supported by the API
|
||||
HOMECOUK_CONCURRENCY = int(os.environ.get("HOMECOUK_CONCURRENCY", "4"))
|
||||
|
||||
# OpenRent
|
||||
OPENRENT_BASE = "https://www.openrent.co.uk"
|
||||
|
||||
# Zoopla
|
||||
ZOOPLA_BASE = "https://www.zoopla.co.uk"
|
||||
|
||||
# Greater London-ish postcode areas. This intentionally uses broad area
|
||||
# prefixes so a manual scrape can include central/inner London plus common
|
||||
# outer-London and near-London outcodes without maintaining a long borough list.
|
||||
LONDON_OUTCODE_PREFIXES = {
|
||||
"E",
|
||||
"EC",
|
||||
"N",
|
||||
"NW",
|
||||
"SE",
|
||||
"SW",
|
||||
"W",
|
||||
"WC",
|
||||
"BR",
|
||||
"CR",
|
||||
"DA",
|
||||
"EN",
|
||||
"HA",
|
||||
"IG",
|
||||
"KT",
|
||||
"RM",
|
||||
"SM",
|
||||
"TW",
|
||||
"UB",
|
||||
"WD",
|
||||
}
|
||||
|
||||
PROPERTY_TYPE_MAP = {
|
||||
"Detached": "Detached",
|
||||
"Semi-Detached": "Semi-Detached",
|
||||
|
|
@ -150,5 +141,4 @@ PROPERTY_TYPE_MAP = {
|
|||
|
||||
CHANNELS = [
|
||||
{"channel": "BUY", "transactionType": "BUY", "sortType": "2"},
|
||||
{"channel": "RENT", "transactionType": "LETTING", "sortType": "6"},
|
||||
]
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
Binary file not shown.
Binary file not shown.
|
|
@ -6,7 +6,6 @@ import re
|
|||
import time
|
||||
from urllib.parse import unquote
|
||||
|
||||
import httpx
|
||||
from curl_cffi.requests import Session
|
||||
from curl_cffi.requests.errors import RequestsError
|
||||
|
||||
|
|
@ -19,12 +18,6 @@ from constants import (
|
|||
PROPERTY_TYPE_MAP,
|
||||
RETRY_BASE_DELAY,
|
||||
)
|
||||
from metrics import (
|
||||
flaresolverr_attempts_total,
|
||||
homecouk_errors_total,
|
||||
homecouk_properties_scraped,
|
||||
homecouk_requests_total,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
|
||||
|
||||
|
|
@ -36,101 +29,73 @@ class CookiesExpiredError(Exception):
|
|||
|
||||
|
||||
# Channel mapping: internal name → URL path segment
|
||||
HOMECOUK_CHANNELS = {
|
||||
"BUY": "for-sale",
|
||||
"RENT": "to-rent",
|
||||
}
|
||||
|
||||
|
||||
FLARESOLVERR_URL = os.environ.get("FLARESOLVERR_URL", "http://flaresolverr:8191")
|
||||
|
||||
|
||||
def solve_cloudflare() -> tuple[dict[str, str], str] | None:
|
||||
"""Use FlareSolverr to solve the Cloudflare challenge.
|
||||
Returns (cookies_dict, user_agent) or None on failure."""
|
||||
log.info("Solving Cloudflare challenge via FlareSolverr at %s", FLARESOLVERR_URL)
|
||||
try:
|
||||
with httpx.Client(timeout=120) as client:
|
||||
resp = client.post(
|
||||
f"{FLARESOLVERR_URL}/v1",
|
||||
json={
|
||||
"cmd": "request.get",
|
||||
"url": f"{HOMECOUK_BASE}/for-sale/e1/",
|
||||
"maxTimeout": 60000,
|
||||
},
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
log.error("FlareSolverr returned HTTP %d", resp.status_code)
|
||||
return None
|
||||
|
||||
data = resp.json()
|
||||
if data.get("status") != "ok":
|
||||
log.error("FlareSolverr error: %s", data.get("message", "unknown"))
|
||||
return None
|
||||
|
||||
solution = data["solution"]
|
||||
raw_cookies = solution.get("cookies", [])
|
||||
user_agent = solution.get("userAgent", "")
|
||||
|
||||
# Pass through ALL cookies from FlareSolverr — different Cloudflare
|
||||
# configurations set different cookies (cf_clearance only appears when
|
||||
# a challenge is triggered; it's not needed if no challenge was detected)
|
||||
cookies = {}
|
||||
for c in raw_cookies:
|
||||
name = c.get("name", "")
|
||||
if name:
|
||||
cookies[name] = c["value"]
|
||||
|
||||
if not cookies:
|
||||
log.error("FlareSolverr solved but returned no cookies at all")
|
||||
flaresolverr_attempts_total.labels(result="no_cookies").inc()
|
||||
return None
|
||||
|
||||
log.info(
|
||||
"Cloudflare solved — got %d cookies, UA: %s",
|
||||
len(cookies),
|
||||
user_agent[:60],
|
||||
)
|
||||
flaresolverr_attempts_total.labels(result="success").inc()
|
||||
return cookies, user_agent
|
||||
|
||||
except (httpx.ConnectError, httpx.ReadTimeout) as e:
|
||||
log.warning("FlareSolverr not available: %s", e)
|
||||
flaresolverr_attempts_total.labels(result="unavailable").inc()
|
||||
return None
|
||||
except Exception as e:
|
||||
log.error("FlareSolverr error: %s", e)
|
||||
flaresolverr_attempts_total.labels(result="error").inc()
|
||||
return None
|
||||
HOMECOUK_URL_SEGMENT = "for-sale"
|
||||
|
||||
|
||||
def load_cookies() -> tuple[dict[str, str], str] | None:
|
||||
"""Get home.co.uk cookies + user-agent.
|
||||
Tries FlareSolverr first, then falls back to environment variables.
|
||||
Returns (cookies_dict, user_agent) or None if not configured."""
|
||||
# Try FlareSolverr first
|
||||
result = solve_cloudflare()
|
||||
if result:
|
||||
return result
|
||||
|
||||
# Fall back to env vars
|
||||
cf_clearance = os.environ.get("HOMECOUK_CF_CLEARANCE", "")
|
||||
session = os.environ.get("HOMECOUK_SESSION", "")
|
||||
if not cf_clearance or not session:
|
||||
return None
|
||||
Environment cookies are optional. When they are not present, bootstrap a
|
||||
regular local session by visiting home.co.uk with curl_cffi's Chrome
|
||||
impersonation and reusing the cookies set by the site.
|
||||
"""
|
||||
user_agent = os.environ.get(
|
||||
"HOMECOUK_USER_AGENT",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/145.0.0.0 Safari/537.36",
|
||||
)
|
||||
return {"cf_clearance": cf_clearance, "homecouk_session": session}, user_agent
|
||||
|
||||
env_cookies = {
|
||||
name: value
|
||||
for name, value in {
|
||||
"cf_clearance": os.environ.get("HOMECOUK_CF_CLEARANCE", ""),
|
||||
"homecouk_session": os.environ.get("HOMECOUK_SESSION", ""),
|
||||
"XSRF-TOKEN": os.environ.get("HOMECOUK_XSRF_TOKEN", ""),
|
||||
}.items()
|
||||
if value
|
||||
}
|
||||
if env_cookies.get("homecouk_session"):
|
||||
return env_cookies, user_agent
|
||||
|
||||
session = Session(impersonate="chrome")
|
||||
session.headers.update(
|
||||
{
|
||||
"User-Agent": user_agent,
|
||||
"Accept": (
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,"
|
||||
"*/*;q=0.8"
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
for url in (HOMECOUK_BASE, f"{HOMECOUK_BASE}/for-sale/br1/"):
|
||||
try:
|
||||
response = session.get(url, timeout=30)
|
||||
except RequestsError as exc:
|
||||
log.warning("home.co.uk cookie bootstrap failed for %s: %s", url, exc)
|
||||
continue
|
||||
if response.status_code == 403:
|
||||
raise CookiesExpiredError("home.co.uk returned HTTP 403 during bootstrap")
|
||||
if response.status_code >= 400:
|
||||
log.warning(
|
||||
"home.co.uk cookie bootstrap got HTTP %d from %s",
|
||||
response.status_code,
|
||||
url,
|
||||
)
|
||||
|
||||
cookies = session.cookies.get_dict()
|
||||
if cookies.get("homecouk_session") and cookies.get("XSRF-TOKEN"):
|
||||
log.info("home.co.uk local session bootstrapped")
|
||||
return cookies, user_agent
|
||||
|
||||
log.warning("home.co.uk did not provide session cookies during bootstrap")
|
||||
return None
|
||||
|
||||
|
||||
def make_client(cookies: dict[str, str], user_agent: str) -> Session:
|
||||
"""Create a curl_cffi Session configured for home.co.uk API calls.
|
||||
Uses Chrome TLS impersonation so cf_clearance cookies (which are bound
|
||||
to Chrome's JA3 fingerprint from FlareSolverr) remain valid."""
|
||||
Uses Chrome TLS impersonation so browser-derived cookies remain valid."""
|
||||
session = Session(impersonate="chrome")
|
||||
session.headers.update(
|
||||
{
|
||||
|
|
@ -150,12 +115,6 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session:
|
|||
return session
|
||||
|
||||
|
||||
def _status_label(code: int) -> str:
|
||||
if code >= 500:
|
||||
return "5xx"
|
||||
return str(code)
|
||||
|
||||
|
||||
def fetch_page(
|
||||
client: Session, url: str, params: dict, max_retries: int = 3
|
||||
) -> dict | None:
|
||||
|
|
@ -164,12 +123,10 @@ def fetch_page(
|
|||
for attempt in range(max_retries):
|
||||
try:
|
||||
resp = client.get(url, params=params, timeout=30)
|
||||
homecouk_requests_total.labels(status=_status_label(resp.status_code)).inc()
|
||||
if resp.status_code == 200:
|
||||
try:
|
||||
return resp.json()
|
||||
except json.JSONDecodeError:
|
||||
homecouk_errors_total.labels(type="json_decode").inc()
|
||||
log.error(
|
||||
"Non-JSON response from %s (got %s)",
|
||||
url,
|
||||
|
|
@ -195,7 +152,6 @@ def fetch_page(
|
|||
except CookiesExpiredError:
|
||||
raise
|
||||
except RequestsError as e:
|
||||
homecouk_errors_total.labels(type=type(e).__name__).inc()
|
||||
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
|
||||
log.warning(
|
||||
"%s from %s, retry %d/%d in %.1fs",
|
||||
|
|
@ -206,7 +162,6 @@ def fetch_page(
|
|||
delay,
|
||||
)
|
||||
time.sleep(delay)
|
||||
homecouk_errors_total.labels(type="retry_exhausted").inc()
|
||||
log.error("All %d retries exhausted for %s", max_retries, url)
|
||||
return None
|
||||
|
||||
|
|
@ -301,7 +256,6 @@ def map_property_type(raw_type: str | None) -> str:
|
|||
|
||||
def transform_property(
|
||||
prop: dict,
|
||||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
) -> dict | None:
|
||||
"""Transform a raw home.co.uk property dict into our output schema."""
|
||||
|
|
@ -365,7 +319,7 @@ def transform_property(
|
|||
"Property type": map_property_type(listing_type),
|
||||
"Property sub-type": normalize_sub_type(listing_type),
|
||||
"price": int(price),
|
||||
"price_frequency": "" if channel == "BUY" else "monthly",
|
||||
"price_frequency": "",
|
||||
"Price qualifier": price_qualifier,
|
||||
"Total floor area (sqm)": parse_floor_area(prop.get("description")),
|
||||
"Listing URL": f"{HOMECOUK_BASE}/property/{listing_id}",
|
||||
|
|
@ -377,13 +331,11 @@ def transform_property(
|
|||
def search_outcode(
|
||||
client: Session,
|
||||
outcode: str,
|
||||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
max_properties: int | None = None,
|
||||
) -> list[dict]:
|
||||
"""Paginate through search results for one outcode+channel.
|
||||
channel: "BUY" or "RENT".
|
||||
Returns transformed properties."""
|
||||
url_segment = HOMECOUK_CHANNELS[channel]
|
||||
"""Paginate through sale search results for one outcode."""
|
||||
url_segment = HOMECOUK_URL_SEGMENT
|
||||
url = f"{HOMECOUK_API_BASE}/{url_segment}/{outcode.lower()}/"
|
||||
properties = []
|
||||
page = 1
|
||||
|
|
@ -410,12 +362,11 @@ def search_outcode(
|
|||
break
|
||||
|
||||
for prop in raw_props:
|
||||
transformed = transform_property(prop, channel, pc_index)
|
||||
transformed = transform_property(prop, pc_index)
|
||||
if transformed:
|
||||
properties.append(transformed)
|
||||
homecouk_properties_scraped.labels(
|
||||
channel="buy" if channel == "BUY" else "rent",
|
||||
).inc()
|
||||
if max_properties is not None and len(properties) >= max_properties:
|
||||
return properties
|
||||
|
||||
# Check pagination
|
||||
pagination = data.get("pagination", {})
|
||||
|
|
|
|||
|
|
@ -1,13 +1,11 @@
|
|||
import logging
|
||||
import random
|
||||
import threading
|
||||
import time
|
||||
|
||||
import httpx
|
||||
from fake_useragent import UserAgent
|
||||
|
||||
from constants import MAX_RETRIES, RETRY_BASE_DELAY
|
||||
from metrics import http_errors_total, http_requests_total, ip_rotations_total
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
|
@ -16,83 +14,6 @@ _ua = UserAgent(
|
|||
)
|
||||
|
||||
|
||||
def _endpoint_label(url: str) -> str:
|
||||
if "typeahead" in url:
|
||||
return "typeahead"
|
||||
if "search" in url:
|
||||
return "search"
|
||||
return "other"
|
||||
|
||||
|
||||
def _status_label(code: int) -> str:
|
||||
if code >= 500:
|
||||
return "5xx"
|
||||
return str(code)
|
||||
|
||||
|
||||
# Gluetun control API — runs on port 8000 inside the gluetun container.
|
||||
# Since finder uses network_mode: service:gluetun, localhost IS gluetun.
|
||||
GLUETUN_API = "http://127.0.0.1:8000"
|
||||
_ip_rotate_lock = threading.Lock()
|
||||
|
||||
|
||||
def rotate_ip() -> bool:
|
||||
"""Ask gluetun to reconnect to a different VPN server, getting a new IP.
|
||||
Returns True if the IP changed successfully."""
|
||||
with _ip_rotate_lock:
|
||||
log.info("Rotating VPN IP via gluetun...")
|
||||
try:
|
||||
# Get current IP
|
||||
with httpx.Client(timeout=10) as ctl:
|
||||
old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
|
||||
old_ip = (
|
||||
old_ip_resp.json().get("public_ip", "unknown")
|
||||
if old_ip_resp.status_code == 200
|
||||
else "unknown"
|
||||
)
|
||||
log.info("Current IP: %s", old_ip)
|
||||
|
||||
# Trigger server change — PUT with empty JSON body picks a random server
|
||||
resp = ctl.put(
|
||||
f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"}
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text)
|
||||
return False
|
||||
time.sleep(2)
|
||||
|
||||
resp = ctl.put(
|
||||
f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"}
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
log.error("Failed to start VPN: %d %s", resp.status_code, resp.text)
|
||||
return False
|
||||
|
||||
# Wait for reconnection
|
||||
for _ in range(30):
|
||||
time.sleep(2)
|
||||
try:
|
||||
with httpx.Client(timeout=10) as ctl:
|
||||
new_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
|
||||
if new_ip_resp.status_code == 200:
|
||||
new_ip = new_ip_resp.json().get("public_ip", "")
|
||||
if new_ip and new_ip != old_ip:
|
||||
log.info("IP rotated: %s → %s", old_ip, new_ip)
|
||||
ip_rotations_total.labels(result="success").inc()
|
||||
return True
|
||||
except Exception:
|
||||
pass # VPN still reconnecting
|
||||
|
||||
log.warning("IP rotation timed out (may still be same IP)")
|
||||
ip_rotations_total.labels(result="failure").inc()
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
log.error("IP rotation failed: %s", e)
|
||||
ip_rotations_total.labels(result="failure").inc()
|
||||
return False
|
||||
|
||||
|
||||
def make_client() -> httpx.Client:
|
||||
return httpx.Client(
|
||||
timeout=30,
|
||||
|
|
@ -104,23 +25,18 @@ def make_client() -> httpx.Client:
|
|||
def fetch_with_retry(
|
||||
client: httpx.Client, url: str, params: dict | None = None, on_403: bool = True
|
||||
) -> dict | None:
|
||||
"""GET JSON with retries on 429/5xx/connection errors. Returns None on permanent failure.
|
||||
On 403, triggers IP rotation and retries once."""
|
||||
endpoint = _endpoint_label(url)
|
||||
"""GET JSON with retries on 429/5xx/connection errors.
|
||||
|
||||
Returns None on permanent failure. The on_403 argument is kept for
|
||||
compatibility with older callers; 403 is now treated as non-retryable.
|
||||
"""
|
||||
for attempt in range(MAX_RETRIES):
|
||||
try:
|
||||
resp = client.get(url, params=params)
|
||||
http_requests_total.labels(
|
||||
status=_status_label(resp.status_code), endpoint=endpoint
|
||||
).inc()
|
||||
if resp.status_code == 200:
|
||||
return resp.json()
|
||||
if resp.status_code == 403 and on_403:
|
||||
log.warning("HTTP 403 — IP likely blocked, rotating...")
|
||||
if rotate_ip():
|
||||
# Retry once with new IP (but don't recurse on 403 again)
|
||||
return fetch_with_retry(client, url, params, on_403=False)
|
||||
log.error("IP rotation failed, giving up on %s", url)
|
||||
log.error("HTTP 403 from %s (forbidden)", url)
|
||||
return None
|
||||
if resp.status_code in (429, 500, 502, 503, 504):
|
||||
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
|
||||
|
|
@ -142,7 +58,6 @@ def fetch_with_retry(
|
|||
httpx.WriteTimeout,
|
||||
httpx.PoolTimeout,
|
||||
) as e:
|
||||
http_errors_total.labels(type=type(e).__name__).inc()
|
||||
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
|
||||
log.warning(
|
||||
"%s from %s, retry %d/%d in %.1fs",
|
||||
|
|
@ -153,6 +68,5 @@ def fetch_with_retry(
|
|||
delay,
|
||||
)
|
||||
time.sleep(delay)
|
||||
http_errors_total.labels(type="retry_exhausted").inc()
|
||||
log.error("All %d retries exhausted for %s", MAX_RETRIES, url)
|
||||
return None
|
||||
|
|
|
|||
319
finder/main.py
319
finder/main.py
|
|
@ -1,211 +1,166 @@
|
|||
import argparse
|
||||
import logging
|
||||
import threading
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from flask import Flask, Response, jsonify, send_from_directory
|
||||
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
|
||||
from constants import DATA_DIR
|
||||
|
||||
from constants import (
|
||||
DATA_DIR,
|
||||
RUN_ON_STARTUP,
|
||||
SCHEDULE_HOUR,
|
||||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
SCRAPE_RIGHTMOVE,
|
||||
SCRAPE_ZOOPLA,
|
||||
)
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from openrent import load_cookies as load_openrent_cookies
|
||||
from rightmove import outcode_cache
|
||||
from scraper import (
|
||||
_sync_gauges,
|
||||
build_postcode_coords,
|
||||
build_postcode_index,
|
||||
load_outcodes,
|
||||
run_scrape,
|
||||
status,
|
||||
status_lock,
|
||||
|
||||
SOURCE_CHOICES = ("rightmove", "homecouk", "zoopla", "all")
|
||||
TEST_MAX_PROPERTIES_PER_SOURCE = 100
|
||||
TEST_OUTCODES = (
|
||||
"E1",
|
||||
"N1",
|
||||
"NW1",
|
||||
"SE1",
|
||||
"SW1",
|
||||
"W1",
|
||||
"WC1",
|
||||
"BR1",
|
||||
"CR0",
|
||||
"TW1",
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
LOG_DIR = Path("/app/data")
|
||||
LOG_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
handlers=[
|
||||
logging.StreamHandler(),
|
||||
logging.FileHandler(LOG_DIR / "rightmove.log"),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("rightmove")
|
||||
log.setLevel(logging.DEBUG)
|
||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
||||
log = logging.getLogger("finder")
|
||||
|
||||
|
||||
# Suppress noisy /metrics and /health request logs from werkzeug
|
||||
class _NoiseFilter(logging.Filter):
|
||||
def filter(self, record):
|
||||
msg = record.getMessage()
|
||||
return "GET /metrics" not in msg and "GET /health" not in msg
|
||||
def configure_standalone_runtime() -> None:
|
||||
"""Keep browser/cache/temp files on the project volume for local runs."""
|
||||
runtime_dir = DATA_DIR / ".runtime"
|
||||
cache_dir = runtime_dir / "cache"
|
||||
temp_dir = runtime_dir / "tmp"
|
||||
cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
temp_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
os.environ.setdefault("XDG_CACHE_HOME", str(cache_dir))
|
||||
os.environ.setdefault("TMPDIR", str(temp_dir))
|
||||
tempfile.tempdir = str(temp_dir)
|
||||
|
||||
|
||||
logging.getLogger("werkzeug").addFilter(_NoiseFilter())
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Startup: load data
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
log.info("Loading arcgis data...")
|
||||
OUTCODES = load_outcodes()
|
||||
PC_INDEX = build_postcode_index()
|
||||
PC_COORDS = build_postcode_coords() if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) else None
|
||||
log.info(
|
||||
"Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s, zoopla=%s)",
|
||||
len(OUTCODES),
|
||||
SCRAPE_RIGHTMOVE,
|
||||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
SCRAPE_ZOOPLA,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scheduler
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _start_scrape() -> bool:
|
||||
"""Try to start a scrape. Returns True if started, False if already running."""
|
||||
with status_lock:
|
||||
if status.state == "running":
|
||||
return False
|
||||
status.state = "running"
|
||||
thread = threading.Thread(
|
||||
target=run_scrape, args=(OUTCODES, PC_INDEX, PC_COORDS), daemon=True
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run a manual Greater London-ish property scrape."
|
||||
)
|
||||
thread.start()
|
||||
return True
|
||||
parser.add_argument(
|
||||
"--source",
|
||||
choices=SOURCE_CHOICES,
|
||||
default="all",
|
||||
help="Portal to scrape. 'all' runs Rightmove, home.co.uk, and Zoopla.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=Path,
|
||||
default=DATA_DIR,
|
||||
help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--limit-outcodes",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Limit outcodes for a quick manual smoke test.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-properties-per-source",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Stop each source after this many transformed listings.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--test",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Run a small standalone smoke test: use likely London outcodes and "
|
||||
f"fetch at most {TEST_MAX_PROPERTIES_PER_SOURCE} listings per source."
|
||||
),
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def _seconds_until(hour: int) -> float:
|
||||
"""Seconds from now until the next occurrence of `hour`:00 UTC."""
|
||||
now = datetime.now(timezone.utc)
|
||||
target = now.replace(hour=hour, minute=0, second=0, microsecond=0)
|
||||
if target <= now:
|
||||
target += timedelta(days=1)
|
||||
return (target - now).total_seconds()
|
||||
def configure_logging() -> None:
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
)
|
||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def _scheduler_loop() -> None:
|
||||
"""Background thread that triggers a daily scrape at SCHEDULE_HOUR UTC."""
|
||||
log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR)
|
||||
while True:
|
||||
wait = _seconds_until(SCHEDULE_HOUR)
|
||||
log.info(
|
||||
"Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600
|
||||
)
|
||||
time.sleep(wait)
|
||||
log.info("Scheduled scrape triggered")
|
||||
if not _start_scrape():
|
||||
log.warning("Scheduled scrape skipped — already running")
|
||||
def selected_sources(source: str) -> list[str]:
|
||||
if source == "all":
|
||||
return ["rightmove", "homecouk", "zoopla"]
|
||||
return [source]
|
||||
|
||||
|
||||
if RUN_ON_STARTUP:
|
||||
log.info("RUN_ON_STARTUP=true — starting initial scrape")
|
||||
_start_scrape()
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
configure_standalone_runtime()
|
||||
configure_logging()
|
||||
|
||||
if SCHEDULE_HOUR >= 0:
|
||||
scheduler = threading.Thread(target=_scheduler_loop, daemon=True)
|
||||
scheduler.start()
|
||||
if args.limit_outcodes is not None and args.limit_outcodes < 1:
|
||||
raise SystemExit("--limit-outcodes must be greater than zero")
|
||||
if (
|
||||
args.max_properties_per_source is not None
|
||||
and args.max_properties_per_source < 1
|
||||
):
|
||||
raise SystemExit("--max-properties-per-source must be greater than zero")
|
||||
|
||||
output_dir = args.output_dir.expanduser().resolve()
|
||||
if args.test and args.output_dir == DATA_DIR:
|
||||
output_dir = (DATA_DIR / "test").expanduser().resolve()
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Flask app
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
@app.route("/health")
|
||||
def health():
|
||||
return "ok", 200
|
||||
|
||||
|
||||
@app.route("/run", methods=["POST"])
|
||||
def trigger_run():
|
||||
if _start_scrape():
|
||||
return jsonify({"message": "Scrape started"}), 200
|
||||
return jsonify({"error": "Scrape already running"}), 409
|
||||
|
||||
|
||||
@app.route("/status")
|
||||
def get_status():
|
||||
with status_lock:
|
||||
elapsed = 0.0
|
||||
if status.started_at:
|
||||
end = status.finished_at if status.finished_at else time.time()
|
||||
elapsed = end - status.started_at
|
||||
resp = {
|
||||
"state": status.state,
|
||||
"channel": status.channel,
|
||||
"outcode": status.outcode,
|
||||
"outcodes_done": status.outcodes_done,
|
||||
"outcodes_total": status.outcodes_total,
|
||||
"properties_buy": status.properties_buy,
|
||||
"properties_rent": status.properties_rent,
|
||||
"properties_by_source": {
|
||||
"rightmove": status.rm_properties,
|
||||
"homecouk": status.hk_properties,
|
||||
"openrent": status.or_properties,
|
||||
"zoopla": status.zp_properties,
|
||||
},
|
||||
"errors": status.errors[-20:], # last 20 errors
|
||||
"elapsed_seconds": round(elapsed, 1),
|
||||
}
|
||||
if SCHEDULE_HOUR >= 0:
|
||||
resp["next_scrape_in_seconds"] = round(_seconds_until(SCHEDULE_HOUR))
|
||||
return jsonify(resp)
|
||||
|
||||
|
||||
@app.route("/debug")
|
||||
def get_debug():
|
||||
hk_cookies = load_homecouk_cookies() if SCRAPE_HOMECOUK else None
|
||||
or_cookies = load_openrent_cookies() if SCRAPE_OPENRENT else None
|
||||
return jsonify(
|
||||
{
|
||||
"outcode_cache_size": len(outcode_cache),
|
||||
"outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
|
||||
"scrape_rightmove": SCRAPE_RIGHTMOVE,
|
||||
"scrape_homecouk": SCRAPE_HOMECOUK,
|
||||
"scrape_openrent": SCRAPE_OPENRENT,
|
||||
"scrape_zoopla": SCRAPE_ZOOPLA,
|
||||
"homecouk_cookies_available": hk_cookies is not None,
|
||||
"openrent_cookies_available": or_cookies is not None,
|
||||
"zoopla_note": "browser-based (Camoufox), no cookies needed",
|
||||
}
|
||||
from scraper import (
|
||||
build_postcode_coords,
|
||||
build_postcode_index,
|
||||
load_outcodes,
|
||||
run_scrape,
|
||||
)
|
||||
|
||||
outcodes = load_outcodes()
|
||||
if args.test and args.limit_outcodes is None:
|
||||
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
|
||||
if preferred:
|
||||
outcodes = preferred
|
||||
if args.limit_outcodes is not None:
|
||||
outcodes = outcodes[: args.limit_outcodes]
|
||||
|
||||
@app.route("/metrics")
|
||||
def metrics():
|
||||
with status_lock:
|
||||
_sync_gauges()
|
||||
return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST)
|
||||
if not outcodes:
|
||||
raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")
|
||||
|
||||
sources = selected_sources(args.source)
|
||||
max_properties_per_source = args.max_properties_per_source
|
||||
if args.test and max_properties_per_source is None:
|
||||
max_properties_per_source = TEST_MAX_PROPERTIES_PER_SOURCE
|
||||
|
||||
@app.route("/data/<filename>")
|
||||
def serve_data(filename):
|
||||
if not filename.endswith(".parquet"):
|
||||
return jsonify({"error": "Only parquet files served"}), 400
|
||||
return send_from_directory(DATA_DIR, filename)
|
||||
log.info(
|
||||
"Starting sale scrape: source=%s outcodes=%d output_dir=%s test=%s",
|
||||
args.source,
|
||||
len(outcodes),
|
||||
output_dir,
|
||||
args.test,
|
||||
)
|
||||
started = time.monotonic()
|
||||
|
||||
pc_index = build_postcode_index()
|
||||
pc_coords = build_postcode_coords() if "zoopla" in sources else None
|
||||
result = run_scrape(
|
||||
outcodes,
|
||||
pc_index,
|
||||
pc_coords=pc_coords,
|
||||
sources=sources,
|
||||
output_dir=output_dir,
|
||||
max_properties_per_source=max_properties_per_source,
|
||||
)
|
||||
|
||||
elapsed = time.monotonic() - started
|
||||
log.info("Scrape finished in %.1fs", elapsed)
|
||||
log.info("Result: %s", result)
|
||||
if args.test and result.get("errors"):
|
||||
raise SystemExit("Test scrape failed; see errors in the result above.")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run(host="0.0.0.0", port=1234, debug=False)
|
||||
raise SystemExit(main())
|
||||
|
|
|
|||
|
|
@ -1,167 +0,0 @@
|
|||
from prometheus_client import Counter, Gauge
|
||||
|
||||
# ---------------------------------------------------------------------------
# Gauges: snapshot of the scrape currently in progress
# ---------------------------------------------------------------------------

scrape_state = Gauge(
    "scrape_state",
    "Current scrape state as a labeled gauge (1 = active)",
    labelnames=["state"],
)
scrape_outcodes_done = Gauge("scrape_outcodes_done", "Outcodes processed in current channel")
scrape_outcodes_total = Gauge("scrape_outcodes_total", "Total outcodes in current channel")
scrape_properties_total = Gauge(
    "scrape_properties_total",
    "Properties found so far",
    labelnames=["channel", "source"],
)
scrape_elapsed_seconds = Gauge("scrape_elapsed_seconds", "Seconds since scrape started")

# ---------------------------------------------------------------------------
# Counters: Rightmove HTTP traffic, plus IP rotation and per-outcode errors
# ---------------------------------------------------------------------------

http_requests_total = Counter(
    "http_requests_total",
    "HTTP requests made to Rightmove",
    labelnames=["status", "endpoint"],
)
http_errors_total = Counter(
    "http_errors_total",
    "Rightmove HTTP connection/timeout errors",
    labelnames=["type"],
)
ip_rotations_total = Counter(
    "ip_rotations_total",
    "VPN IP rotation attempts",
    labelnames=["result"],
)
scrape_errors_total = Counter(
    "scrape_errors_total",
    "Per-outcode scrape errors",
    labelnames=["source"],
)

# ---------------------------------------------------------------------------
# Counters: home.co.uk and cross-source de-duplication
# ---------------------------------------------------------------------------

homecouk_requests_total = Counter(
    "homecouk_requests_total",
    "HTTP requests made to home.co.uk API",
    labelnames=["status"],
)
homecouk_errors_total = Counter(
    "homecouk_errors_total",
    "home.co.uk HTTP connection/timeout errors",
    labelnames=["type"],
)
homecouk_properties_scraped = Counter(
    "homecouk_properties_scraped",
    "Properties scraped from home.co.uk (before dedup)",
    labelnames=["channel"],
)
cross_source_dedup_total = Counter(
    "cross_source_dedup_total",
    "Properties skipped because same property already found on another source",
    labelnames=["channel"],
)

# ---------------------------------------------------------------------------
# Counters: OpenRent
# ---------------------------------------------------------------------------

openrent_requests_total = Counter(
    "openrent_requests_total",
    "HTTP requests made to OpenRent",
    labelnames=["status"],
)
openrent_errors_total = Counter(
    "openrent_errors_total",
    "OpenRent HTTP connection/timeout errors",
    labelnames=["type"],
)
openrent_properties_scraped = Counter(
    "openrent_properties_scraped",
    "Properties scraped from OpenRent (before dedup)",
    labelnames=["channel"],
)

# ---------------------------------------------------------------------------
# Counters: Zoopla
# ---------------------------------------------------------------------------

zoopla_pages_scraped = Counter(
    "zoopla_pages_scraped",
    "Search result pages scraped from Zoopla",
    labelnames=["channel"],
)
zoopla_errors_total = Counter(
    "zoopla_errors_total",
    "Zoopla scraping errors",
    labelnames=["type"],
)
zoopla_properties_scraped = Counter(
    "zoopla_properties_scraped",
    "Properties scraped from Zoopla (before dedup)",
    labelnames=["channel"],
)

# ---------------------------------------------------------------------------
# Counters: challenge solving and cookie refreshes
# ---------------------------------------------------------------------------

flaresolverr_attempts_total = Counter(
    "flaresolverr_attempts_total",
    "FlareSolverr Cloudflare challenge-solving attempts",
    labelnames=["result"],
)
cookie_refreshes_total = Counter(
    "cookie_refreshes_total",
    "home.co.uk cookie refresh attempts (triggered by 403)",
    labelnames=["result"],
)

# ---------------------------------------------------------------------------
# Gauges: per-source on/off switches (1 = scraping active, 0 = disabled)
# ---------------------------------------------------------------------------

homecouk_enabled = Gauge(
    "homecouk_enabled",
    "Whether home.co.uk scraping is currently active (1=yes, 0=no)",
)
openrent_enabled = Gauge(
    "openrent_enabled",
    "Whether OpenRent scraping is currently active (1=yes, 0=no)",
)
zoopla_enabled = Gauge(
    "zoopla_enabled",
    "Whether Zoopla scraping is currently active (1=yes, 0=no)",
)
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
|
||||
Request the following URL, passing the outcode as `location-id` and the page number as `page`. For example, for E13, page 2:
|
||||
|
||||
https://www.onthemarket.com/async/search/properties-v2/?search-type=for-sale&location-id=e13&page=2&view=map-list
|
||||
|
||||
and the response is in [[response.json]]
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,869 +0,0 @@
|
|||
"""OpenRent (openrent.co.uk) scraper — rental properties only.
|
||||
|
||||
OpenRent is behind AWS WAF, so we use Playwright (headless Chromium) to solve
|
||||
the challenge and get valid cookies. Then we use curl_cffi with Chrome TLS
|
||||
impersonation to make requests with those cookies.
|
||||
|
||||
OpenRent is a rental-only platform, so this scraper only handles RENT channel.
|
||||
|
||||
HTML structure (as of 2026-03):
|
||||
Search results page renders property cards as <a class="pli search-property-card">.
|
||||
Each card contains:
|
||||
- Monthly price in <div class="pim"> with <span class="text-primary">£X,XXX</span>
|
||||
- Weekly price in <div class="piw"> (hidden by Alpine.js)
|
||||
- Title in <div class="fw-medium text-primary fs-3">N Bed Type, Location, OUTCODE</div>
|
||||
- Features in <ul> with <li> items like "1 Bed", "1 Bath", "Furnished"
|
||||
- Listing ID in data-listing-id on the .or-swiper div
|
||||
- Description snippet in <div class="line-clamp-2">
|
||||
|
||||
Detail page has:
|
||||
- <h1> with property title including outcode
|
||||
- <div id="map" data-lat="..." data-lng="..."> for coordinates
|
||||
- Tables with deposit, rent, furnishing, tenant preferences
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from curl_cffi.requests import Session
|
||||
from curl_cffi.requests.errors import RequestsError
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
from constants import (
|
||||
DELAY_BETWEEN_PAGES,
|
||||
MAX_BEDROOMS,
|
||||
OPENRENT_BASE,
|
||||
PROPERTY_TYPE_MAP,
|
||||
RETRY_BASE_DELAY,
|
||||
)
|
||||
from metrics import (
|
||||
flaresolverr_attempts_total,
|
||||
openrent_errors_total,
|
||||
openrent_properties_scraped,
|
||||
openrent_requests_total,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
|
||||
|
||||
log = logging.getLogger("openrent")
|
||||
|
||||
|
||||
class WafChallengeError(Exception):
    """Signal that OpenRent answered with a WAF challenge, so its cookies need refreshing."""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cookie / session management via Playwright
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def solve_waf() -> tuple[dict[str, str], str] | None:
    """Use Playwright (headless Chromium) to solve the AWS WAF challenge.

    Opens an OpenRent search page in a fresh browser context. If the page
    content contains the "AwsWafIntegration" marker, waits for a property-card
    selector to appear, then reads the context's cookies and the browser's
    user agent.

    Returns:
        (cookies_dict, user_agent) when an "aws-waf-token" cookie was obtained,
        where cookies_dict maps cookie name to value. None when that cookie is
        missing or when any exception is raised along the way.
    """
    log.info("Solving AWS WAF challenge via Playwright")
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-blink-features=AutomationControlled"],
            )
            context = browser.new_context()
            page = context.new_page()

            url = f"{OPENRENT_BASE}/properties-to-rent/?term=london&isLive=true"
            log.info("Navigating to %s", url)
            page.goto(url, wait_until="domcontentloaded", timeout=60000)

            content = page.content()
            if "AwsWafIntegration" in content:
                # Challenge page served: block until real listing cards show up
                # (or the 30 s timeout raises and lands in the except below).
                log.info("Got WAF challenge page, waiting for resolution...")
                page.wait_for_selector(
                    "a.pli, .pli, .search-property-card",
                    timeout=30000,
                )

            # Cookies and UA must be read before the browser is closed.
            raw_cookies = context.cookies()
            user_agent = page.evaluate("navigator.userAgent")
            browser.close()

        cookies = {c["name"]: c["value"] for c in raw_cookies}
        if "aws-waf-token" not in cookies:
            log.error("Playwright solved page but no aws-waf-token cookie found")
            # NOTE(review): the counter is named for FlareSolverr but is
            # incremented here for Playwright attempts — confirm this is intended.
            flaresolverr_attempts_total.labels(result="no_cookies").inc()
            return None

        log.info(
            "AWS WAF solved — got %d cookies, UA: %s",
            len(cookies),
            user_agent[:60],
        )
        flaresolverr_attempts_total.labels(result="success").inc()
        return cookies, user_agent

    except Exception as e:
        # Best-effort boundary: any failure is logged and reported as None.
        log.error("Playwright WAF solve failed: %s", e)
        flaresolverr_attempts_total.labels(result="error").inc()
        return None
|
||||
|
||||
|
||||
def load_cookies() -> tuple[dict[str, str], str] | None:
    """Obtain OpenRent cookies and a matching user-agent string.

    The Playwright solver is tried first. If it yields nothing, the
    OPENRENT_WAF_TOKEN environment variable is used instead, paired with
    OPENRENT_USER_AGENT or a built-in Chrome user agent.

    Returns:
        (cookies, user_agent), or None when neither source provides a token.
    """
    solved = solve_waf()
    if solved:
        return solved

    # Environment fallback: a pre-obtained token supplied by the operator.
    token = os.environ.get("OPENRENT_WAF_TOKEN", "")
    if token:
        default_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/145.0.0.0 Safari/537.36"
        )
        agent = os.environ.get("OPENRENT_USER_AGENT", default_agent)
        return {"aws-waf-token": token}, agent
    return None
|
||||
|
||||
|
||||
def make_client(cookies: dict[str, str], user_agent: str) -> Session:
    """Build a curl_cffi Session for OpenRent requests.

    The session impersonates Chrome, sends the given user agent with
    browser-like Accept headers, and carries every supplied cookie scoped to
    the openrent.co.uk domain.
    """
    client = Session(impersonate="chrome")
    browser_headers = {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-GB,en;q=0.9",
    }
    client.headers.update(browser_headers)
    for cookie_name in cookies:
        client.cookies.set(cookie_name, cookies[cookie_name], domain="openrent.co.uk")
    return client
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTTP fetch with retry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _status_label(code: int) -> str:
    """Return the metric label for an HTTP status: "5xx" for 500 and above, else the code itself."""
    return "5xx" if code >= 500 else str(code)
|
||||
|
||||
|
||||
def fetch_page(
    client: Session,
    url: str,
    max_retries: int = 3,
) -> str | None:
    """GET *url* and return its HTML, retrying transient failures with backoff.

    HTTP 429/5xx responses and curl_cffi request errors are retried with an
    exponential delay (RETRY_BASE_DELAY * 2**attempt). No delay is spent after
    the final attempt, because nothing follows it.

    Args:
        client: Session used to issue the request.
        url: URL to fetch.
        max_retries: Total number of attempts before giving up.

    Returns:
        The response text for an HTTP 200 that is not a WAF challenge page, or
        None for a non-retryable status or once every attempt has failed.

    Raises:
        WafChallengeError: On HTTP 202/403, or on a 200 whose body contains
            both the "AwsWafIntegration" and "challenge.js" markers.
    """
    for attempt in range(max_retries):
        try:
            resp = client.get(url, timeout=30)
            openrent_requests_total.labels(status=_status_label(resp.status_code)).inc()

            if resp.status_code == 200:
                html = resp.text
                # Detect WAF challenge page masquerading as 200
                if "AwsWafIntegration" in html and "challenge.js" in html:
                    raise WafChallengeError(
                        "Got AWS WAF challenge page — cookies expired"
                    )
                return html

            if resp.status_code in (202, 403):
                raise WafChallengeError(
                    f"HTTP {resp.status_code} — cookies likely expired"
                )

            if resp.status_code not in (429, 500, 502, 503, 504):
                log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
                return None

            failure = f"HTTP {resp.status_code}"

        except WafChallengeError:
            raise
        except RequestsError as e:
            failure = type(e).__name__
            openrent_errors_total.labels(type=failure).inc()

        if attempt + 1 >= max_retries:
            # Last attempt: sleeping here would only postpone the failure.
            log.warning(
                "%s from %s on final attempt %d/%d",
                failure,
                url,
                attempt + 1,
                max_retries,
            )
            break

        delay = RETRY_BASE_DELAY * (2**attempt)
        log.warning(
            "%s from %s, retry %d/%d in %.1fs",
            failure,
            url,
            attempt + 1,
            max_retries,
            delay,
        )
        time.sleep(delay)

    openrent_errors_total.labels(type="retry_exhausted").inc()
    log.error("All %d retries exhausted for %s", max_retries, url)
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTML parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _extract_price_from_element(el) -> tuple[int, str] | None:
|
||||
"""Extract price integer from a price element's text like '£2,100'."""
|
||||
if not el:
|
||||
return None
|
||||
text = el.get_text(strip=True)
|
||||
match = re.search(r"£([\d,]+)", text)
|
||||
if not match:
|
||||
return None
|
||||
return int(match.group(1).replace(",", ""))
|
||||
|
||||
|
||||
def _extract_price(text: str) -> tuple[int, str] | None:
|
||||
"""Extract price and frequency from text like '£1,500 pcm' or '£350 pw'.
|
||||
Returns (price_int, frequency) or None.
|
||||
|
||||
OpenRent card text shows both monthly and weekly prices (e.g.
|
||||
'£2,800 per month £646 per week'), so check monthly *before* weekly
|
||||
to match the first (monthly) price that the regex captures."""
|
||||
match = re.search(r"£([\d,]+)", text)
|
||||
if not match:
|
||||
return None
|
||||
price = int(match.group(1).replace(",", ""))
|
||||
lower = text.lower()
|
||||
if "pcm" in lower or "per month" in lower or "/m" in lower:
|
||||
return price, "monthly"
|
||||
if "pw" in lower or "per week" in lower or "/w" in lower:
|
||||
return price, "weekly"
|
||||
if "pa" in lower or "per annum" in lower or "/y" in lower:
|
||||
return price, "yearly"
|
||||
# OpenRent defaults to pcm (per calendar month)
|
||||
return price, "monthly"
|
||||
|
||||
|
||||
def _extract_bedrooms_from_title(title: str) -> int | None:
|
||||
"""Extract bedroom count from title like '2 Bed Flat, Pimlico'."""
|
||||
match = re.search(r"(\d+)\s*bed", title, re.IGNORECASE)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
if re.search(r"\bstudio\b", title, re.IGNORECASE):
|
||||
return 0
|
||||
return None
|
||||
|
||||
|
||||
def _extract_beds_baths_from_features(
|
||||
feature_items: list,
|
||||
) -> tuple[int | None, int | None]:
|
||||
"""Extract bedrooms and bathrooms from feature list items.
|
||||
|
||||
OpenRent search cards have <ul> with items like:
|
||||
<li>1 Bed</li> <li>1 Bath</li> <li>Furnished</li>
|
||||
"""
|
||||
bedrooms = None
|
||||
bathrooms = None
|
||||
for li in feature_items:
|
||||
text = li.get_text(strip=True).lower()
|
||||
bed_match = re.search(r"(\d+)\s*bed", text)
|
||||
if bed_match:
|
||||
bedrooms = int(bed_match.group(1))
|
||||
bath_match = re.search(r"(\d+)\s*bath", text)
|
||||
if bath_match:
|
||||
bathrooms = int(bath_match.group(1))
|
||||
return bedrooms, bathrooms
|
||||
|
||||
|
||||
def _extract_postcode(text: str) -> str | None:
|
||||
"""Extract full UK postcode from text like '2 Bed Flat, Pimlico, SW1V 2AA'.
|
||||
Normalizes to include a space before the 3-char incode."""
|
||||
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
|
||||
if match:
|
||||
raw = match.group(1).upper().strip()
|
||||
# Ensure space before incode (last 3 chars): "IP265AT" → "IP26 5AT"
|
||||
if " " not in raw and len(raw) >= 5:
|
||||
return raw[:-3] + " " + raw[-3:]
|
||||
return raw
|
||||
return None
|
||||
|
||||
|
||||
def _extract_outcode(text: str) -> str | None:
|
||||
"""Extract UK outcode from text like '1 Bed Flat, Bank Chambers, SW1Y'.
|
||||
|
||||
Looks for an outcode pattern (e.g., SW1Y, E1, EC2A) at the end of the text
|
||||
or after the last comma."""
|
||||
# Try after last comma first (most reliable position in OpenRent titles)
|
||||
parts = text.split(",")
|
||||
if len(parts) > 1:
|
||||
last_part = parts[-1].strip()
|
||||
match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last_part, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1).upper()
|
||||
|
||||
# Fall back to searching anywhere in text
|
||||
match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\b", text, re.IGNORECASE)
|
||||
if match:
|
||||
candidate = match.group(1).upper()
|
||||
# Avoid matching things like "1 Bed" → "1B"
|
||||
if len(candidate) >= 2 and not candidate[0].isdigit():
|
||||
return candidate
|
||||
return None
|
||||
|
||||
|
||||
def _infer_property_type(title: str) -> str:
    """Infer the property type from title text.

    Order matters: "Room in a Shared Flat" should be "Room" not "Flat", so the
    room check runs first. "room" must be a whole word, so "2 Bedroom Flat" is
    not mistaken for a room let, while a title ending in "Room" still counts.

    Returns:
        One of "Room", "Studio", "Flat", "Maisonette", "House", "Bungalow",
        or "" when no keyword is present.
    """
    lower = title.lower()
    if re.search(r"\broom\b", lower):
        return "Room"
    # First keyword found wins; tuple order preserves the original precedence.
    keyword_labels = (
        ("studio", "Studio"),
        ("flat", "Flat"),
        ("apartment", "Flat"),
        ("maisonette", "Maisonette"),
        ("house", "House"),
        ("bungalow", "Bungalow"),
    )
    for keyword, label in keyword_labels:
        if keyword in lower:
            return label
    return ""
|
||||
|
||||
|
||||
def parse_search_results(html: str) -> list[dict]:
    """Parse property data from OpenRent search results HTML.

    Returns a list of raw property dicts, one per property card. Every dict
    has "url", "id", "title" and "property_type"; the keys "price",
    "frequency", "bedrooms", "bathrooms", "postcode" or "outcode",
    "description", "lat" and "lng" are present only when they could be
    extracted. Cards without an href or without any usable ID are skipped.
    An empty list is returned (with a warning) when no cards are found.

    Current OpenRent card structure (2026-03):
    <a class="pli search-property-card" href="/property-to-rent/.../ID">
    <div class="or-swiper" data-listing-id="ID">
    <div class="pim"><span class="text-primary">£2,100</span> per month</div>
    <div class="piw"><span class="text-primary">£485</span> per week</div>
    <div class="fw-medium text-primary fs-3">1 Bed Flat, Location, SW1Y</div>
    <ul>...<li>1 Bed</li><li>1 Bath</li><li>Furnished</li>...</ul>
    """
    soup = BeautifulSoup(html, "lxml")
    properties = []

    # Property cards: <a class="pli search-property-card">
    cards = soup.select("a.pli")
    if not cards:
        # Class names changed or missing: fall back to any link to a listing.
        cards = soup.find_all("a", href=re.compile(r"/property-to-rent/"))

    if not cards:
        log.warning(
            "No property cards found in search HTML (%d bytes). "
            "CSS selectors may need updating.",
            len(html),
        )
        return []

    for card in cards:
        prop: dict = {}

        # Extract property URL and ID from href
        href = card.get("href", "")
        if not href:
            continue

        prop["url"] = href if href.startswith("http") else OPENRENT_BASE + href
        # The ID is the numeric path segment that ends the URL path
        # (followed by end of string, a query string, or a fragment).
        id_match = re.search(r"/(\d+)(?:\?|$|#)", href)
        if id_match:
            prop["id"] = id_match.group(1)
        else:
            # Try data-listing-id on the swiper element
            swiper = card.select_one("[data-listing-id]")
            if swiper:
                prop["id"] = swiper["data-listing-id"]
            else:
                continue  # can't use a property without an ID

        # --- Price ---
        # Prefer structured price elements over free-text parsing.
        # Monthly price is in <div class="pim"><span class="text-primary">£X</span>
        pim = card.select_one(".pim .text-primary, .pim span")
        piw = card.select_one(".piw .text-primary, .piw span")

        monthly_price = _extract_price_from_element(pim)
        weekly_price = _extract_price_from_element(piw)

        # Monthly wins when both are present; a price of 0 is treated as absent.
        if monthly_price:
            prop["price"] = monthly_price
            prop["frequency"] = "monthly"
        elif weekly_price:
            prop["price"] = weekly_price
            prop["frequency"] = "weekly"
        else:
            # Fall back to parsing card text
            card_text = card.get_text(" ", strip=True)
            price_result = _extract_price(card_text)
            if price_result:
                prop["price"], prop["frequency"] = price_result

        # --- Title / Address ---
        # The property title is in a div with classes "fw-medium text-primary fs-3"
        # e.g., "1 Bed Flat, Bank Chambers, SW1Y"
        title_el = card.select_one("div.fw-medium.fs-3")
        if not title_el:
            # Fallback: try image alt text which also has the title
            img = card.select_one("img.propertyPic")
            if img and img.get("alt"):
                prop["title"] = img["alt"]
            else:
                # No title source available: leave it empty so the lookups
                # below simply find nothing.
                prop["title"] = ""
        else:
            prop["title"] = title_el.get_text(strip=True)

        # --- Bedrooms / Bathrooms from feature list ---
        feature_list = card.select("ul li")
        beds_from_features, baths_from_features = _extract_beds_baths_from_features(
            feature_list,
        )

        # Bedrooms: prefer feature list, fall back to title parsing
        if beds_from_features is not None:
            prop["bedrooms"] = beds_from_features
        else:
            beds = _extract_bedrooms_from_title(prop.get("title", ""))
            if beds is not None:
                prop["bedrooms"] = beds

        if baths_from_features is not None:
            prop["bathrooms"] = baths_from_features

        # --- Property type from title ---
        title = prop.get("title", "")
        prop["property_type"] = _infer_property_type(title)

        # --- Postcode / outcode from title ---
        # A full postcode is preferred; the outcode is recorded only without one.
        postcode = _extract_postcode(title)
        if postcode:
            prop["postcode"] = postcode
        else:
            outcode = _extract_outcode(title)
            if outcode:
                prop["outcode"] = outcode

        # --- Description snippet ---
        desc_el = card.select_one(".line-clamp-2")
        if desc_el:
            prop["description"] = desc_el.get_text(strip=True)

        # --- Coordinates from data attributes (may not be present on cards) ---
        # Check the card itself first, then descendants; stop at the first
        # element carrying both values, even if they fail to parse as floats.
        for el in [card] + card.select("[data-lat], [data-latitude]"):
            lat = el.get("data-lat") or el.get("data-latitude")
            lng = el.get("data-lng") or el.get("data-longitude") or el.get("data-lon")
            if lat and lng:
                try:
                    prop["lat"] = float(lat)
                    prop["lng"] = float(lng)
                except ValueError:
                    pass
                break

        properties.append(prop)

    log.debug("Parsed %d property cards from search HTML", len(properties))
    return properties
|
||||
|
||||
|
||||
def parse_property_detail(html: str) -> dict:
    """Parse a single property detail page for additional data.

    Returns a dict that may contain "title", "postcode", "lat", "lng",
    "price", "bedrooms", "bathrooms", "property_type", "available_date",
    "furnished" and "description". Only keys that could be extracted are set,
    so an unrecognised page yields an empty dict.

    Current detail page structure (2026-03):
    - <h1> has the full title (e.g., "Room in a Shared House, Lime Tree Court, AL2")
    - <div id="map" data-lat="..." data-lng="..."> has coordinates
    - Tables have "Rent PCM", "Deposit", "Bills Included", etc. (NOT bedrooms)
    - Description in elements with class containing "description"
    """
    soup = BeautifulSoup(html, "lxml")
    details: dict = {}

    # --- Title from h1 ---
    h1 = soup.select_one("h1")
    if h1:
        title_text = h1.get_text(strip=True)
        # Validate it's not a nav/modal element (e.g. "Log in")
        if len(title_text) > 10 and "log in" not in title_text.lower():
            details["title"] = title_text
            postcode = _extract_postcode(title_text)
            if postcode:
                details["postcode"] = postcode

    # --- Coordinates from map element ---
    # The map div has id="map" with data-lat and data-lng
    map_el = soup.select_one("#map[data-lat]")
    if not map_el:
        # Fallback: any element with data-lat (but prefer #map)
        map_el = soup.select_one("[data-lat]")
    if map_el:
        lat = map_el.get("data-lat")
        lng = map_el.get("data-lng") or map_el.get("data-lon")
        if lat and lng:
            try:
                details["lat"] = float(lat)
                details["lng"] = float(lng)
            except ValueError:
                # Unparseable coordinates: leave them unset so the inline-JS
                # fallback further down can still run.
                pass

    # --- Parse tables for rent and property details ---
    # Each row is read as (label, value) from its first two <td> cells; the
    # label is matched by lower-cased keyword and the first matching branch wins.
    for table in soup.select("table"):
        for row in table.select("tr"):
            cells = row.select("td")
            if len(cells) < 2:
                continue
            label = cells[0].get_text(strip=True).lower()
            value = cells[1].get_text(strip=True)

            if "rent" in label and "pcm" in label:
                match = re.search(r"£([\d,]+)", value)
                if match:
                    details["price"] = int(match.group(1).replace(",", ""))
            elif "bedroom" in label:
                match = re.search(r"(\d+)", value)
                if match:
                    details["bedrooms"] = int(match.group(1))
            elif "bathroom" in label:
                match = re.search(r"(\d+)", value)
                if match:
                    details["bathrooms"] = int(match.group(1))
            elif "type" in label and "property" in label:
                details["property_type"] = value
            elif "available" in label or "move" in label:
                details["available_date"] = value
            elif "furnish" in label:
                details["furnished"] = value

    # --- Coordinates from inline JavaScript (last resort) ---
    if "lat" not in details:
        for script in soup.select("script"):
            text = script.string or ""
            lat_match = re.search(r'"latitude"\s*:\s*([\d.-]+)', text)
            lng_match = re.search(r'"longitude"\s*:\s*([\d.-]+)', text)
            if lat_match and lng_match:
                try:
                    details["lat"] = float(lat_match.group(1))
                    details["lng"] = float(lng_match.group(1))
                except ValueError:
                    pass
                # Only the first script carrying both keys is considered.
                break

    # --- Description for floor area ---
    desc_el = soup.select_one(".description, [class*='description'], #description")
    if desc_el:
        details["description"] = desc_el.get_text(strip=True)

    return details
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property type mapping & floor area
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def map_property_type(raw_type: str | None) -> str:
    """Translate an OpenRent property type into the canonical category.

    An exact hit in PROPERTY_TYPE_MAP wins. Otherwise the lower-cased text is
    matched by keyword, most specific first. Missing, empty or unrecognised
    input maps to "Other".
    """
    if not raw_type:
        return "Other"
    mapped = PROPERTY_TYPE_MAP.get(raw_type)
    if mapped:
        return mapped

    text = raw_type.lower()

    def has_any(*needles: str) -> bool:
        return any(needle in text for needle in needles)

    if has_any("room", "shared"):
        return "Other"
    if has_any("flat", "apartment", "maisonette", "studio"):
        return "Flats/Maisonettes"
    # "semi-detached" also contains "detached", so rule out semi first.
    if "semi" in text:
        return "Semi-Detached"
    if "detached" in text:
        return "Detached"
    if has_any("terrace", "mews"):
        return "Terraced"
    if "house" in text:
        return "Detached"
    log.debug("Unknown property type: %r — mapping to Other", raw_type)
    return "Other"
|
||||
|
||||
|
||||
def parse_floor_area(description: str | None) -> float | None:
    """Try to extract a floor area in square metres from description text.

    A figure in square feet ("850 sq ft", "1,200 sq. ft") is preferred and
    converted to square metres; otherwise a figure already in square metres
    ("79 sq m", "79.5 sqm") is used. The number must start with a digit, so
    stray punctuation before the unit (e.g. "Size, sq ft") is ignored instead
    of raising ValueError.

    Returns:
        The area rounded to one decimal place and passed through
        validate_floor_area, or None when the description is missing or holds
        no recognisable figure.
    """
    if not description:
        return None
    m = re.search(r"(\d[\d,]*(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
    if m:
        sqft = float(m.group(1).replace(",", ""))
        # 1 sq ft = 0.092903 sq m
        return validate_floor_area(round(sqft * 0.092903, 1))
    m = re.search(r"(\d[\d,]*(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
    if m:
        return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Transform & search
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _resolve_outcode_postcodes(
    outcode: str,
    pc_coords: dict[str, tuple[float, float]],
) -> list[str]:
    """List every postcode key in *pc_coords* that belongs to *outcode*.

    Keys are expected in ONSPD 7-character form: shorter outcodes are followed
    by a space ("E14 5AB"), while 4-character outcodes run straight into the
    incode ("BH191AB"). The spaced form is tried first; the unspaced form is
    tried only for outcodes of four or more characters that had no spaced hits.
    """
    spaced = f"{outcode} "
    matches = [code for code in pc_coords if code.startswith(spaced)]
    if matches or len(outcode) < 4:
        return matches
    return [
        code
        for code in pc_coords
        if len(code) > len(outcode) and code.startswith(outcode)
    ]
|
||||
|
||||
|
||||
def _parse_or_date(date_str: str) -> str:
    """Convert an OpenRent date string to ISO format (YYYY-MM-DD).

    Understands 'Today' and 'Tomorrow' (any case, relative to the local clock)
    and 'DD Month, YYYY' with or without the comma. Empty input gives "";
    anything unrecognised is returned unchanged.
    """
    from datetime import datetime, timedelta

    if not date_str:
        return ""
    cleaned = date_str.strip()

    day_offset = {"today": 0, "tomorrow": 1}.get(cleaned.lower())
    if day_offset is not None:
        return (datetime.now() + timedelta(days=day_offset)).strftime("%Y-%m-%d")

    # e.g. "01 April, 2026" or "01 April 2026"
    for pattern in ("%d %B, %Y", "%d %B %Y"):
        try:
            parsed = datetime.strptime(cleaned, pattern)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d")
    return date_str  # unparseable: hand the original back untouched
|
||||
|
||||
|
||||
def transform_property(
    search_data: dict,
    detail_data: dict | None,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
) -> dict | None:
    """Transform OpenRent property data into our output schema.

    Merges data from the search results page and (optionally) the detail page;
    detail-page values take precedence. Uses pc_coords (postcode -> lat/lng)
    as a fallback when coordinates are missing but a postcode is available.

    Args:
        search_data: Fields parsed from the search-result card.
        detail_data: Fields parsed from the detail page, or None if it was
            not fetched.
        pc_index: Spatial index used to find the postcode nearest to a point;
            skipped when falsy.
        pc_coords: Postcode -> (lat, lng) lookup.

    Returns:
        The listing as a dict in the output schema, or None when the listing
        has no usable price, its coordinates fall outside the accepted
        bounding box, or no postcode/location can be resolved.
    """
    detail = detail_data or {}

    # Price: detail page takes precedence. A malformed value skips the listing
    # instead of raising and aborting the rest of the outcode.
    raw_price = detail.get("price") or search_data.get("price")
    if not raw_price:
        return None
    try:
        price = int(raw_price)
    except (TypeError, ValueError):
        log.debug("Unparseable price %r — skipping", raw_price)
        return None
    if price <= 0:
        return None

    frequency = search_data.get("frequency", "monthly")

    # Get postcode: detail page > search card
    postcode = detail.get("postcode") or search_data.get("postcode")

    # Coordinates: take lat/lng as a pair from one source (detail first).
    # Longitude is tested with `is not None` because 0.0 (the Greenwich
    # meridian) is a valid value; a falsy latitude still counts as missing
    # since it can never pass the bounds check below.
    lat = lng = None
    for source in (detail, search_data):
        source_lat, source_lng = source.get("lat"), source.get("lng")
        if source_lat and source_lng is not None:
            lat, lng = source_lat, source_lng
            break

    if lat is not None and lng is not None:
        # Validate coordinates are in England
        if not (49 <= lat <= 56 and -7 <= lng <= 2):
            log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
            return None
        if not postcode:
            if pc_index:
                postcode = pc_index.nearest(lat, lng)
            elif search_data.get("outcode"):
                # No spatial index — try outcode lookup as fallback
                outcode_pcs = _resolve_outcode_postcodes(
                    search_data["outcode"],
                    pc_coords,
                )
                if outcode_pcs:
                    postcode = outcode_pcs[0]
    elif postcode:
        # Have postcode but no coordinates — look up its coordinates
        coords = pc_coords.get(postcode)
        if coords:
            lat, lng = coords
        else:
            log.debug("Postcode %s not in arcgis data — skipping", postcode)
            return None
    elif search_data.get("outcode"):
        # Have only outcode — use the first postcode found in that outcode
        # as a rough approximation of the location.
        outcode = search_data["outcode"]
        outcode_postcodes = _resolve_outcode_postcodes(outcode, pc_coords)
        if outcode_postcodes:
            postcode = outcode_postcodes[0]
            lat, lng = pc_coords[postcode]
        else:
            log.debug("No postcodes found for outcode %s — skipping", outcode)
            return None
    else:
        return None

    if not postcode:
        log.debug("No postcode for property — skipping")
        return None

    raw_beds = detail.get("bedrooms") or search_data.get("bedrooms", 0) or 0
    raw_baths = detail.get("bathrooms") or search_data.get("bathrooms", 0) or 0
    # Counts above MAX_BEDROOMS are treated as implausible and reset to 0.
    bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
    if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
        log.warning(
            "OpenRent %s: implausible beds=%d baths=%d (capped to 0)",
            search_data.get("id", "?"), raw_beds, raw_baths,
        )

    # Title: prefer detail page (has h1 with full title)
    title = detail.get("title") or search_data.get("title", "")

    # Address: take the middle part of the title (skip the "N Bed Type" prefix
    # and the outcode suffix). E.g., "1 Bed Flat, Bank Chambers, SW1Y" -> "Bank Chambers"
    address = ""
    if title:
        parts = [p.strip() for p in title.split(",")]
        if len(parts) >= 3:
            # Skip first (type) and last (outcode), join the middle
            address = ", ".join(parts[1:-1])
        elif len(parts) == 2:
            # Could be "Location, OUTCODE" or "Type, Location"
            # If last part looks like an outcode, use the first part
            if re.match(r"^[A-Z]{1,2}\d", parts[-1]):
                address = parts[0]
            else:
                address = parts[1]
        else:
            address = title

    # Property type: prefer detail, then search card, then infer from title
    property_type = detail.get("property_type") or search_data.get("property_type", "")
    if not property_type and title:
        property_type = _infer_property_type(title)

    prop_id = search_data.get("id", "")
    # Build the URL from the id when the card's "url" is absent, empty or None.
    listing_url = search_data.get("url") or (
        f"{OPENRENT_BASE}/{prop_id}" if prop_id else ""
    )
    description = detail.get("description") or search_data.get("description", "")

    return {
        "id": f"or_{prop_id}",
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        "Number of bedrooms & living rooms": bedrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": normalize_postcode(postcode),
        "Address per Property Register": address,
        # OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a
        # property ownership concept that doesn't apply to rental listings. The
        # landlord's tenure is not shown on OpenRent listing pages.
        "Leasehold/Freehold": None,
        "Property type": map_property_type(property_type),
        "Property sub-type": normalize_sub_type(property_type),
        "price": price,
        "price_frequency": frequency,
        "Price qualifier": "",
        "Total floor area (sqm)": parse_floor_area(description),
        "Listing URL": listing_url,
        "Listing features": [],
        "first_visible_date": _parse_or_date(detail.get("available_date", "")),
    }
|
||||
|
||||
|
||||
def search_outcode(
    client: Session,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
    fetch_details: bool = True,
) -> list[dict]:
    """Scrape OpenRent rental listings for a single outcode.

    Steps:
      1. Fetch the search results page for the outcode.
      2. Parse the property cards out of the HTML.
      3. For a card that has a URL but neither coordinates nor a postcode
         present in ``pc_coords``, fetch and parse its detail page (only
         when ``fetch_details`` is true).
      4. Transform each card (plus detail data, if any) to the common output
         schema and count it on the scraped-properties metric.

    Returns the transformed listings; an empty list when the search page
    cannot be fetched or yields no cards.
    """
    page_html = fetch_page(
        client,
        f"{OPENRENT_BASE}/properties-to-rent/?term={outcode}&isLive=true",
    )
    if not page_html:
        return []

    cards = parse_search_results(page_html)
    if not cards:
        return []

    listings = []
    for card in cards:
        card_postcode = card.get("postcode")
        located = card.get("lat") is not None and card.get("lng") is not None
        resolvable = card_postcode and pc_coords and card_postcode in pc_coords

        detail = None
        # The detail page is only worth a request when the card cannot be placed.
        if fetch_details and card.get("url") and not (located or resolvable):
            detail_page = fetch_page(client, card["url"])
            if detail_page:
                detail = parse_property_detail(detail_page)
            # Shorter delay for detail pages (within same outcode)
            time.sleep(0.15)

        record = transform_property(card, detail, pc_index, pc_coords)
        if record:
            listings.append(record)
            openrent_properties_scraped.labels(channel="rent").inc()

    return listings
|
||||
|
|
@ -3,15 +3,10 @@ name = "finder"
|
|||
version = "0.1.0"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"flask",
|
||||
"httpx",
|
||||
"curl_cffi",
|
||||
"polars",
|
||||
"fake-useragent>=2.2.0",
|
||||
"prometheus-client",
|
||||
"beautifulsoup4",
|
||||
"lxml",
|
||||
"playwright>=1.58.0",
|
||||
"playwright-stealth>=2.0.2",
|
||||
"camoufox>=0.4.11",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -58,6 +58,7 @@ def _paginate(
|
|||
channel_cfg: dict,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
extra_params: dict | None = None,
|
||||
max_properties: int | None = None,
|
||||
) -> tuple[list[dict], int]:
|
||||
"""Paginate through search results. Returns (properties, result_count)."""
|
||||
properties = []
|
||||
|
|
@ -94,6 +95,8 @@ def _paginate(
|
|||
transformed = transform_property(prop, outcode, pc_index)
|
||||
if transformed:
|
||||
properties.append(transformed)
|
||||
if max_properties is not None and len(properties) >= max_properties:
|
||||
return properties, result_count
|
||||
|
||||
# Check if there are more pages
|
||||
result_count_str = data.get("resultCount", "0")
|
||||
|
|
@ -114,6 +117,7 @@ def search_outcode(
|
|||
outcode: str,
|
||||
channel_cfg: dict,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
max_properties: int | None = None,
|
||||
) -> list[dict]:
|
||||
"""Paginate through search results for one outcode+channel. Returns transformed properties.
|
||||
|
||||
|
|
@ -121,9 +125,12 @@ def search_outcode(
|
|||
re-queries per property type to recover listings beyond the cap.
|
||||
"""
|
||||
properties, result_count = _paginate(
|
||||
client, outcode_id, outcode, channel_cfg, pc_index
|
||||
client, outcode_id, outcode, channel_cfg, pc_index, max_properties=max_properties
|
||||
)
|
||||
|
||||
if max_properties is not None and len(properties) >= max_properties:
|
||||
return properties[:max_properties]
|
||||
|
||||
if result_count <= _MAX_INDEX:
|
||||
return properties
|
||||
|
||||
|
|
@ -140,17 +147,28 @@ def search_outcode(
|
|||
pt_props, _ = _paginate(
|
||||
client, outcode_id, outcode, channel_cfg, pc_index,
|
||||
extra_params={"propertyTypes": pt},
|
||||
max_properties=max_properties,
|
||||
)
|
||||
new = 0
|
||||
for p in pt_props:
|
||||
if p["id"] not in all_by_id:
|
||||
all_by_id[p["id"]] = p
|
||||
new += 1
|
||||
if (
|
||||
max_properties is not None
|
||||
and len(all_by_id) >= max_properties
|
||||
):
|
||||
break
|
||||
if new:
|
||||
log.debug("%s/%s type=%s: +%d new properties", outcode, ch, pt, new)
|
||||
if max_properties is not None and len(all_by_id) >= max_properties:
|
||||
break
|
||||
|
||||
log.info(
|
||||
"%s/%s: type split recovered %d → %d properties",
|
||||
outcode, ch, len(properties), len(all_by_id),
|
||||
)
|
||||
return list(all_by_id.values())
|
||||
properties = list(all_by_id.values())
|
||||
if max_properties is not None:
|
||||
return properties[:max_properties]
|
||||
return properties
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1,52 +0,0 @@
|
|||
The API works as follows. First, look up the outcode you want (e.g. E11) by calling https://los.rightmove.co.uk/typeahead?query=E11&limit=10&exclude=STREET, which returns something like:
|
||||
|
||||
{
|
||||
"matches": [
|
||||
{
|
||||
"id": "746",
|
||||
"type": "OUTCODE",
|
||||
"displayName": "E11",
|
||||
"highlighting": "<span class='highlightLetter'>E11</span>",
|
||||
"highlights": [
|
||||
{
|
||||
"text": "E11",
|
||||
"highlighted": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "749",
|
||||
"type": "OUTCODE",
|
||||
"displayName": "E14",
|
||||
"highlighting": "displayName",
|
||||
"highlights": []
|
||||
},
|
||||
{
|
||||
"id": "752",
|
||||
"type": "OUTCODE",
|
||||
"displayName": "E17",
|
||||
"highlighting": "displayName",
|
||||
"highlights": []
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
Find the object that has "type": "OUTCODE" and a displayName matching the outcode we searched for (E11 in this case); its id is 746. Then call the search endpoint with that id, and it will return the properties for that outcode:
|
||||
|
||||
https://www.rightmove.co.uk/api/property-search/listing/search?useLocationIdentifier=true&locationIdentifier=OUTCODE%5E746&buy=For+sale&_includeSSTC=on&index=0&sortType=2&channel=BUY&transactionType=BUY&displayLocationIdentifier=E12.html
|
||||
|
||||
You can see the example response to this at [[buy.json]]
|
||||
|
||||
You must set locationIdentifier=OUTCODE%5E{id}, where {id} is the id found above (746 in this case), giving locationIdentifier=OUTCODE%5E746. Paging works by increasing index by the number of results per page, which is 24: the next page is index=24, then index=48, etc.
|
||||
|
||||
|
||||
The rental endpoint works similarly:
|
||||
|
||||
https://www.rightmove.co.uk/api/property-search/listing/search?locationIdentifier=OUTCODE%5E745&index=0&sortType=6&channel=RENT&transactionType=LETTING&displayLocationIdentifier=E16.html
|
||||
|
||||
https://www.rightmove.co.uk/api/property-search/listing/search?locationIdentifier=OUTCODE%5E752&index=48&sortType=6&channel=RENT&transactionType=LETTING&displayLocationIdentifier=E17.html
|
||||
|
||||
|
||||
See a response example for the rental endpoint at [[rent.json]]
|
||||
|
||||
File diff suppressed because it is too large
Load diff
1337
finder/scraper.py
1337
finder/scraper.py
File diff suppressed because it is too large
Load diff
|
|
@ -4,17 +4,14 @@ from pathlib import Path
|
|||
|
||||
import polars as pl
|
||||
|
||||
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
|
||||
from transform import map_property_type, normalize_postcode, normalize_price
|
||||
from constants import MAX_BEDROOMS
|
||||
from transform import map_property_type, normalize_postcode
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
||||
"""Write properties list to parquet with server-ready column names.
|
||||
|
||||
channel: "buy" or "rent"
|
||||
"""
|
||||
def write_parquet(properties: list[dict], path: Path) -> None:
|
||||
"""Write sale properties list to parquet with server-ready column names."""
|
||||
if not properties:
|
||||
log.warning("No properties to write to %s", path)
|
||||
return
|
||||
|
|
@ -69,7 +66,7 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
|
||||
listing_dates.append(dt)
|
||||
except (ValueError, TypeError):
|
||||
# Try additional date formats (OpenRent: "DD Month, YYYY", "Today")
|
||||
# Try additional date formats used by scraped listing sources.
|
||||
parsed = None
|
||||
stripped = fvd.strip()
|
||||
lower = stripped.lower()
|
||||
|
|
@ -93,35 +90,9 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
else:
|
||||
listing_dates.append(None)
|
||||
|
||||
# Derive asking price / asking rent based on channel
|
||||
# Zero prices indicate parsing failures or POA/auction listings — treat as null
|
||||
if channel == "buy":
|
||||
asking_prices = [p["price"] if p["price"] > 0 else None for p in properties]
|
||||
asking_rents = [None] * len(properties)
|
||||
listing_statuses = ["For sale"] * len(properties)
|
||||
else:
|
||||
asking_prices = [None] * len(properties)
|
||||
# Normalize to monthly, then apply sanity bounds. Rents outside
|
||||
# [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always total-stay
|
||||
# pricing (short lets), annual rents mislabelled as monthly, or £0
|
||||
# placeholders — null them out rather than polluting aggregates.
|
||||
rent_outliers = 0
|
||||
asking_rents = []
|
||||
for p in properties:
|
||||
monthly = normalize_price(p["price"], p["price_frequency"])
|
||||
if monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY:
|
||||
rent_outliers += 1
|
||||
asking_rents.append(None)
|
||||
else:
|
||||
asking_rents.append(monthly)
|
||||
if rent_outliers:
|
||||
log.warning(
|
||||
"Nulled %d rent outliers outside [£%d, £%d]/month",
|
||||
rent_outliers,
|
||||
MIN_RENT_MONTHLY,
|
||||
MAX_RENT_MONTHLY,
|
||||
)
|
||||
listing_statuses = ["For rent"] * len(properties)
|
||||
asking_prices = [p["price"] if p["price"] > 0 else None for p in properties]
|
||||
listing_statuses = ["For sale"] * len(properties)
|
||||
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
|
|
@ -146,7 +117,6 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
"Listing date": listing_dates,
|
||||
"Listing status": listing_statuses,
|
||||
"Asking price": asking_prices,
|
||||
"Asking rent (monthly)": asking_rents,
|
||||
},
|
||||
schema={
|
||||
"Bedrooms": pl.Int32,
|
||||
|
|
@ -166,18 +136,15 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
"Listing date": pl.Datetime("us"),
|
||||
"Listing status": pl.Utf8,
|
||||
"Asking price": pl.Int64,
|
||||
"Asking rent (monthly)": pl.Int64,
|
||||
},
|
||||
)
|
||||
|
||||
# Derive asking price per sqm for buy listings
|
||||
if channel == "buy":
|
||||
df = df.with_columns(
|
||||
(pl.col("Asking price") / pl.col("Total floor area (sqm)"))
|
||||
.round(0)
|
||||
.cast(pl.Int32, strict=False)
|
||||
.alias("Asking price per sqm"),
|
||||
)
|
||||
df = df.with_columns(
|
||||
(pl.col("Asking price") / pl.col("Total floor area (sqm)"))
|
||||
.round(0)
|
||||
.cast(pl.Int32, strict=False)
|
||||
.alias("Asking price per sqm"),
|
||||
)
|
||||
|
||||
df.write_parquet(path)
|
||||
log.info("Wrote %d properties to %s", len(df), path)
|
||||
|
|
|
|||
|
|
@ -143,15 +143,6 @@ def normalize_postcode(postcode: str) -> str:
|
|||
return compact[:-3] + " " + compact[-3:]
|
||||
|
||||
|
||||
def normalize_price(amount: int, frequency: str) -> int:
|
||||
"""Normalise price to monthly for rentals (weekly × 52/12, yearly ÷ 12)."""
|
||||
if frequency == "weekly":
|
||||
return round(amount * 52 / 12)
|
||||
if frequency == "yearly":
|
||||
return round(amount / 12)
|
||||
return amount
|
||||
|
||||
|
||||
def transform_property(
|
||||
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
|
||||
) -> dict | None:
|
||||
|
|
@ -170,8 +161,6 @@ def transform_property(
|
|||
amount = price_obj.get("amount")
|
||||
if not amount:
|
||||
return None
|
||||
frequency = price_obj.get("frequency", "")
|
||||
# Store raw price — normalization to monthly happens once in storage.py
|
||||
price = int(amount)
|
||||
if price <= 0:
|
||||
return None
|
||||
|
|
@ -221,7 +210,7 @@ def transform_property(
|
|||
"Property type": map_property_type(sub_type),
|
||||
"Property sub-type": normalize_sub_type(sub_type),
|
||||
"price": price,
|
||||
"price_frequency": frequency,
|
||||
"price_frequency": "",
|
||||
"Price qualifier": price_qualifier,
|
||||
"Total floor area (sqm)": parse_display_size(prop.get("displaySize")),
|
||||
"Listing URL": RIGHTMOVE_BASE + prop.get("propertyUrl", ""),
|
||||
|
|
|
|||
175
finder/uv.lock
generated
175
finder/uv.lock
generated
|
|
@ -24,28 +24,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/af/38/9483eb52fc0f00039c684af627f8a8f994a8a99e8eceb869ba93b3fd740b/apify_fingerprint_datapoints-0.11.0-py3-none-any.whl", hash = "sha256:333340ccc3e520f19b5561e95d7abe2b31702e61d34b6247b328c9b8c93fbe1d", size = 726498, upload-time = "2026-03-01T01:00:03.103Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.14.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "soupsieve" },
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "blinker"
|
||||
version = "1.9.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "browserforge"
|
||||
version = "1.2.4"
|
||||
|
|
@ -295,49 +273,22 @@ name = "finder"
|
|||
version = "0.1.0"
|
||||
source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "camoufox" },
|
||||
{ name = "curl-cffi" },
|
||||
{ name = "fake-useragent" },
|
||||
{ name = "flask" },
|
||||
{ name = "httpx" },
|
||||
{ name = "lxml" },
|
||||
{ name = "playwright" },
|
||||
{ name = "playwright-stealth" },
|
||||
{ name = "polars" },
|
||||
{ name = "prometheus-client" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "camoufox", specifier = ">=0.4.11" },
|
||||
{ name = "curl-cffi" },
|
||||
{ name = "fake-useragent", specifier = ">=2.2.0" },
|
||||
{ name = "flask" },
|
||||
{ name = "httpx" },
|
||||
{ name = "lxml" },
|
||||
{ name = "playwright", specifier = ">=1.58.0" },
|
||||
{ name = "playwright-stealth", specifier = ">=2.0.2" },
|
||||
{ name = "polars" },
|
||||
{ name = "prometheus-client" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flask"
|
||||
version = "3.1.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "blinker" },
|
||||
{ name = "click" },
|
||||
{ name = "itsdangerous" },
|
||||
{ name = "jinja2" },
|
||||
{ name = "markupsafe" },
|
||||
{ name = "werkzeug" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004, upload-time = "2026-02-19T05:00:57.678Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424, upload-time = "2026-02-19T05:00:56.027Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -429,27 +380,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itsdangerous"
|
||||
version = "2.2.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jinja2"
|
||||
version = "3.1.6"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "markupsafe" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "language-tags"
|
||||
version = "1.2.0"
|
||||
|
|
@ -539,69 +469,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/92/aa/df863bcc39c5e0946263454aba394de8a9084dbaff8ad143846b0d844739/lxml-6.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:bb4c1847b303835d89d785a18801a883436cdfd5dc3d62947f9c49e24f0f5a2c", size = 3822205, upload-time = "2025-09-22T04:03:36.249Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "markupsafe"
|
||||
version = "3.0.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622, upload-time = "2025-09-27T18:36:41.777Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029, upload-time = "2025-09-27T18:36:43.257Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374, upload-time = "2025-09-27T18:36:44.508Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980, upload-time = "2025-09-27T18:36:45.385Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990, upload-time = "2025-09-27T18:36:46.916Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784, upload-time = "2025-09-27T18:36:47.884Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588, upload-time = "2025-09-27T18:36:48.82Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041, upload-time = "2025-09-27T18:36:49.797Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543, upload-time = "2025-09-27T18:36:51.584Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113, upload-time = "2025-09-27T18:36:52.537Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911, upload-time = "2025-09-27T18:36:53.513Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658, upload-time = "2025-09-27T18:36:54.819Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066, upload-time = "2025-09-27T18:36:55.714Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639, upload-time = "2025-09-27T18:36:56.908Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569, upload-time = "2025-09-27T18:36:57.913Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284, upload-time = "2025-09-27T18:36:58.833Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801, upload-time = "2025-09-27T18:36:59.739Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769, upload-time = "2025-09-27T18:37:00.719Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642, upload-time = "2025-09-27T18:37:01.673Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "numpy"
|
||||
version = "2.4.3"
|
||||
|
|
@ -744,18 +611,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/c8/c4/cc0229fea55c87d6c9c67fe44a21e2cd28d1d558a5478ed4d617e9fb0c93/playwright-1.58.0-py3-none-win_arm64.whl", hash = "sha256:32ffe5c303901a13a0ecab91d1c3f74baf73b84f4bedbb6b935f5bc11cc98e1b", size = 33085919, upload-time = "2026-01-30T15:09:45.71Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "playwright-stealth"
|
||||
version = "2.0.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "playwright" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/61/ee/871901103c7b2a12070011fd4d978191f8f962837bf8bb51847274f528fa/playwright_stealth-2.0.2.tar.gz", hash = "sha256:ac57e51873190da5e653e03720e948c8f0a3d06b098f1d56763103d23ee48143", size = 24902, upload-time = "2026-02-13T02:36:25.137Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f1/30/f95f087f4b071611a7f63a2a0c9af4df3ac046dae2a693bfdacd70512867/playwright_stealth-2.0.2-py3-none-any.whl", hash = "sha256:37a5733f481b9c0ad602cf71491aa5a7c96c2a2fe4fa1e7ab764d2cd35520f2f", size = 33209, upload-time = "2026-02-13T02:36:26.334Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "polars"
|
||||
version = "1.39.0"
|
||||
|
|
@ -784,15 +639,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/b3/eb/936f5eeae196e8c8aaabe5f7d98891be8a5bbc741d50ce5c60f55575ad29/polars_runtime_32-1.39.0-cp310-abi3-win_arm64.whl", hash = "sha256:d69abde5f148566860bbe910010847bd7791e72f7c8063a4d2c462246a33a72a", size = 41885761, upload-time = "2026-03-12T14:23:16.773Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prometheus-client"
|
||||
version = "0.24.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f0/58/a794d23feb6b00fc0c72787d7e87d872a6730dd9ed7c7b3e954637d8f280/prometheus_client-0.24.1.tar.gz", hash = "sha256:7e0ced7fbbd40f7b84962d5d2ab6f17ef88a72504dcf7c0b40737b43b2a461f9", size = 85616, upload-time = "2026-01-14T15:26:26.965Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/74/c3/24a2f845e3917201628ecaba4f18bab4d18a337834c1df2a159ee9d22a42/prometheus_client-0.24.1-py3-none-any.whl", hash = "sha256:150db128af71a5c2482b36e588fc8a6b95e498750da4b17065947c16070f4055", size = 64057, upload-time = "2026-01-14T15:26:24.42Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pycparser"
|
||||
version = "3.0"
|
||||
|
|
@ -926,15 +772,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/6e/bf/c5205d480307bef660e56544b9e3d7ff687da776abb30c9cb3f330887570/screeninfo-0.8.1-py3-none-any.whl", hash = "sha256:e97d6b173856edcfa3bd282f81deb528188aff14b11ec3e195584e7641be733c", size = 12907, upload-time = "2022-09-09T11:35:21.351Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "soupsieve"
|
||||
version = "2.8.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/7b/ae/2d9c981590ed9999a0d91755b47fc74f74de286b0f5cee14c9269041e6c4/soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349", size = 118627, upload-time = "2026-01-20T04:27:02.457Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tqdm"
|
||||
version = "4.67.3"
|
||||
|
|
@ -984,15 +821,3 @@ sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6
|
|||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "3.1.6"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "markupsafe" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/61/f1/ee81806690a87dab5f5653c1f146c92bc066d7f4cebc603ef88eb9e13957/werkzeug-3.1.6.tar.gz", hash = "sha256:210c6bede5a420a913956b4791a7f4d6843a43b6fcee4dfa08a65e93007d0d25", size = 864736, upload-time = "2026-02-19T15:17:18.884Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/4d/ec/d58832f89ede95652fd01f4f24236af7d32b70cab2196dfcc2d2fd13c5c2/werkzeug-3.1.6-py3-none-any.whl", hash = "sha256:7ddf3357bb9564e407607f988f683d72038551200c704012bb9a4c523d42f131", size = 225166, upload-time = "2026-02-19T15:17:17.475Z" },
|
||||
]
|
||||
|
|
|
|||
283
finder/zoopla.py
283
finder/zoopla.py
|
|
@ -1,4 +1,4 @@
|
|||
"""Zoopla (zoopla.co.uk) scraper — buy and rental properties.
|
||||
"""Zoopla (zoopla.co.uk) scraper — sale properties.
|
||||
|
||||
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
|
||||
blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
|
||||
|
|
@ -6,18 +6,14 @@ patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
|
|||
|
||||
Zoopla uses Next.js App Router with React Server Components (RSC). Search
|
||||
result data is server-rendered in an RSC stream, not available via
|
||||
__NEXT_DATA__ or a JSON API. URL-based location slugs return 0 results —
|
||||
the working flow requires typing into the autocomplete input, selecting a
|
||||
suggestion, and clicking Search.
|
||||
__NEXT_DATA__ or a JSON API.
|
||||
|
||||
Architecture:
|
||||
Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps
|
||||
a single Camoufox browser alive for the entire scrape. For each outcode, it:
|
||||
1. Clears and types the outcode into the search input
|
||||
2. Selects the first autocomplete suggestion
|
||||
3. Clicks Search
|
||||
4. Extracts listing data from the rendered DOM
|
||||
5. Handles pagination via ?pn=N parameter
|
||||
1. Navigates directly to the sale search URL
|
||||
2. Extracts listing data from the rendered DOM
|
||||
3. Handles pagination via ?pn=N parameter
|
||||
|
||||
The browser session replaces the cookie/client pattern used by other scrapers.
|
||||
"""
|
||||
|
|
@ -27,7 +23,6 @@ import re
|
|||
import time
|
||||
|
||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_sub_type, validate_floor_area
|
||||
|
||||
|
|
@ -38,6 +33,25 @@ class TurnstileError(Exception):
|
|||
"""Raised when Cloudflare Turnstile challenge cannot be passed."""
|
||||
|
||||
|
||||
class _ManagedCamoufoxBrowser:
|
||||
def __init__(self, context_manager, browser):
|
||||
self._context_manager = context_manager
|
||||
self._browser = browser
|
||||
self._closed = False
|
||||
|
||||
def close(self) -> None:
|
||||
if self._closed:
|
||||
return
|
||||
self._closed = True
|
||||
try:
|
||||
self._browser.close()
|
||||
finally:
|
||||
self._context_manager.__exit__(None, None, None)
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._browser, name)
|
||||
|
||||
|
||||
# Maximum search result pages to scrape per outcode (25 listings/page)
|
||||
MAX_PAGES_PER_OUTCODE = 40
|
||||
|
||||
|
|
@ -55,7 +69,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
|
||||
for (const card of listingCards) {
|
||||
const link = card.querySelector(
|
||||
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
|
||||
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
|
||||
);
|
||||
if (!link) continue;
|
||||
|
||||
|
|
@ -100,9 +114,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
|
||||
// Extract property type (e.g., "2 bed flat for sale" → "flat")
|
||||
let property_type = '';
|
||||
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
|
||||
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
|
||||
if (ptMatch) property_type = ptMatch[1].trim();
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';
|
||||
|
||||
// Keyword fallback when regex doesn't match current DOM format
|
||||
if (!property_type) {
|
||||
|
|
@ -135,7 +149,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
// Strategy 2: Fall back to href-based link matching with parent-walking
|
||||
if (results.length === 0) {
|
||||
const links = Array.from(document.querySelectorAll(
|
||||
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
|
||||
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]'
|
||||
));
|
||||
|
||||
for (const link of links) {
|
||||
|
|
@ -184,9 +198,9 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
|
||||
// Extract property type
|
||||
let property_type = '';
|
||||
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
|
||||
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+for\s+sale/i);
|
||||
if (ptMatch2) property_type = ptMatch2[1].trim();
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+for\s+sale/i.test(text)) property_type = 'Studio';
|
||||
|
||||
// Keyword fallback when regex doesn't match current DOM format
|
||||
if (!property_type) {
|
||||
|
|
@ -243,17 +257,20 @@ def launch_browser():
|
|||
"""Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile,
|
||||
and dismiss cookie consent. Returns (browser, page) tuple.
|
||||
|
||||
Raises TurnstileError if Cloudflare cannot be passed within 60 seconds.
|
||||
Raises TurnstileError if Cloudflare cannot be passed within two minutes.
|
||||
Caller must close browser when done."""
|
||||
from camoufox.pkgman import camoufox_path
|
||||
|
||||
# Verify camoufox is pre-installed — never download at runtime
|
||||
camoufox_path(download_if_missing=False)
|
||||
# Standalone local runs should not require the old container image to have
|
||||
# pre-fetched Camoufox.
|
||||
camoufox_path(download_if_missing=True)
|
||||
|
||||
from camoufox.sync_api import Camoufox
|
||||
|
||||
log.info("Launching Camoufox browser for Zoopla...")
|
||||
browser = Camoufox(headless=True).__enter__()
|
||||
camoufox = Camoufox(headless=True)
|
||||
raw_browser = camoufox.__enter__()
|
||||
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
|
||||
page = browser.new_page()
|
||||
|
||||
log.info("Navigating to Zoopla homepage...")
|
||||
|
|
@ -261,7 +278,7 @@ def launch_browser():
|
|||
|
||||
# Wait for Cloudflare Turnstile to resolve.
|
||||
# Try clicking the Turnstile checkbox if present (helps in some cases).
|
||||
for i in range(20):
|
||||
for i in range(40):
|
||||
if "Just a moment" not in page.title():
|
||||
break
|
||||
# Attempt to click the Turnstile checkbox in the challenge iframe
|
||||
|
|
@ -280,7 +297,7 @@ def launch_browser():
|
|||
else:
|
||||
page.close()
|
||||
browser.close()
|
||||
raise TurnstileError("Cloudflare Turnstile did not resolve after 60s")
|
||||
raise TurnstileError("Cloudflare Turnstile did not resolve after 120s")
|
||||
|
||||
log.info("Cloudflare passed — title: %s", page.title())
|
||||
time.sleep(2)
|
||||
|
|
@ -298,13 +315,13 @@ def _ensure_not_challenged(page) -> None:
|
|||
return
|
||||
|
||||
log.warning("Cloudflare challenge detected mid-session, waiting...")
|
||||
for i in range(20):
|
||||
for i in range(40):
|
||||
time.sleep(3)
|
||||
if "Just a moment" not in page.title():
|
||||
log.info("Cloudflare challenge resolved")
|
||||
return
|
||||
|
||||
raise TurnstileError("Cloudflare re-challenge did not resolve")
|
||||
raise TurnstileError("Cloudflare re-challenge did not resolve after 120s")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -312,21 +329,8 @@ def _ensure_not_challenged(page) -> None:
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _navigate_direct(page, url: str) -> bool:
|
||||
"""Navigate directly to a Zoopla search URL (skipping the homepage flow).
|
||||
|
||||
Used to load the second channel (e.g., RENT after BUY) for the same outcode
|
||||
by swapping the path component. Falls back gracefully — returns False if
|
||||
the page has no listings, so the caller can retry via the full search flow.
|
||||
"""
|
||||
try:
|
||||
page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||
except Exception as e:
|
||||
log.debug("Direct navigation failed: %s", e)
|
||||
return False
|
||||
_ensure_not_challenged(page)
|
||||
|
||||
# Wait for listing content to hydrate
|
||||
def _wait_for_listing_content(page) -> None:
|
||||
"""Wait for rendered listing cards to contain usable text."""
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"""() => {
|
||||
|
|
@ -343,100 +347,42 @@ def _navigate_direct(page, url: str) -> bool:
|
|||
timeout=8000,
|
||||
)
|
||||
except Exception:
|
||||
# Check if the page has any listings at all
|
||||
has_listings = page.query_selector('a[href*="/details/"]')
|
||||
if not has_listings:
|
||||
return False
|
||||
time.sleep(1.5)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _navigate_search(page, outcode: str, channel: str) -> bool:
|
||||
"""Navigate to search results for an outcode via the homepage search flow.
|
||||
def _navigate_search(page, outcode: str) -> bool:
|
||||
"""Navigate directly to sale search results for an outcode.
|
||||
|
||||
Returns True if results were found, False if no results or navigation failed.
|
||||
Raises TurnstileError if Cloudflare blocks us."""
|
||||
# Navigate to homepage to reset search state
|
||||
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
|
||||
time.sleep(0.5)
|
||||
url = (
|
||||
f"{ZOOPLA_BASE}/for-sale/property/{outcode.lower()}/"
|
||||
f"?q={outcode}&search_source=home"
|
||||
)
|
||||
try:
|
||||
page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||
except Exception as exc:
|
||||
log.debug("Zoopla direct navigation failed for %s: %s", outcode, exc)
|
||||
return False
|
||||
|
||||
_ensure_not_challenged(page)
|
||||
|
||||
# Dismiss cookie consent (may reappear after navigation)
|
||||
page.evaluate(_DISMISS_COOKIES_JS)
|
||||
time.sleep(0.3)
|
||||
try:
|
||||
page.evaluate(_DISMISS_COOKIES_JS)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Select Buy/Rent tab
|
||||
if channel == "RENT":
|
||||
rent_tab = page.query_selector(
|
||||
'button:has-text("Rent"), [role="tab"]:has-text("Rent")'
|
||||
)
|
||||
if rent_tab:
|
||||
rent_tab.click()
|
||||
time.sleep(0.2)
|
||||
|
||||
# Find and fill search input
|
||||
search_input = page.query_selector(
|
||||
'input[name="autosuggest-input"]'
|
||||
) or page.query_selector('input[type="text"]')
|
||||
if not search_input:
|
||||
log.warning("Could not find search input on homepage")
|
||||
return False
|
||||
|
||||
search_input.click()
|
||||
time.sleep(0.1)
|
||||
search_input.fill("")
|
||||
search_input.type(outcode, delay=60)
|
||||
time.sleep(1.2)
|
||||
|
||||
# Select first autocomplete suggestion
|
||||
first_option = page.query_selector('[role="option"]')
|
||||
if not first_option:
|
||||
log.debug("No autocomplete suggestions for outcode %s", outcode)
|
||||
return False
|
||||
|
||||
first_option.click()
|
||||
time.sleep(0.2)
|
||||
|
||||
# Click search button
|
||||
search_btn = page.query_selector('button:has-text("Search")')
|
||||
if search_btn:
|
||||
search_btn.click()
|
||||
else:
|
||||
search_input.press("Enter")
|
||||
|
||||
# Wait for results to load — try waiting for listings container, fall back to fixed wait
|
||||
try:
|
||||
page.wait_for_selector(
|
||||
'[data-testid="regular-listings"], a[href*="/details/"]',
|
||||
'[data-testid="regular-listings"], a[href*="/for-sale/details/"], a[href*="/new-homes/details/"]',
|
||||
timeout=10000,
|
||||
)
|
||||
except Exception:
|
||||
time.sleep(4)
|
||||
_ensure_not_challenged(page)
|
||||
if not page.query_selector('a[href*="/details/"]'):
|
||||
return False
|
||||
|
||||
# Wait for client-side hydration to populate listing content (prices, addresses).
|
||||
# The structural container appears in server-rendered HTML before React hydrates
|
||||
# the actual card content — extracting too early yields empty price/address fields.
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"""() => {
|
||||
const cards = document.querySelectorAll(
|
||||
'[data-testid="regular-listings"] > div'
|
||||
);
|
||||
if (cards.length === 0) return false;
|
||||
for (const card of cards) {
|
||||
const t = card.innerText || '';
|
||||
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||
}
|
||||
return false;
|
||||
}""",
|
||||
timeout=8000,
|
||||
)
|
||||
except Exception:
|
||||
# Content never appeared — extraction will likely fail but let it try
|
||||
log.debug("Listing content hydration wait timed out — prices may not have rendered")
|
||||
time.sleep(2)
|
||||
_wait_for_listing_content(page)
|
||||
|
||||
return True
|
||||
|
||||
|
|
@ -516,18 +462,21 @@ def _extract_listings(page) -> list[dict]:
|
|||
return listings
|
||||
except Exception as e:
|
||||
log.warning("Failed to extract listings from DOM: %s", e)
|
||||
zoopla_errors_total.labels(type="extract_failed").inc()
|
||||
return []
|
||||
|
||||
|
||||
def _paginate(page, total_results: int, channel: str) -> list[dict]:
|
||||
def _paginate(
|
||||
page,
|
||||
total_results: int,
|
||||
max_properties: int | None = None,
|
||||
) -> list[dict]:
|
||||
"""Extract listings from all pages of search results.
|
||||
|
||||
Page 1 is already loaded. For subsequent pages, clicks the Next button
|
||||
or navigates via URL parameter ?pn=N."""
|
||||
all_listings = _extract_listings(page)
|
||||
channel_label = "buy" if channel == "BUY" else "rent"
|
||||
zoopla_pages_scraped.labels(channel=channel_label).inc()
|
||||
if max_properties is not None and len(all_listings) >= max_properties:
|
||||
return all_listings[:max_properties]
|
||||
|
||||
if not all_listings or total_results <= len(all_listings):
|
||||
return all_listings
|
||||
|
|
@ -550,24 +499,7 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
|
|||
try:
|
||||
page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
|
||||
_ensure_not_challenged(page)
|
||||
# Wait for listing content instead of fixed sleep
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"""() => {
|
||||
const cards = document.querySelectorAll(
|
||||
'[data-testid="regular-listings"] > div'
|
||||
);
|
||||
if (cards.length === 0) return false;
|
||||
for (const card of cards) {
|
||||
const t = card.innerText || '';
|
||||
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||
}
|
||||
return false;
|
||||
}""",
|
||||
timeout=8000,
|
||||
)
|
||||
except Exception:
|
||||
time.sleep(1.5)
|
||||
_wait_for_listing_content(page)
|
||||
except TurnstileError:
|
||||
raise
|
||||
except Exception as e:
|
||||
|
|
@ -585,8 +517,8 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
|
|||
seen_ids.add(listing["id"])
|
||||
all_listings.append(listing)
|
||||
new_count += 1
|
||||
|
||||
zoopla_pages_scraped.labels(channel=channel_label).inc()
|
||||
if max_properties is not None and len(all_listings) >= max_properties:
|
||||
return all_listings[:max_properties]
|
||||
|
||||
if new_count == 0:
|
||||
break # No new listings on this page
|
||||
|
|
@ -692,31 +624,8 @@ def _map_property_type(raw_type: str | None) -> str:
|
|||
return "Other"
|
||||
|
||||
|
||||
def _detect_rent_frequency(price_text: str) -> str:
|
||||
"""Detect rent frequency from Zoopla price text.
|
||||
|
||||
Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
|
||||
'£18,000 pa'. Defaults to 'monthly' if no frequency indicator found.
|
||||
|
||||
Checks monthly indicators (pcm) BEFORE weekly (pw) because Zoopla cards
|
||||
often display both monthly and weekly prices in the same text. When the
|
||||
JS extraction falls back to full card text, checking pcm first ensures
|
||||
the captured monthly price gets the correct frequency label.
|
||||
"""
|
||||
lower = price_text.lower()
|
||||
if "pcm" in lower or "per month" in lower or "per calendar month" in lower:
|
||||
return "monthly"
|
||||
if "pw" in lower or "per week" in lower or "/w" in lower:
|
||||
return "weekly"
|
||||
if "pa" in lower or "per annum" in lower or "/y" in lower or "per year" in lower:
|
||||
return "yearly"
|
||||
# No indicator — default monthly (Zoopla standard)
|
||||
return "monthly"
|
||||
|
||||
|
||||
def transform_property(
|
||||
raw: dict,
|
||||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
pc_coords: dict[str, tuple[float, float]],
|
||||
search_outcode: str | None = None,
|
||||
|
|
@ -783,13 +692,6 @@ def transform_property(
|
|||
if listing_url and not listing_url.startswith("http"):
|
||||
listing_url = ZOOPLA_BASE + listing_url
|
||||
|
||||
# Detect rent frequency from price text (e.g. "£1,500 pcm" vs "£350 pw")
|
||||
if channel == "BUY":
|
||||
frequency = ""
|
||||
else:
|
||||
price_text = raw.get("price_text", "")
|
||||
frequency = _detect_rent_frequency(price_text)
|
||||
|
||||
return {
|
||||
"id": f"zp_{listing_id}",
|
||||
"Bedrooms": bedrooms,
|
||||
|
|
@ -803,7 +705,7 @@ def transform_property(
|
|||
"Property type": _map_property_type(raw.get("property_type")),
|
||||
"Property sub-type": normalize_sub_type(raw.get("property_type")),
|
||||
"price": int(price),
|
||||
"price_frequency": frequency,
|
||||
"price_frequency": "",
|
||||
"Price qualifier": "",
|
||||
"Total floor area (sqm)": floor_area_sqm,
|
||||
"Listing URL": listing_url,
|
||||
|
|
@ -820,10 +722,9 @@ def transform_property(
|
|||
def search_outcode(
|
||||
page,
|
||||
outcode: str,
|
||||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
pc_coords: dict[str, tuple[float, float]],
|
||||
base_search_url: str | None = None,
|
||||
max_properties: int | None = None,
|
||||
) -> tuple[list[dict], str | None]:
|
||||
"""Search Zoopla for properties in one outcode.
|
||||
|
||||
|
|
@ -831,47 +732,37 @@ def search_outcode(
|
|||
search flow, extracts listings from rendered DOM, and transforms to the
|
||||
standard output schema.
|
||||
|
||||
If base_search_url is provided (from a previous channel search for the same
|
||||
outcode), tries direct URL navigation first — skipping the slow homepage
|
||||
search flow. Falls back to full navigation if direct fails.
|
||||
|
||||
Returns (properties, search_url) where search_url can be passed to the next
|
||||
channel call for this outcode.
|
||||
Returns (properties, search_url).
|
||||
|
||||
Raises TurnstileError if Cloudflare blocks us mid-session.
|
||||
"""
|
||||
navigated = False
|
||||
if base_search_url:
|
||||
navigated = _navigate_direct(page, base_search_url)
|
||||
if navigated:
|
||||
log.debug("Zoopla %s %s: used direct URL navigation", outcode, channel)
|
||||
|
||||
if not navigated:
|
||||
if not _navigate_search(page, outcode, channel):
|
||||
return [], None
|
||||
if not _navigate_search(page, outcode):
|
||||
return [], None
|
||||
|
||||
total_results = _get_result_count(page)
|
||||
|
||||
# Always try extraction even if result count is 0 — the count regex may
|
||||
# not match Zoopla's current text format, but listings may still be in DOM
|
||||
raw_listings = _paginate(page, max(total_results, 25), channel)
|
||||
raw_listings = _paginate(
|
||||
page,
|
||||
max(total_results, 25),
|
||||
max_properties=max_properties,
|
||||
)
|
||||
if not raw_listings:
|
||||
if total_results > 0:
|
||||
log.debug(
|
||||
"Zoopla %s %s: page claims %d results but extraction found 0 — "
|
||||
"DOM selectors may need updating",
|
||||
outcode, channel, total_results,
|
||||
outcode, "BUY", total_results,
|
||||
)
|
||||
return [], None
|
||||
|
||||
channel_label = "buy" if channel == "BUY" else "rent"
|
||||
properties = []
|
||||
dropped = 0
|
||||
for raw in raw_listings:
|
||||
transformed = transform_property(raw, channel, pc_index, pc_coords, search_outcode=outcode)
|
||||
transformed = transform_property(raw, pc_index, pc_coords, search_outcode=outcode)
|
||||
if transformed:
|
||||
properties.append(transformed)
|
||||
zoopla_properties_scraped.labels(channel=channel_label).inc()
|
||||
else:
|
||||
dropped += 1
|
||||
|
||||
|
|
@ -881,13 +772,13 @@ def search_outcode(
|
|||
log.debug(
|
||||
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
|
||||
"(no price/postcode/coords). Sample raw: price=%s address=%r",
|
||||
outcode, channel, len(raw_listings), dropped,
|
||||
outcode, "BUY", len(raw_listings), dropped,
|
||||
sample.get("price"), sample.get("address", ""),
|
||||
)
|
||||
elif dropped > len(raw_listings) // 2:
|
||||
log.debug(
|
||||
"Zoopla %s %s: %d/%d listings dropped in transform",
|
||||
outcode, channel, dropped, len(raw_listings),
|
||||
outcode, "BUY", dropped, len(raw_listings),
|
||||
)
|
||||
|
||||
return properties, page.url
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue