All good
This commit is contained in:
parent
6ea544a0f6
commit
6cc7288126
45 changed files with 929 additions and 1043 deletions
|
|
@ -20,11 +20,6 @@ TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
|||
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
|
||||
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
||||
|
||||
# home.co.uk
|
||||
HOMECOUK_BASE = "https://home.co.uk"
|
||||
HOMECOUK_API_BASE = f"{HOMECOUK_BASE}/api"
|
||||
HOMECOUK_PER_PAGE = 30 # max supported by the API
|
||||
|
||||
# Zoopla
|
||||
ZOOPLA_BASE = "https://www.zoopla.co.uk"
|
||||
|
||||
|
|
@ -108,13 +103,13 @@ PROPERTY_TYPE_MAP = {
|
|||
"House Boat": "Other",
|
||||
"Barn": "Other",
|
||||
"Serviced Apartments": "Other",
|
||||
# Space-separated variants (from home.co.uk underscore/hyphen normalization)
|
||||
# Space-separated variants from legacy provider normalization.
|
||||
"Semi Detached": "Semi-Detached",
|
||||
"Semi Detached Bungalow": "Semi-Detached",
|
||||
"End Of Terrace": "Terraced",
|
||||
"End Terrace": "Terraced",
|
||||
"Block Of Apartments": "Other",
|
||||
# Lowercase variants (from home.co.uk / Rightmove APIs)
|
||||
# Lowercase variants from listing APIs.
|
||||
"house": "Detached",
|
||||
"bungalow": "Other",
|
||||
"townhouse": "Terraced",
|
||||
|
|
|
|||
|
|
@ -1,461 +0,0 @@
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import unquote
|
||||
|
||||
from curl_cffi.requests import Session
|
||||
from curl_cffi.requests.errors import RequestsError
|
||||
|
||||
from constants import (
|
||||
DELAY_BETWEEN_PAGES,
|
||||
HOMECOUK_API_BASE,
|
||||
HOMECOUK_BASE,
|
||||
HOMECOUK_PER_PAGE,
|
||||
MAX_BEDROOMS,
|
||||
PROPERTY_TYPE_MAP,
|
||||
RETRY_BASE_DELAY,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import (
|
||||
normalize_postcode,
|
||||
normalize_sub_type,
|
||||
parse_int_value,
|
||||
validate_floor_area,
|
||||
)
|
||||
|
||||
log = logging.getLogger("homecouk")
|
||||
|
||||
|
||||
class CookiesExpiredError(Exception):
    """Signal that home.co.uk answered with HTTP 403, so the cookies need refreshing."""
|
||||
|
||||
|
||||
class PaginationError(Exception):
    """Signal that paging through home.co.uk results could not be completed."""
|
||||
|
||||
|
||||
# Channel mapping: internal name → URL path segment
|
||||
HOMECOUK_URL_SEGMENT = "for-sale"
|
||||
|
||||
|
||||
def load_cookies() -> tuple[dict[str, str], str] | None:
|
||||
"""Get home.co.uk cookies + user-agent.
|
||||
|
||||
Environment cookies are optional. When they are not present, bootstrap a
|
||||
regular local session by visiting home.co.uk with curl_cffi's Chrome
|
||||
impersonation and reusing the cookies set by the site.
|
||||
"""
|
||||
user_agent = os.environ.get(
|
||||
"HOMECOUK_USER_AGENT",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/145.0.0.0 Safari/537.36",
|
||||
)
|
||||
|
||||
env_cookies = {
|
||||
name: value
|
||||
for name, value in {
|
||||
"cf_clearance": os.environ.get("HOMECOUK_CF_CLEARANCE", ""),
|
||||
"homecouk_session": os.environ.get("HOMECOUK_SESSION", ""),
|
||||
"XSRF-TOKEN": os.environ.get("HOMECOUK_XSRF_TOKEN", ""),
|
||||
}.items()
|
||||
if value
|
||||
}
|
||||
if env_cookies.get("homecouk_session"):
|
||||
return env_cookies, user_agent
|
||||
|
||||
session = Session(impersonate="chrome")
|
||||
session.headers.update(
|
||||
{
|
||||
"User-Agent": user_agent,
|
||||
"Accept": (
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,"
|
||||
"*/*;q=0.8"
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
for url in (HOMECOUK_BASE, f"{HOMECOUK_BASE}/for-sale/br1/"):
|
||||
try:
|
||||
response = session.get(url, timeout=30)
|
||||
except RequestsError as exc:
|
||||
log.warning("home.co.uk cookie bootstrap failed for %s: %s", url, exc)
|
||||
continue
|
||||
if response.status_code == 403:
|
||||
raise CookiesExpiredError("home.co.uk returned HTTP 403 during bootstrap")
|
||||
if response.status_code >= 400:
|
||||
log.warning(
|
||||
"home.co.uk cookie bootstrap got HTTP %d from %s",
|
||||
response.status_code,
|
||||
url,
|
||||
)
|
||||
|
||||
cookies = session.cookies.get_dict()
|
||||
if cookies.get("homecouk_session") and cookies.get("XSRF-TOKEN"):
|
||||
log.info("home.co.uk local session bootstrapped")
|
||||
return cookies, user_agent
|
||||
|
||||
log.warning("home.co.uk did not provide session cookies during bootstrap")
|
||||
return None
|
||||
|
||||
|
||||
def make_client(cookies: dict[str, str], user_agent: str) -> Session:
    """Build a curl_cffi Session set up for home.co.uk API requests.

    The session impersonates Chrome at the TLS level so that cookies obtained
    from a browser remain valid. Every entry of ``cookies`` is installed on
    the ``home.co.uk`` domain.
    """
    client = Session(impersonate="chrome")
    api_headers = {
        "User-Agent": user_agent,
        "Accept": "application/json, text/plain, */*",
        "x-requested-with": "XMLHttpRequest",
    }
    client.headers.update(api_headers)

    # Laravel CSRF: the XSRF-TOKEN cookie value has to be echoed back,
    # URL-decoded, in the X-XSRF-TOKEN request header. Without it the server
    # rejects every request with 419/403.
    xsrf_token = cookies.get("XSRF-TOKEN")
    if xsrf_token:
        client.headers["X-XSRF-TOKEN"] = unquote(xsrf_token)

    for cookie_name in cookies:
        client.cookies.set(cookie_name, cookies[cookie_name], domain="home.co.uk")
    return client
|
||||
|
||||
|
||||
def fetch_page(
    client: Session, url: str, params: dict, max_retries: int = 3
) -> dict | None:
    """GET ``url`` and return the decoded JSON body.

    HTTP 429/5xx responses and transport errors are retried with exponential
    backoff plus jitter, for at most ``max_retries`` attempts in total. No
    delay is spent once the last attempt has failed.

    Args:
        client: Session used to issue the request.
        url: Endpoint to fetch.
        params: Query parameters for the request.
        max_retries: Total number of attempts.

    Returns:
        The parsed JSON, or ``None`` on a non-JSON body, a non-retryable
        status, or when all attempts are exhausted.

    Raises:
        CookiesExpiredError: the server answered with HTTP 403.
    """

    def back_off(reason: str, attempt: int) -> None:
        # Sleeping after the final attempt only delays the failure.
        if attempt + 1 >= max_retries:
            log.warning("%s from %s, no retries left", reason, url)
            return
        delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
        log.warning(
            "%s from %s, retry %d/%d in %.1fs",
            reason,
            url,
            attempt + 1,
            max_retries,
            delay,
        )
        time.sleep(delay)

    for attempt in range(max_retries):
        try:
            resp = client.get(url, params=params, timeout=30)
            if resp.status_code == 200:
                try:
                    return resp.json()
                except json.JSONDecodeError:
                    log.error(
                        "Non-JSON response from %s (got %s)",
                        url,
                        resp.headers.get("content-type", "?"),
                    )
                    return None
            if resp.status_code == 403:
                raise CookiesExpiredError("HTTP 403 — cookies likely expired")
            if resp.status_code in (429, 500, 502, 503, 504):
                back_off(f"HTTP {resp.status_code}", attempt)
                continue
            log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
            return None
        except CookiesExpiredError:
            # Never treat expired cookies as a retryable transport error.
            raise
        except RequestsError as e:
            back_off(type(e).__name__, attempt)
    log.error("All %d retries exhausted for %s", max_retries, url)
    return None
|
||||
|
||||
|
||||
def _coerce_positive_int(value) -> int | None:
    """Parse ``value`` with ``parse_int_value``; return it only if it is > 0."""
    number = parse_int_value(value)
    if number is not None and number > 0:
        return number
    return None
|
||||
|
||||
|
||||
def _property_identity(prop: dict, page: int, index: int) -> str:
|
||||
for key in ("listing_id", "property_id", "id"):
|
||||
value = prop.get(key)
|
||||
if value:
|
||||
return f"{key}:{value}"
|
||||
return (
|
||||
f"page:{page}:index:{index}:"
|
||||
f"{prop.get('display_address') or prop.get('address') or ''}:"
|
||||
f"{prop.get('price') or prop.get('latest_price') or ''}"
|
||||
)
|
||||
|
||||
|
||||
def parse_floor_area(description: str | None) -> float | None:
|
||||
"""Try to extract floor area from description text like '789 sq.ft.' or '73 sq.m.'."""
|
||||
if not description:
|
||||
return None
|
||||
m = re.search(
|
||||
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))",
|
||||
description,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||
m = re.search(
|
||||
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))",
|
||||
description,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
if m:
|
||||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||
return None
|
||||
|
||||
|
||||
def parse_tenure(prop: dict) -> str | None:
|
||||
"""Extract tenure from home.co.uk property data.
|
||||
|
||||
Checks multiple sources in priority order:
|
||||
1. Dedicated 'tenure' or 'tenure_type' field in the API response
|
||||
2. Free-text search in the description for 'freehold' / 'leasehold'
|
||||
3. Free-text search in features lists
|
||||
|
||||
home.co.uk aggregates listings from estate agents, so tenure is often
|
||||
embedded in the description text rather than a structured field.
|
||||
"""
|
||||
# 1. Check dedicated tenure fields (in case the API adds them)
|
||||
for key in ("tenure", "tenure_type", "tenureType"):
|
||||
val = prop.get(key)
|
||||
if val and isinstance(val, str):
|
||||
lower = val.lower().strip()
|
||||
if "leasehold" in lower:
|
||||
return "Leasehold"
|
||||
if "freehold" in lower:
|
||||
return "Freehold"
|
||||
|
||||
# 2. Check description text — estate agents often include tenure here
|
||||
description = prop.get("description") or ""
|
||||
if description:
|
||||
lower_desc = description.lower()
|
||||
if re.search(r"\bleasehold\b", lower_desc):
|
||||
return "Leasehold"
|
||||
if re.search(r"\bfreehold\b", lower_desc):
|
||||
# Matches "Freehold" and "Share of Freehold" (both = freehold ownership)
|
||||
return "Freehold"
|
||||
|
||||
# 3. Check features / key_features lists if present
|
||||
for key in ("features", "key_features", "keyFeatures"):
|
||||
features = prop.get(key)
|
||||
if features and isinstance(features, list):
|
||||
for feat in features:
|
||||
if not isinstance(feat, str):
|
||||
continue
|
||||
lower_feat = feat.lower()
|
||||
if "leasehold" in lower_feat:
|
||||
return "Leasehold"
|
||||
if "freehold" in lower_feat:
|
||||
return "Freehold"
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def map_property_type(raw_type: str | None) -> str:
|
||||
"""Map home.co.uk property type to canonical type."""
|
||||
if not raw_type:
|
||||
return "Other"
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type)
|
||||
if canonical:
|
||||
return canonical
|
||||
# Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc.
|
||||
# Try common patterns
|
||||
lower = raw_type.lower()
|
||||
excluded_flat_like = (
|
||||
"block of apartment",
|
||||
"house of multiple occupation",
|
||||
"private halls",
|
||||
"retirement",
|
||||
"serviced apartment",
|
||||
)
|
||||
if any(term in lower for term in excluded_flat_like):
|
||||
return "Other"
|
||||
if (
|
||||
"flat" in lower
|
||||
or "apartment" in lower
|
||||
or "maisonette" in lower
|
||||
or "studio" in lower
|
||||
):
|
||||
return "Flats/Maisonettes"
|
||||
if "detached" in lower and "semi" not in lower:
|
||||
return "Detached"
|
||||
if "semi" in lower:
|
||||
return "Semi-Detached"
|
||||
if "terrace" in lower or "mews" in lower:
|
||||
return "Terraced"
|
||||
log.debug("Unknown property type: %r — mapping to Other", raw_type)
|
||||
return "Other"
|
||||
|
||||
|
||||
def transform_property(
    prop: dict,
    pc_index: PostcodeSpatialIndex,
) -> dict | None:
    """Convert one raw home.co.uk property dict into the output schema.

    Returns ``None`` (the listing is skipped) when coordinates are missing or
    fall outside the bounding box, when no positive price can be parsed, or
    when no postcode is available from the listing or from ``pc_index``.
    """
    lat, lng = prop.get("latitude"), prop.get("longitude")
    if lat is None or lng is None:
        return None

    # Bounding box used as the "in England" check.
    inside_box = 49 <= lat <= 56 and -7 <= lng <= 2
    if not inside_box:
        log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
        return None

    price = parse_int_value(prop.get("price"))
    if not price:
        price = parse_int_value(prop.get("latest_price"))
    if not price or price <= 0:
        return None

    # The listing's own postcode is preferred; the spatial index is the fallback.
    postcode = prop.get("postcode") or pc_index.nearest(lat, lng)
    if not postcode:
        log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
        return None

    listing_id = prop.get("listing_id") or prop.get("property_id") or ""

    raw_beds = parse_int_value(prop.get("bedrooms")) or 0
    raw_baths = parse_int_value(prop.get("bathrooms")) or 0
    if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
        log.warning(
            "home.co.uk %s: implausible beds=%d baths=%d (capped to 0)",
            listing_id or "?",
            raw_beds, raw_baths,
        )
    # Counts outside 0..MAX_BEDROOMS are replaced by 0.
    bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0

    listing_type = prop.get("listing_property_type") or prop.get("property_type") or ""
    address = prop.get("display_address") or prop.get("address") or ""

    # The price qualifier is derived from the reduction info.
    if not prop.get("is_reduced"):
        price_qualifier = ""
    else:
        pct = prop.get("reduction_percent", 0)
        price_qualifier = f"Reduced by {pct}%" if pct else "Reduced"

    return {
        "id": f"hk_{listing_id}",  # prefix to avoid collision with Rightmove IDs
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        # NOTE(review): this sums bedrooms and bathrooms although the key says
        # "living rooms" — confirm that is intended.
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": normalize_postcode(postcode),
        "Address per Property Register": address,
        "Leasehold/Freehold": parse_tenure(prop),
        "Property type": map_property_type(listing_type),
        "Property sub-type": normalize_sub_type(listing_type),
        "price": price,
        "price_frequency": "",
        "Price qualifier": price_qualifier,
        "Total floor area (sqm)": parse_floor_area(prop.get("description")),
        "Listing URL": f"{HOMECOUK_BASE}/property/{listing_id}",
        "Listing features": [],  # not available from home.co.uk
        "first_visible_date": prop.get("added_date") or "",
    }
|
||||
|
||||
|
||||
def search_outcode(
    client: Session,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Paginate through sale search results for one outcode.

    Successive pages of the for-sale API endpoint are fetched and every raw
    property is passed through ``transform_property``.

    Args:
        client: Session used for the requests; its ``referer`` header is
            overwritten before every page fetch.
        outcode: Outcode to search; lower-cased when building URLs.
        pc_index: Passed through to ``transform_property``.
        max_properties: Return as soon as this many transformed properties
            have been collected. ``None`` means no cap.

    Returns:
        The transformed properties in the order they were encountered.

    Raises:
        PaginationError: a page failed to load, came back empty before the
            advertised end, or held only results already seen on earlier
            pages.
        CookiesExpiredError: propagated from ``fetch_page`` on HTTP 403.
    """
    url_segment = HOMECOUK_URL_SEGMENT
    url = f"{HOMECOUK_API_BASE}/{url_segment}/{outcode.lower()}/"
    properties = []
    page = 1
    # Both are taken from the first response that provides a positive value
    # and are not updated afterwards.
    last_page: int | None = None
    total_results: int | None = None
    # Identities of every raw property seen so far, across all pages.
    seen_ids: set[str] = set()

    while True:
        params = {
            "page": str(page),
            "sort": "date_desc",
            "per_page": str(HOMECOUK_PER_PAGE),
        }

        # Set referer to match the page URL pattern
        client.headers["referer"] = (
            f"https://home.co.uk/{url_segment}/{outcode.lower()}/"
            f"?page={page}&sort=date_desc&per_page={HOMECOUK_PER_PAGE}"
        )

        data = fetch_page(client, url, params)
        if not data:
            # fetch_page returns None on failure; an empty JSON body is
            # treated the same way.
            raise PaginationError(f"home.co.uk {outcode} page {page} failed to load")

        pagination = data.get("pagination", {}) or {}
        if last_page is None:
            last_page = _coerce_positive_int(pagination.get("last_page"))
        if total_results is None:
            total_results = _coerce_positive_int(pagination.get("total"))

        raw_props = data.get("properties", [])
        if not raw_props:
            # An empty page is an error only while results were advertised
            # and this page is not beyond the last advertised page. When
            # last_page is unknown the comparison is always true.
            if total_results and page <= (last_page or page):
                raise PaginationError(
                    f"home.co.uk {outcode} page {page} returned no properties "
                    f"before the advertised end"
                )
            break

        page_ids = {
            _property_identity(prop, page, idx) for idx, prop in enumerate(raw_props)
        }
        # A page consisting solely of already-seen identities means the
        # server is repeating results; abort instead of continuing.
        if page_ids and page_ids.issubset(seen_ids):
            raise PaginationError(
                f"home.co.uk {outcode} page {page} repeated previously seen results"
            )
        seen_ids.update(page_ids)

        for prop in raw_props:
            try:
                transformed = transform_property(prop, pc_index)
            except Exception as exc:
                # One malformed listing must not abort the whole outcode.
                log.warning(
                    "home.co.uk %s property %s failed to transform: %s",
                    outcode,
                    prop.get("listing_id") or prop.get("property_id") or "?",
                    exc,
                )
                continue
            if transformed:
                properties.append(transformed)
                if max_properties is not None and len(properties) >= max_properties:
                    return properties

        # Stop conditions, in order of preference: the advertised last page,
        # then the advertised total, then a short page.
        if last_page is not None:
            if page >= last_page:
                break
        elif total_results is not None and len(seen_ids) >= total_results:
            break
        elif len(raw_props) < HOMECOUK_PER_PAGE:
            break

        page += 1
        time.sleep(DELAY_BETWEEN_PAGES)

    return properties
|
||||
|
|
@ -1,63 +0,0 @@
|
|||
"""Shared target filters for manual buy-listing scrapes."""
|
||||
|
||||
import math
|
||||
from typing import Any
|
||||
|
||||
# Price ceiling; matches_strict_buy_listing_filter rejects prices at or above it.
BUY_MAX_PRICE = 1_000_000
# Inclusive bedroom range.
BUY_MIN_BEDROOMS = 2
BUY_MAX_BEDROOMS = 5
# Exact bathroom counts that pass.
BUY_ALLOWED_BATHROOMS = frozenset({2, 3})
# Inclusive floor-area range in square metres.
BUY_MIN_FLOOR_AREA_SQM = 90.0
BUY_MAX_FLOOR_AREA_SQM = 170.0
# Canonical property types that pass.
BUY_PROPERTY_TYPES = frozenset({"Flats/Maisonettes"})

# The floor-area bounds converted to whole square feet (969 and 1830),
# using 1 sq ft = 0.092903 sq m.
BUY_MIN_FLOOR_AREA_SQFT = round(BUY_MIN_FLOOR_AREA_SQM / 0.092903)
BUY_MAX_FLOOR_AREA_SQFT = round(BUY_MAX_FLOOR_AREA_SQM / 0.092903)
|
||||
|
||||
|
||||
def _number(value: Any) -> float | None:
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
number = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
if not math.isfinite(number):
|
||||
return None
|
||||
return number
|
||||
|
||||
|
||||
def _int(value: Any) -> int | None:
    """Return ``value`` as an int when it is a finite whole number, else ``None``."""
    number = _number(value)
    if number is not None and number.is_integer():
        return int(number)
    return None
|
||||
|
||||
|
||||
def matches_strict_buy_listing_filter(prop: dict) -> bool:
    """Tell whether ``prop`` passes the exact filter guarding output datasets.

    The price is read from ``"price"`` when that key exists, otherwise from
    ``"Asking price"``. A listing passes only if price, bedrooms, property
    type, bathrooms and floor area all fall within the ``BUY_*`` limits.
    """
    price_key = "price" if "price" in prop else "Asking price"
    price = _number(prop.get(price_key))
    if price is None or not (0 < price < BUY_MAX_PRICE):
        return False

    bedrooms = _int(prop.get("Bedrooms"))
    if bedrooms is None or not (BUY_MIN_BEDROOMS <= bedrooms <= BUY_MAX_BEDROOMS):
        return False

    if prop.get("Property type") not in BUY_PROPERTY_TYPES:
        return False

    if _int(prop.get("Bathrooms")) not in BUY_ALLOWED_BATHROOMS:
        return False

    floor_area = _number(prop.get("Total floor area (sqm)"))
    return (
        floor_area is not None
        and BUY_MIN_FLOOR_AREA_SQM <= floor_area <= BUY_MAX_FLOOR_AREA_SQM
    )
|
||||
|
|
@ -5,10 +5,10 @@ import tempfile
|
|||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from constants import DATA_DIR
|
||||
from constants import DATA_DIR, REPO_DIR
|
||||
|
||||
|
||||
SOURCE_CHOICES = ("rightmove", "homecouk", "zoopla", "all")
|
||||
SOURCE_CHOICES = ("rightmove", "zoopla", "all")
|
||||
TEST_MAX_PROPERTIES_PER_SOURCE = 100
|
||||
TEST_OUTCODES = (
|
||||
"E1",
|
||||
|
|
@ -28,14 +28,16 @@ log = logging.getLogger("finder")
|
|||
|
||||
def configure_standalone_runtime() -> None:
|
||||
"""Keep browser/cache/temp files on the project volume for local runs."""
|
||||
runtime_dir = DATA_DIR / ".runtime"
|
||||
runtime_dir = REPO_DIR / ".tmp" / "finder"
|
||||
cache_dir = runtime_dir / "cache"
|
||||
temp_dir = runtime_dir / "tmp"
|
||||
cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
temp_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
os.environ.setdefault("XDG_CACHE_HOME", str(cache_dir))
|
||||
os.environ.setdefault("TMPDIR", str(temp_dir))
|
||||
os.environ["XDG_CACHE_HOME"] = str(cache_dir)
|
||||
os.environ["TMPDIR"] = str(temp_dir)
|
||||
os.environ["TEMP"] = str(temp_dir)
|
||||
os.environ["TMP"] = str(temp_dir)
|
||||
tempfile.tempdir = str(temp_dir)
|
||||
|
||||
|
||||
|
|
@ -47,7 +49,7 @@ def parse_args() -> argparse.Namespace:
|
|||
"--source",
|
||||
choices=SOURCE_CHOICES,
|
||||
default="all",
|
||||
help="Portal to scrape. 'all' runs Rightmove, home.co.uk, and Zoopla.",
|
||||
help="Portal to scrape. 'all' runs Rightmove and Zoopla.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
|
|
@ -89,7 +91,7 @@ def configure_logging() -> None:
|
|||
|
||||
def selected_sources(source: str) -> list[str]:
|
||||
if source == "all":
|
||||
return ["rightmove", "homecouk", "zoopla"]
|
||||
return ["rightmove", "zoopla"]
|
||||
return [source]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ version = "0.1.0"
|
|||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"httpx",
|
||||
"curl_cffi",
|
||||
"polars",
|
||||
"fake-useragent>=2.2.0",
|
||||
"playwright>=1.58.0",
|
||||
|
|
|
|||
|
|
@ -10,15 +10,6 @@ from constants import (
|
|||
TYPEAHEAD_URL,
|
||||
)
|
||||
from http_client import fetch_with_retry
|
||||
from listing_filters import (
|
||||
BUY_ALLOWED_BATHROOMS,
|
||||
BUY_MAX_BEDROOMS,
|
||||
BUY_MAX_FLOOR_AREA_SQFT,
|
||||
BUY_MAX_PRICE,
|
||||
BUY_MIN_BEDROOMS,
|
||||
BUY_MIN_FLOOR_AREA_SQFT,
|
||||
matches_strict_buy_listing_filter,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import transform_property
|
||||
|
||||
|
|
@ -31,24 +22,6 @@ outcode_cache: dict[str, str] = {}
|
|||
# Requesting index >= 1008 returns HTTP 400.
|
||||
_MAX_INDEX = 1008
|
||||
|
||||
_BASE_BUY_SEARCH_PARAMS = {
|
||||
"propertyTypes": "flat",
|
||||
"minBedrooms": str(BUY_MIN_BEDROOMS),
|
||||
"maxBedrooms": str(BUY_MAX_BEDROOMS),
|
||||
"minBathrooms": str(min(BUY_ALLOWED_BATHROOMS)),
|
||||
"maxBathrooms": str(max(BUY_ALLOWED_BATHROOMS)),
|
||||
"minSize": str(BUY_MIN_FLOOR_AREA_SQFT),
|
||||
"maxSize": str(BUY_MAX_FLOOR_AREA_SQFT),
|
||||
"maxPrice": str(BUY_MAX_PRICE - 1),
|
||||
}
|
||||
|
||||
|
||||
def _buy_search_params(extra_params: dict | None = None) -> dict:
    """Return a fresh copy of the base buy-search params, overlaid with ``extra_params``."""
    return {**_BASE_BUY_SEARCH_PARAMS, **(extra_params or {})}
|
||||
|
||||
|
||||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
||||
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
|
||||
|
|
@ -77,7 +50,6 @@ def _paginate(
|
|||
outcode: str,
|
||||
channel_cfg: dict,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
extra_params: dict | None = None,
|
||||
max_properties: int | None = None,
|
||||
) -> tuple[list[dict], int]:
|
||||
"""Paginate through search results. Returns (properties, result_count)."""
|
||||
|
|
@ -94,9 +66,6 @@ def _paginate(
|
|||
"channel": channel_cfg["channel"],
|
||||
"transactionType": channel_cfg["transactionType"],
|
||||
}
|
||||
if extra_params:
|
||||
params.update(extra_params)
|
||||
|
||||
data = fetch_with_retry(client, SEARCH_URL, params)
|
||||
if not data:
|
||||
log.warning(
|
||||
|
|
@ -123,7 +92,7 @@ def _paginate(
|
|||
exc,
|
||||
)
|
||||
continue
|
||||
if transformed and matches_strict_buy_listing_filter(transformed):
|
||||
if transformed:
|
||||
properties.append(transformed)
|
||||
if max_properties is not None and len(properties) >= max_properties:
|
||||
return properties, result_count
|
||||
|
|
@ -137,7 +106,7 @@ def _paginate(
|
|||
break
|
||||
if index >= _MAX_INDEX:
|
||||
log.warning(
|
||||
"%s/%s: %d filtered results exceed Rightmove's %d-result page cap",
|
||||
"%s/%s: %d results exceed Rightmove's %d-result page cap",
|
||||
outcode,
|
||||
channel_cfg["channel"],
|
||||
result_count,
|
||||
|
|
@ -158,18 +127,13 @@ def search_outcode(
|
|||
pc_index: PostcodeSpatialIndex,
|
||||
max_properties: int | None = None,
|
||||
) -> list[dict]:
|
||||
"""Paginate through search results for one outcode+channel. Returns transformed properties.
|
||||
|
||||
Search requests set the supported Rightmove filters directly: flats,
|
||||
2-5 bedrooms, 2-3 bathrooms, 969-1830 sq ft, and asking price below £1m.
|
||||
"""
|
||||
"""Paginate through unfiltered sale results for one outcode+channel."""
|
||||
properties, _ = _paginate(
|
||||
client,
|
||||
outcode_id,
|
||||
outcode,
|
||||
channel_cfg,
|
||||
pc_index,
|
||||
extra_params=_buy_search_params(),
|
||||
max_properties=max_properties,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -14,12 +14,7 @@ from constants import (
|
|||
LONDON_OUTCODE_PREFIXES,
|
||||
)
|
||||
|
||||
from homecouk import CookiesExpiredError
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from homecouk import make_client as make_homecouk_client
|
||||
from homecouk import search_outcode as homecouk_search_outcode
|
||||
from http_client import make_client
|
||||
from listing_filters import matches_strict_buy_listing_filter
|
||||
from rightmove import resolve_outcode_id
|
||||
from rightmove import search_outcode as rightmove_search_outcode
|
||||
from spatial import PostcodeSpatialIndex
|
||||
|
|
@ -30,7 +25,7 @@ from zoopla import search_outcode as zoopla_search_outcode
|
|||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
SOURCE_ORDER = ("rightmove", "homecouk", "zoopla")
|
||||
SOURCE_ORDER = ("rightmove", "zoopla")
|
||||
SALE_CHANNEL = CHANNELS[0]
|
||||
LONDON_AREAS = sorted({prefix.upper() for prefix in LONDON_OUTCODE_PREFIXES})
|
||||
OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z0-9]?)")
|
||||
|
|
@ -260,16 +255,7 @@ def _store_properties(
|
|||
dropped_outside_area,
|
||||
)
|
||||
|
||||
eligible = [prop for prop in londonish if matches_strict_buy_listing_filter(prop)]
|
||||
dropped_non_matching = len(londonish) - len(eligible)
|
||||
if dropped_non_matching:
|
||||
log.debug(
|
||||
"%s dropped %d properties outside the strict buy-listing filters",
|
||||
source,
|
||||
dropped_non_matching,
|
||||
)
|
||||
|
||||
selected = eligible if remaining is None else eligible[:remaining]
|
||||
selected = londonish if remaining is None else londonish[:remaining]
|
||||
results[source].extend(selected)
|
||||
return len(selected)
|
||||
|
||||
|
|
@ -290,6 +276,8 @@ def _launch_zoopla_with_retries(attempts: int = 3):
|
|||
for attempt in range(1, attempts + 1):
|
||||
try:
|
||||
return launch_zoopla_browser()
|
||||
except TurnstileError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
last_error = exc
|
||||
log.warning(
|
||||
|
|
@ -304,13 +292,6 @@ def _launch_zoopla_with_retries(attempts: int = 3):
|
|||
raise last_error
|
||||
|
||||
|
||||
def _new_homecouk_client():
    """Load home.co.uk cookies and build a client; ``None`` if no cookies were obtained."""
    loaded = load_homecouk_cookies()
    if loaded:
        cookies, user_agent = loaded
        return make_homecouk_client(cookies, user_agent)
    return None
|
||||
|
||||
|
||||
def _scrape_rightmove(
|
||||
outcodes: list[str],
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
|
|
@ -368,74 +349,6 @@ def _scrape_rightmove(
|
|||
client.close()
|
||||
|
||||
|
||||
def _scrape_homecouk(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape home.co.uk for each outcode and store the results.

    Each outcode gets up to two attempts: if the first hits expired cookies,
    the session is refreshed once and the outcode retried. Any other failure
    is recorded in ``errors`` and the next outcode is tried. Scraping stops
    early once the per-source cap is reached or the session cannot be
    refreshed.

    Args:
        outcodes: Outcodes to search, in order.
        pc_index: Passed through to the home.co.uk search.
        results: Per-source result lists, extended via ``_store_properties``.
        errors: Error list, appended to via ``_record_error``.
        max_properties_per_source: Cap on stored properties, or ``None``.
    """
    client = _new_homecouk_client()
    if client is None:
        log.warning("home.co.uk skipped: could not bootstrap a local session")
        return

    try:
        for outcode in outcodes:
            if _source_remaining(results, "homecouk", max_properties_per_source) == 0:
                log.info("home.co.uk cap reached")
                return

            for attempt in range(2):
                try:
                    # home.co.uk cannot express the full filter set at source.
                    # Fetch the outcode page set first; _store_properties applies
                    # the strict filter and source cap after transformation.
                    props = homecouk_search_outcode(
                        client,
                        outcode,
                        pc_index,
                        max_properties=None,
                    )
                    added = _store_properties(
                        results,
                        "homecouk",
                        props,
                        max_properties_per_source,
                    )
                    log.info("home.co.uk %s: +%d", outcode, added)
                    break
                except CookiesExpiredError as exc:
                    if attempt == 1:
                        _record_error(errors, "homecouk", outcode, exc)
                        break

                    log.warning(
                        "home.co.uk cookies expired at %s; refreshing local session",
                        outcode,
                    )
                    try:
                        client.close()
                    except Exception:
                        # Best effort: a failed close must not stop the refresh.
                        log.debug("home.co.uk client close failed", exc_info=True)
                    client = _new_homecouk_client()
                    if client is None:
                        _record_error(
                            errors,
                            "homecouk",
                            outcode,
                            RuntimeError("could not refresh local session"),
                        )
                        return
                except Exception as exc:
                    _record_error(errors, "homecouk", outcode, exc)
                    break

            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        # client is None here when the mid-run refresh failed; calling
        # close() on it would raise AttributeError from this finally block.
        if client is not None:
            client.close()
|
||||
|
||||
|
||||
def _scrape_zoopla(
|
||||
outcodes: list[str],
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
|
|
@ -459,9 +372,8 @@ def _scrape_zoopla(
|
|||
|
||||
for attempt in range(2):
|
||||
try:
|
||||
# Zoopla source-side filters are unverified here. Fetch the
|
||||
# outcode page set first; _store_properties applies the
|
||||
# strict filter and source cap after transformation.
|
||||
# Fetch the outcode page set first; _store_properties applies
|
||||
# the London-ish postcode filter and source cap after transformation.
|
||||
props, _ = zoopla_search_outcode(
|
||||
page,
|
||||
outcode,
|
||||
|
|
@ -539,15 +451,6 @@ def run_scrape(
|
|||
max_properties_per_source,
|
||||
)
|
||||
|
||||
if "homecouk" in selected_sources:
|
||||
_scrape_homecouk(
|
||||
selected_outcodes,
|
||||
pc_index,
|
||||
results,
|
||||
errors,
|
||||
max_properties_per_source,
|
||||
)
|
||||
|
||||
if "zoopla" in selected_sources:
|
||||
if pc_coords is None:
|
||||
pc_coords = build_postcode_coords()
|
||||
|
|
@ -567,20 +470,10 @@ def run_scrape(
|
|||
else:
|
||||
if output_path.exists():
|
||||
output_path.unlink()
|
||||
log.warning("No strict properties to write to %s", output_path)
|
||||
|
||||
filtered = [prop for prop in merged if matches_strict_buy_listing_filter(prop)]
|
||||
filtered_output_path = output_base / "online_listings_buy_filtered.parquet"
|
||||
if filtered:
|
||||
write_parquet(filtered, filtered_output_path)
|
||||
else:
|
||||
if filtered_output_path.exists():
|
||||
filtered_output_path.unlink()
|
||||
log.warning("No strict-filtered properties to write to %s", filtered_output_path)
|
||||
log.warning("No London-ish properties to write to %s", output_path)
|
||||
|
||||
counts = {
|
||||
"total": len(merged),
|
||||
"filtered_total": len(filtered),
|
||||
"deduped": deduped,
|
||||
"sources": source_counts,
|
||||
}
|
||||
|
|
@ -588,9 +481,8 @@ def run_scrape(
|
|||
f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
|
||||
)
|
||||
log.info(
|
||||
"Sale scrape complete: %d unique, %d strict-filtered (%s deduped:%d)",
|
||||
"Sale scrape complete: %d unique (%s deduped:%d)",
|
||||
len(merged),
|
||||
len(filtered),
|
||||
source_summary,
|
||||
deduped,
|
||||
)
|
||||
|
|
@ -603,7 +495,6 @@ def run_scrape(
|
|||
},
|
||||
"counts": counts,
|
||||
"path": str(output_path),
|
||||
"filtered_path": str(filtered_output_path),
|
||||
"errors": errors,
|
||||
"elapsed_seconds": round(time.time() - started_at, 3),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -76,7 +76,7 @@ def normalize_sub_type(sub_type: str | None) -> str:
|
|||
"""Normalize property sub-type for consistent storage.
|
||||
|
||||
Fixes delimiter inconsistencies (underscores/hyphens → spaces) from
|
||||
home.co.uk and truncates Zoopla description fragments that were
|
||||
legacy listing data and truncates Zoopla description fragments that were
|
||||
accidentally captured as sub-types.
|
||||
"""
|
||||
if not sub_type:
|
||||
|
|
@ -200,31 +200,13 @@ def transform_property(
|
|||
|
||||
price_obj = prop.get("price", {})
|
||||
amount = parse_int_value(price_obj.get("amount"))
|
||||
if not amount:
|
||||
return None
|
||||
price = amount
|
||||
if price <= 0:
|
||||
return None
|
||||
price = amount or 0
|
||||
|
||||
display_prices = price_obj.get("displayPrices", [])
|
||||
price_qualifier = (
|
||||
display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
|
||||
)
|
||||
|
||||
# POA / Auction listings have unreliable prices — treat as no price
|
||||
pq_lower = price_qualifier.lower()
|
||||
non_comparable_price_terms = (
|
||||
"poa",
|
||||
"auction",
|
||||
"shared ownership",
|
||||
"shared equity",
|
||||
"part buy",
|
||||
"part rent",
|
||||
"from",
|
||||
)
|
||||
if any(term in pq_lower for term in non_comparable_price_terms):
|
||||
return None
|
||||
|
||||
sub_type = prop.get("propertySubType", "")
|
||||
raw_beds = parse_int_value(prop.get("bedrooms")) or 0
|
||||
raw_baths = parse_int_value(prop.get("bathrooms")) or 0
|
||||
|
|
|
|||
91
finder/uv.lock
generated
91
finder/uv.lock
generated
|
|
@ -72,63 +72,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cffi"
|
||||
version = "2.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pycparser", marker = "implementation_name != 'PyPy'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, upload-time = "2025-09-08T23:22:57.188Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, upload-time = "2025-09-08T23:23:44.61Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "3.4.6"
|
||||
|
|
@ -223,29 +166,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "curl-cffi"
|
||||
version = "0.14.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "certifi" },
|
||||
{ name = "cffi" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/9b/c9/0067d9a25ed4592b022d4558157fcdb6e123516083700786d38091688767/curl_cffi-0.14.0.tar.gz", hash = "sha256:5ffbc82e59f05008ec08ea432f0e535418823cda44178ee518906a54f27a5f0f", size = 162633, upload-time = "2025-12-16T03:25:07.931Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/aa/f0/0f21e9688eaac85e705537b3a87a5588d0cefb2f09d83e83e0e8be93aa99/curl_cffi-0.14.0-cp39-abi3-macosx_14_0_arm64.whl", hash = "sha256:e35e89c6a69872f9749d6d5fda642ed4fc159619329e99d577d0104c9aad5893", size = 3087277, upload-time = "2025-12-16T03:24:49.607Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ba/a3/0419bd48fce5b145cb6a2344c6ac17efa588f5b0061f212c88e0723da026/curl_cffi-0.14.0-cp39-abi3-macosx_15_0_x86_64.whl", hash = "sha256:5945478cd28ad7dfb5c54473bcfb6743ee1d66554d57951fdf8fc0e7d8cf4e45", size = 5804650, upload-time = "2025-12-16T03:24:51.518Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e2/07/a238dd062b7841b8caa2fa8a359eb997147ff3161288f0dd46654d898b4d/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c42e8fa3c667db9ccd2e696ee47adcd3cd5b0838d7282f3fc45f6c0ef3cfdfa7", size = 8231918, upload-time = "2025-12-16T03:24:52.862Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7c/d2/ce907c9b37b5caf76ac08db40cc4ce3d9f94c5500db68a195af3513eacbc/curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:060fe2c99c41d3cb7f894de318ddf4b0301b08dca70453d769bd4e74b36b8483", size = 8654624, upload-time = "2025-12-16T03:24:54.579Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f2/ae/6256995b18c75e6ef76b30753a5109e786813aa79088b27c8eabb1ef85c9/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b158c41a25388690dd0d40b5bc38d1e0f512135f17fdb8029868cbc1993d2e5b", size = 8010654, upload-time = "2025-12-16T03:24:56.507Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fb/10/ff64249e516b103cb762e0a9dca3ee0f04cf25e2a1d5d9838e0f1273d071/curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_i686.whl", hash = "sha256:1439fbef3500fb723333c826adf0efb0e2e5065a703fb5eccce637a2250db34a", size = 7781969, upload-time = "2025-12-16T03:24:57.885Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/51/76/d6f7bb76c2d12811aa7ff16f5e17b678abdd1b357b9a8ac56310ceccabd5/curl_cffi-0.14.0-cp39-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e7176f2c2d22b542e3cf261072a81deb018cfa7688930f95dddef215caddb469", size = 7969133, upload-time = "2025-12-16T03:24:59.261Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/23/7c/cca39c0ed4e1772613d3cba13091c0e9d3b89365e84b9bf9838259a3cd8f/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:03f21ade2d72978c2bb8670e9b6de5260e2755092b02d94b70b906813662998d", size = 9080167, upload-time = "2025-12-16T03:25:00.946Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/75/03/a942d7119d3e8911094d157598ae0169b1c6ca1bd3f27d7991b279bcc45b/curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:58ebf02de64ee5c95613209ddacb014c2d2f86298d7080c0a1c12ed876ee0690", size = 9520464, upload-time = "2025-12-16T03:25:02.922Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a2/77/78900e9b0833066d2274bda75cba426fdb4cef7fbf6a4f6a6ca447607bec/curl_cffi-0.14.0-cp39-abi3-win_amd64.whl", hash = "sha256:6e503f9a103f6ae7acfb3890c843b53ec030785a22ae7682a22cc43afb94123e", size = 1677416, upload-time = "2025-12-16T03:25:04.902Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5c/7c/d2ba86b0b3e1e2830bd94163d047de122c69a8df03c5c7c36326c456ad82/curl_cffi-0.14.0-cp39-abi3-win_arm64.whl", hash = "sha256:2eed50a969201605c863c4c31269dfc3e0da52916086ac54553cfa353022425c", size = 1425067, upload-time = "2025-12-16T03:25:06.454Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cython"
|
||||
version = "3.2.4"
|
||||
|
|
@ -274,7 +194,6 @@ version = "0.1.0"
|
|||
source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "camoufox" },
|
||||
{ name = "curl-cffi" },
|
||||
{ name = "fake-useragent" },
|
||||
{ name = "httpx" },
|
||||
{ name = "playwright" },
|
||||
|
|
@ -284,7 +203,6 @@ dependencies = [
|
|||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "camoufox", specifier = ">=0.4.11" },
|
||||
{ name = "curl-cffi" },
|
||||
{ name = "fake-useragent", specifier = ">=2.2.0" },
|
||||
{ name = "httpx" },
|
||||
{ name = "playwright", specifier = ">=1.58.0" },
|
||||
|
|
@ -639,15 +557,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/b3/eb/936f5eeae196e8c8aaabe5f7d98891be8a5bbc741d50ce5c60f55575ad29/polars_runtime_32-1.39.0-cp310-abi3-win_arm64.whl", hash = "sha256:d69abde5f148566860bbe910010847bd7791e72f7c8063a4d2c462246a33a72a", size = 41885761, upload-time = "2026-03-12T14:23:16.773Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pycparser"
|
||||
version = "3.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyee"
|
||||
version = "13.0.1"
|
||||
|
|
|
|||
209
finder/zoopla.py
209
finder/zoopla.py
|
|
@ -1,8 +1,8 @@
|
|||
"""Zoopla (zoopla.co.uk) scraper — sale properties.
|
||||
|
||||
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
|
||||
blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
|
||||
patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
|
||||
blocks non-browser HTTP clients and even Playwright with stealth patches. Only
|
||||
Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
|
||||
|
||||
Zoopla uses Next.js App Router with React Server Components (RSC). Search
|
||||
result data is server-rendered in an RSC stream, not available via
|
||||
|
|
@ -19,11 +19,20 @@ Architecture:
|
|||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
||||
|
||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from constants import (
|
||||
DATA_DIR,
|
||||
DELAY_BETWEEN_PAGES,
|
||||
MAX_BEDROOMS,
|
||||
PROPERTY_TYPE_MAP,
|
||||
ZOOPLA_BASE,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_sub_type, parse_int_value, validate_floor_area
|
||||
|
||||
|
|
@ -255,11 +264,120 @@ _DISMISS_COOKIES_JS = """() => {
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
_FALSE_ENV_VALUES = {"0", "false", "no", "off"}
|
||||
_TRUE_ENV_VALUES = {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def _env_bool_or_virtual(name: str, default: bool | str) -> bool | str:
|
||||
raw = os.environ.get(name)
|
||||
if raw is None:
|
||||
return default
|
||||
|
||||
value = raw.strip().lower()
|
||||
if value == "virtual":
|
||||
return "virtual"
|
||||
if value in _TRUE_ENV_VALUES:
|
||||
return True
|
||||
if value in _FALSE_ENV_VALUES:
|
||||
return False
|
||||
raise ValueError(
|
||||
f"{name} must be one of 1/0, true/false, yes/no, on/off, or virtual"
|
||||
)
|
||||
|
||||
|
||||
def _visible_display_available() -> bool:
    """Report whether a visible browser window can be shown.

    On Linux this requires a non-empty ``DISPLAY`` or ``WAYLAND_DISPLAY``
    environment variable; on every other platform it is always ``True``.
    """
    if not sys.platform.startswith("linux"):
        return True
    display_vars = ("DISPLAY", "WAYLAND_DISPLAY")
    return any(os.environ.get(var) for var in display_vars)
|
||||
|
||||
|
||||
def _zoopla_headless_mode() -> bool | str:
    """Resolve the headless setting used for the Zoopla browser.

    The default is a visible browser, so the person running the scrape can
    complete a Cloudflare challenge. When no display is available the default
    becomes headless, so startup still works and a challenge fails fast with
    an actionable error. ``ZOOPLA_HEADLESS`` overrides the default.

    Returns:
        ``True``, ``False`` or ``"virtual"``.
    """
    has_display = _visible_display_available()
    return _env_bool_or_virtual("ZOOPLA_HEADLESS", not has_display)
|
||||
|
||||
|
||||
def _zoopla_profile_dir() -> Path:
    """Return the absolute directory holding the Zoopla browser profile.

    A non-empty ``ZOOPLA_PROFILE_DIR`` wins; otherwise the profile lives under
    ``DATA_DIR/.runtime/zoopla-profile``. ``~`` is expanded and the path is
    resolved in both cases.
    """
    override = os.environ.get("ZOOPLA_PROFILE_DIR")
    if override:
        location = Path(override)
    else:
        location = DATA_DIR / ".runtime" / "zoopla-profile"
    return location.expanduser().resolve()
|
||||
|
||||
|
||||
def _challenge_timeout_seconds() -> int:
    """Return how many seconds to wait for a Turnstile challenge to be solved.

    Reads ``ZOOPLA_CHALLENGE_TIMEOUT_SECONDS``. An unset or blank variable
    yields the default of 300.

    Raises:
        ValueError: If the value is not an integer or is less than one.
    """
    raw = os.environ.get("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS")
    # A blank value means "not configured", the same way an empty
    # ZOOPLA_PROFILE_DIR falls back to its default instead of failing.
    if raw is None or not raw.strip():
        return 300
    try:
        timeout = int(raw)
    except ValueError as exc:
        raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be an integer") from exc
    if timeout < 1:
        raise ValueError("ZOOPLA_CHALLENGE_TIMEOUT_SECONDS must be greater than zero")
    return timeout
|
||||
|
||||
|
||||
def _is_turnstile_challenge(page) -> bool:
    """Tell whether ``page`` currently shows a Cloudflare challenge.

    Two signals are checked in order: a page title containing "just a moment"
    (case-insensitive), then the presence of a Cloudflare challenge iframe or
    a Turnstile response input. Errors from either probe are swallowed on
    purpose; a failing probe counts as "no signal".
    """
    challenge_selector = (
        'iframe[src*="challenges.cloudflare.com"], '
        'input[name="cf-turnstile-response"]'
    )

    try:
        title_flagged = "just a moment" in page.title().lower()
    except Exception:
        title_flagged = False
    if title_flagged:
        return True

    try:
        marker_found = bool(page.query_selector(challenge_selector))
    except Exception:
        marker_found = False
    return marker_found
|
||||
|
||||
|
||||
def _wait_for_turnstile(page, headless_mode: bool | str) -> None:
    """Wait for a person to clear a Cloudflare Turnstile challenge on ``page``.

    Returns immediately when no challenge is detected. Otherwise the page is
    re-checked every three seconds until the challenge disappears or the
    configured timeout elapses.

    Args:
        page: Browser page to inspect.
        headless_mode: Mode the browser runs in: ``True``, ``False`` or
            ``"virtual"``.

    Raises:
        TurnstileError: If a challenge is present while running headless or
            virtual, or if it is still present when the timeout elapses.
    """
    if not _is_turnstile_challenge(page):
        return

    profile_dir = _zoopla_profile_dir()
    # `is True` is deliberate: only the real boolean counts as headless.
    no_visible_window = headless_mode is True or headless_mode == "virtual"
    if no_visible_window:
        raise TurnstileError(
            "Cloudflare Turnstile requires a visible browser session. "
            "Run Zoopla from a desktop session with ZOOPLA_HEADLESS=0; "
            f"the solved session will be saved in {profile_dir}."
        )

    timeout = _challenge_timeout_seconds()
    log.warning(
        "Cloudflare Turnstile challenge shown. Complete it in the Zoopla browser "
        "window; waiting up to %ds. Profile: %s",
        timeout,
        profile_dir,
    )

    # Best effort only: failing to raise the window must not abort the wait.
    try:
        page.bring_to_front()
    except Exception:
        pass

    stop_at = time.monotonic() + timeout
    while True:
        if time.monotonic() >= stop_at:
            raise TurnstileError(
                f"Cloudflare Turnstile was not completed after {timeout}s"
            )
        time.sleep(3)
        if _is_turnstile_challenge(page):
            continue
        log.info("Cloudflare challenge resolved")
        return
|
||||
|
||||
|
||||
def launch_browser():
|
||||
"""Launch Camoufox, navigate to Zoopla homepage, pass Cloudflare Turnstile,
|
||||
and dismiss cookie consent. Returns (browser, page) tuple.
|
||||
|
||||
Raises TurnstileError if Cloudflare cannot be passed within two minutes.
|
||||
Raises TurnstileError if Cloudflare cannot be completed.
|
||||
Caller must close browser when done."""
|
||||
from camoufox.pkgman import camoufox_path
|
||||
|
||||
|
|
@ -269,61 +387,50 @@ def launch_browser():
|
|||
|
||||
from camoufox.sync_api import Camoufox
|
||||
|
||||
log.info("Launching Camoufox browser for Zoopla...")
|
||||
camoufox = Camoufox(headless=True)
|
||||
headless_mode = _zoopla_headless_mode()
|
||||
profile_dir = _zoopla_profile_dir()
|
||||
profile_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
log.info(
|
||||
"Launching Camoufox browser for Zoopla (headless=%s, profile=%s)...",
|
||||
headless_mode,
|
||||
profile_dir,
|
||||
)
|
||||
camoufox = Camoufox(
|
||||
headless=headless_mode,
|
||||
persistent_context=True,
|
||||
user_data_dir=str(profile_dir),
|
||||
locale=["en-GB", "en"],
|
||||
enable_cache=True,
|
||||
)
|
||||
raw_browser = camoufox.__enter__()
|
||||
browser = _ManagedCamoufoxBrowser(camoufox, raw_browser)
|
||||
page = browser.new_page()
|
||||
page = raw_browser.pages[0] if raw_browser.pages else raw_browser.new_page()
|
||||
|
||||
log.info("Navigating to Zoopla homepage...")
|
||||
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
|
||||
try:
|
||||
log.info("Navigating to Zoopla homepage...")
|
||||
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)
|
||||
_wait_for_turnstile(page, headless_mode)
|
||||
|
||||
# Wait for Cloudflare Turnstile to resolve.
|
||||
# Try clicking the Turnstile checkbox if present (helps in some cases).
|
||||
for i in range(40):
|
||||
if "Just a moment" not in page.title():
|
||||
break
|
||||
# Attempt to click the Turnstile checkbox in the challenge iframe
|
||||
for frame in page.frames:
|
||||
if "challenges.cloudflare.com" in frame.url:
|
||||
try:
|
||||
iframe_el = page.query_selector('iframe[src*="challenges.cloudflare"]')
|
||||
if iframe_el:
|
||||
box = iframe_el.bounding_box()
|
||||
if box:
|
||||
page.mouse.click(box["x"] + 30, box["y"] + box["height"] / 2)
|
||||
except Exception:
|
||||
pass
|
||||
break
|
||||
time.sleep(3)
|
||||
else:
|
||||
page.close()
|
||||
browser.close()
|
||||
raise TurnstileError("Cloudflare Turnstile did not resolve after 120s")
|
||||
log.info("Zoopla browser ready — title: %s", page.title())
|
||||
time.sleep(2)
|
||||
|
||||
log.info("Cloudflare passed — title: %s", page.title())
|
||||
time.sleep(2)
|
||||
|
||||
# Dismiss cookie consent
|
||||
page.evaluate(_DISMISS_COOKIES_JS)
|
||||
time.sleep(1)
|
||||
# Dismiss cookie consent
|
||||
page.evaluate(_DISMISS_COOKIES_JS)
|
||||
time.sleep(1)
|
||||
except Exception:
|
||||
try:
|
||||
page.close()
|
||||
finally:
|
||||
browser.close()
|
||||
raise
|
||||
|
||||
return browser, page
|
||||
|
||||
|
||||
def _ensure_not_challenged(page) -> None:
|
||||
"""Check if current page is a Cloudflare challenge and wait/raise."""
|
||||
if "Just a moment" not in page.title():
|
||||
return
|
||||
|
||||
log.warning("Cloudflare challenge detected mid-session, waiting...")
|
||||
for i in range(40):
|
||||
time.sleep(3)
|
||||
if "Just a moment" not in page.title():
|
||||
log.info("Cloudflare challenge resolved")
|
||||
return
|
||||
|
||||
raise TurnstileError("Cloudflare re-challenge did not resolve after 120s")
|
||||
_wait_for_turnstile(page, _zoopla_headless_mode())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -704,9 +811,7 @@ def transform_property(
|
|||
|
||||
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
||||
from postcodes extracted from the address text."""
|
||||
price = parse_int_value(raw.get("price"))
|
||||
if not price or price <= 0:
|
||||
return None
|
||||
price = parse_int_value(raw.get("price")) or 0
|
||||
|
||||
address = raw.get("address", "")
|
||||
|
||||
|
|
@ -856,7 +961,7 @@ def search_outcode(
|
|||
sample = raw_listings[0] if raw_listings else {}
|
||||
log.debug(
|
||||
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
|
||||
"(no price/postcode/coords). Sample raw: price=%s address=%r",
|
||||
"(no postcode/coords). Sample raw: price=%s address=%r",
|
||||
outcode, "BUY", len(raw_listings), dropped,
|
||||
sample.get("price"), sample.get("address", ""),
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue