"""Scraper helpers for home.co.uk sale listings.

Covers cookie/session bootstrap, JSON page fetching with retries, and the
transformation of raw listing dicts into the project's output schema.
"""

import json
import logging
import math
import os
import random
import re
import time
from urllib.parse import unquote

from curl_cffi.requests import Session
from curl_cffi.requests.errors import RequestsError

from constants import (
    DELAY_BETWEEN_PAGES,
    HOMECOUK_API_BASE,
    HOMECOUK_BASE,
    HOMECOUK_PER_PAGE,
    MAX_BEDROOMS,
    PROPERTY_TYPE_MAP,
    RETRY_BASE_DELAY,
)
from spatial import PostcodeSpatialIndex
from transform import normalize_postcode, normalize_sub_type, validate_floor_area

log = logging.getLogger("homecouk")


class CookiesExpiredError(Exception):
    """Raised when home.co.uk returns 403, indicating cookies need refresh."""


# URL path segment used for sale listings (both page URLs and API URLs).
HOMECOUK_URL_SEGMENT = "for-sale"

# Status codes that fetch_page treats as transient and retries.
_RETRYABLE_STATUSES = frozenset({429, 500, 502, 503, 504})

# A number must start with a digit; a bare "," (as in "spacious, sq ft TBC")
# must not count as a match, otherwise float("") would raise.
_NUMBER_PATTERN = r"(\d[\d,]*(?:\.\d+)?)"
_SQFT_RE = re.compile(_NUMBER_PATTERN + r"\s*sq\.?\s*ft", re.IGNORECASE)
_SQM_RE = re.compile(_NUMBER_PATTERN + r"\s*sq\.?\s*m", re.IGNORECASE)
_SQFT_TO_SQM = 0.092903


def _as_number(value: object) -> int | float | None:
    """Return *value* as a number, or None when it cannot be read as one.

    Ints and floats pass through unchanged so well-formed payloads produce
    exactly the same output as before. Numeric strings are parsed (integral
    values become ints); anything else, including non-finite strings such as
    "nan" or "inf", yields None.
    """
    if isinstance(value, (int, float)):
        return value
    if not isinstance(value, str):
        return None
    try:
        parsed = float(value)
    except ValueError:
        return None
    if not math.isfinite(parsed):
        return None
    return int(parsed) if parsed.is_integer() else parsed


def load_cookies() -> tuple[dict[str, str], str] | None:
    """Get home.co.uk cookies + user-agent.

    Environment cookies are optional. When they are not present, bootstrap a
    regular local session by visiting home.co.uk with curl_cffi's Chrome
    impersonation and reusing the cookies set by the site.

    Returns:
        ``(cookies, user_agent)`` on success, or ``None`` when the site did
        not hand out both ``homecouk_session`` and ``XSRF-TOKEN`` cookies.

    Raises:
        CookiesExpiredError: if a bootstrap request is answered with HTTP 403.
    """
    user_agent = os.environ.get(
        "HOMECOUK_USER_AGENT",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/145.0.0.0 Safari/537.36",
    )
    env_cookies = {
        name: value
        for name, value in {
            "cf_clearance": os.environ.get("HOMECOUK_CF_CLEARANCE", ""),
            "homecouk_session": os.environ.get("HOMECOUK_SESSION", ""),
            "XSRF-TOKEN": os.environ.get("HOMECOUK_XSRF_TOKEN", ""),
        }.items()
        if value
    }
    # A session cookie from the environment is enough to skip the bootstrap.
    if env_cookies.get("homecouk_session"):
        return env_cookies, user_agent

    bootstrap_urls = (
        HOMECOUK_BASE,
        f"{HOMECOUK_BASE}/{HOMECOUK_URL_SEGMENT}/br1/",
    )
    # The bootstrap session is only needed to collect cookies; the context
    # manager closes it on every exit path (return, 403 raise, fall-through).
    with Session(impersonate="chrome") as session:
        session.headers.update(
            {
                "User-Agent": user_agent,
                "Accept": (
                    "text/html,application/xhtml+xml,application/xml;q=0.9,"
                    "*/*;q=0.8"
                ),
            }
        )
        for url in bootstrap_urls:
            try:
                response = session.get(url, timeout=30)
            except RequestsError as exc:
                log.warning("home.co.uk cookie bootstrap failed for %s: %s", url, exc)
                continue
            if response.status_code == 403:
                raise CookiesExpiredError(
                    "home.co.uk returned HTTP 403 during bootstrap"
                )
            if response.status_code >= 400:
                log.warning(
                    "home.co.uk cookie bootstrap got HTTP %d from %s",
                    response.status_code,
                    url,
                )
            # Cookies accumulate on the session, so check after every request.
            cookies = session.cookies.get_dict()
            if cookies.get("homecouk_session") and cookies.get("XSRF-TOKEN"):
                log.info("home.co.uk local session bootstrapped")
                return cookies, user_agent

    log.warning("home.co.uk did not provide session cookies during bootstrap")
    return None


def make_client(cookies: dict[str, str], user_agent: str) -> Session:
    """Create a curl_cffi Session configured for home.co.uk API calls.

    Uses Chrome TLS impersonation so browser-derived cookies remain valid.
    The caller owns the returned session.
    """
    session = Session(impersonate="chrome")
    session.headers.update(
        {
            "User-Agent": user_agent,
            "Accept": "application/json, text/plain, */*",
            "x-requested-with": "XMLHttpRequest",
        }
    )
    # Laravel CSRF: the XSRF-TOKEN cookie value must also be sent as the
    # X-XSRF-TOKEN request header (URL-decoded). Without this header, the
    # server rejects every request with 419/403.
    xsrf = cookies.get("XSRF-TOKEN")
    if xsrf:
        session.headers["X-XSRF-TOKEN"] = unquote(xsrf)

    for name, value in cookies.items():
        session.cookies.set(name, value, domain="home.co.uk")
    return session


def fetch_page(
    client: Session, url: str, params: dict, max_retries: int = 3
) -> dict | None:
    """GET JSON with retries on 429/5xx and transport errors.

    Makes at most ``max_retries`` attempts with exponential backoff plus
    jitter between them; no delay is spent after the final attempt.

    Returns:
        The decoded JSON body, or ``None`` on a non-retryable status, an
        undecodable body, or once all attempts are used up.

    Raises:
        CookiesExpiredError: immediately on HTTP 403 (cookies expired).
    """
    for attempt in range(max_retries):
        try:
            resp = client.get(url, params=params, timeout=30)
        except RequestsError as exc:
            reason = type(exc).__name__
        else:
            if resp.status_code == 200:
                try:
                    return resp.json()
                except (json.JSONDecodeError, UnicodeDecodeError):
                    log.error(
                        "Non-JSON response from %s (got %s)",
                        url,
                        resp.headers.get("content-type", "?"),
                    )
                    return None
            if resp.status_code == 403:
                raise CookiesExpiredError("HTTP 403 — cookies likely expired")
            if resp.status_code not in _RETRYABLE_STATUSES:
                log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
                return None
            reason = f"HTTP {resp.status_code}"

        if attempt + 1 >= max_retries:
            # Nothing left to retry: report the failure without sleeping.
            log.warning(
                "%s from %s on final attempt %d/%d",
                reason,
                url,
                attempt + 1,
                max_retries,
            )
            break

        delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
        log.warning(
            "%s from %s, retry %d/%d in %.1fs",
            reason,
            url,
            attempt + 1,
            max_retries,
            delay,
        )
        time.sleep(delay)

    log.error("All %d retries exhausted for %s", max_retries, url)
    return None


def parse_floor_area(description: str | None) -> float | None:
    """Try to extract floor area from description text.

    Recognises text like '789 sq.ft.' (converted to square metres) or
    '73 sq.m.'. A square-feet figure takes precedence when both are present.
    The result is passed through ``validate_floor_area``.
    """
    if not description or not isinstance(description, str):
        return None

    sqft_match = _SQFT_RE.search(description)
    if sqft_match:
        sqft = float(sqft_match.group(1).replace(",", ""))
        return validate_floor_area(round(sqft * _SQFT_TO_SQM, 1))

    sqm_match = _SQM_RE.search(description)
    if sqm_match:
        sqm = float(sqm_match.group(1).replace(",", ""))
        return validate_floor_area(round(sqm, 1))

    return None


def parse_tenure(prop: dict) -> str | None:
    """Extract tenure from home.co.uk property data.

    Checks multiple sources in priority order:
    1. Dedicated 'tenure' or 'tenure_type' field in the API response
    2. Free-text search in the description for 'freehold' / 'leasehold'
    3. Free-text search in features lists

    home.co.uk aggregates listings from estate agents, so tenure is often
    embedded in the description text rather than a structured field.

    Returns:
        ``"Leasehold"``, ``"Freehold"``, or ``None`` when nothing matches.
        Within each source, 'leasehold' is tested before 'freehold'.
    """
    # 1. Check dedicated tenure fields (in case the API adds them)
    for key in ("tenure", "tenure_type", "tenureType"):
        val = prop.get(key)
        if val and isinstance(val, str):
            lower = val.lower().strip()
            if "leasehold" in lower:
                return "Leasehold"
            if "freehold" in lower:
                return "Freehold"

    # 2. Check description text — estate agents often include tenure here
    description = prop.get("description") or ""
    if description and isinstance(description, str):
        lower_desc = description.lower()
        if re.search(r"\bleasehold\b", lower_desc):
            return "Leasehold"
        if re.search(r"\bfreehold\b", lower_desc):
            # Matches "Freehold" and "Share of Freehold" (both = freehold ownership)
            return "Freehold"

    # 3. Check features / key_features lists if present
    for key in ("features", "key_features", "keyFeatures"):
        features = prop.get(key)
        if features and isinstance(features, list):
            for feat in features:
                if not isinstance(feat, str):
                    continue
                lower_feat = feat.lower()
                if "leasehold" in lower_feat:
                    return "Leasehold"
                if "freehold" in lower_feat:
                    return "Freehold"

    return None


def map_property_type(raw_type: str | None) -> str:
    """Map home.co.uk property type to canonical type.

    An exact ``PROPERTY_TYPE_MAP`` hit wins; otherwise the type is classified
    by case-insensitive keyword, falling back to ``"Other"``.
    """
    if not raw_type:
        return "Other"

    canonical = PROPERTY_TYPE_MAP.get(raw_type)
    if canonical:
        return canonical

    # Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc.
    # Try common patterns
    lower = raw_type.lower()
    if any(word in lower for word in ("flat", "apartment", "maisonette", "studio")):
        return "Flats/Maisonettes"
    # "semi-detached" contains "detached", so "semi" decides between the two.
    if "semi" in lower:
        return "Semi-Detached"
    if "detached" in lower:
        return "Detached"
    if "terrace" in lower or "mews" in lower:
        return "Terraced"

    log.debug("Unknown property type: %r — mapping to Other", raw_type)
    return "Other"


def transform_property(
    prop: dict,
    pc_index: PostcodeSpatialIndex,
) -> dict | None:
    """Transform a raw home.co.uk property dict into our output schema.

    Returns ``None`` (listing skipped) when coordinates are missing,
    unparseable or outside the accepted bounding box, when the price is
    missing, unparseable or not positive, or when no postcode can be found.
    Malformed numeric fields never raise, so one bad listing cannot abort a
    whole search.
    """
    lat = _as_number(prop.get("latitude"))
    lng = _as_number(prop.get("longitude"))
    if lat is None or lng is None:
        return None

    # Coarse bounding-box sanity check on the coordinates.
    if not (49 <= lat <= 56 and -7 <= lng <= 2):
        log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
        return None

    raw_price = prop.get("price") or prop.get("latest_price")
    if not raw_price:
        return None
    try:
        price = int(raw_price)
    except (TypeError, ValueError, OverflowError):
        log.debug("Unparseable price %r — skipping", raw_price)
        return None
    if price <= 0:
        return None

    # Home.co.uk provides postcodes directly, but fall back to spatial index
    postcode = prop.get("postcode")
    if not postcode:
        postcode = pc_index.nearest(lat, lng)
    if not postcode:
        log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
        return None

    listing_id = prop.get("listing_id") or prop.get("property_id") or ""

    raw_beds = _as_number(prop.get("bedrooms")) or 0
    raw_baths = _as_number(prop.get("bathrooms")) or 0
    # Counts above MAX_BEDROOMS are treated as bad data and reported as 0.
    bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
    bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
    if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
        log.warning(
            "home.co.uk %s: implausible beds=%d baths=%d (capped to 0)",
            listing_id or "?",
            raw_beds,
            raw_baths,
        )

    listing_type = prop.get("listing_property_type") or prop.get("property_type") or ""
    address = prop.get("display_address") or prop.get("address") or ""

    # Derive price qualifier from reduction info
    price_qualifier = ""
    if prop.get("is_reduced"):
        pct = prop.get("reduction_percent", 0)
        price_qualifier = f"Reduced by {pct}%" if pct else "Reduced"

    return {
        "id": f"hk_{listing_id}",  # prefix to avoid collision with Rightmove IDs
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        # NOTE(review): this sums bedrooms and bathrooms although the field
        # is labelled "living rooms" — confirm this is the intended proxy.
        "Number of bedrooms & living rooms": bedrooms + bathrooms,
        "lon": lng,
        "lat": lat,
        "Postcode": normalize_postcode(postcode),
        "Address per Property Register": address,
        "Leasehold/Freehold": parse_tenure(prop),
        "Property type": map_property_type(listing_type),
        "Property sub-type": normalize_sub_type(listing_type),
        "price": price,
        "price_frequency": "",
        "Price qualifier": price_qualifier,
        "Total floor area (sqm)": parse_floor_area(prop.get("description")),
        "Listing URL": f"{HOMECOUK_BASE}/property/{listing_id}",
        "Listing features": [],  # not available from home.co.uk
        "first_visible_date": prop.get("added_date") or "",
    }


def search_outcode(
    client: Session,
    outcode: str,
    pc_index: PostcodeSpatialIndex,
    max_properties: int | None = None,
) -> list[dict]:
    """Paginate through sale search results for one outcode.

    Args:
        client: session used for the requests; its ``referer`` header is
            overwritten on every page.
        outcode: outcode to search; lower-cased for the URL.
        pc_index: spatial index used as a postcode fallback.
        max_properties: stop as soon as this many listings were collected;
            ``None`` means no limit, and a value <= 0 returns ``[]`` without
            making any request.

    Returns:
        The transformed listings gathered so far. Pagination stops on a
        failed page, an empty page, or once the reported last page is reached.

    Raises:
        CookiesExpiredError: propagated from ``fetch_page`` on HTTP 403.
    """
    properties: list[dict] = []
    if max_properties is not None and max_properties <= 0:
        return properties

    url_segment = HOMECOUK_URL_SEGMENT
    outcode_slug = outcode.lower()
    url = f"{HOMECOUK_API_BASE}/{url_segment}/{outcode_slug}/"

    page = 1
    while True:
        params = {
            "page": str(page),
            "sort": "date_desc",
            "per_page": str(HOMECOUK_PER_PAGE),
        }
        # Set referer to match the page URL pattern
        client.headers["referer"] = (
            f"https://home.co.uk/{url_segment}/{outcode_slug}/"
            f"?page={page}&sort=date_desc&per_page={HOMECOUK_PER_PAGE}"
        )

        data = fetch_page(client, url, params)
        if not data:
            break

        # "or" also covers keys that are present but null in the payload.
        raw_props = data.get("properties") or []
        if not raw_props:
            break

        for prop in raw_props:
            transformed = transform_property(prop, pc_index)
            if transformed:
                properties.append(transformed)
                if max_properties is not None and len(properties) >= max_properties:
                    return properties

        # Check pagination
        pagination = data.get("pagination") or {}
        last_page = _as_number(pagination.get("last_page")) or 1
        if page >= last_page:
            break

        page += 1
        time.sleep(DELAY_BETWEEN_PAGES)

    return properties