This commit is contained in:
Andras Schmelczer 2026-03-15 21:22:28 +00:00
parent 479ef92236
commit c38d654ac7
44 changed files with 2526 additions and 701 deletions

View file

@ -16,9 +16,21 @@ SCHEDULE_HOUR = int(os.environ.get("SCHEDULE_HOUR", "3"))
# Whether to run a scrape immediately on startup
RUN_ON_STARTUP = os.environ.get("RUN_ON_STARTUP", "").lower() in ("1", "true", "yes")
# Enable/disable individual sources
SCRAPE_RIGHTMOVE = os.environ.get("SCRAPE_RIGHTMOVE", "true").lower() in ("1", "true", "yes")
SCRAPE_HOMECOUK = os.environ.get("SCRAPE_HOMECOUK", "true").lower() in ("1", "true", "yes")
SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in ("1", "true", "yes")
SCRAPE_RIGHTMOVE = os.environ.get("SCRAPE_RIGHTMOVE", "true").lower() in (
"1",
"true",
"yes",
)
SCRAPE_HOMECOUK = os.environ.get("SCRAPE_HOMECOUK", "true").lower() in (
"1",
"true",
"yes",
)
SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in (
"1",
"true",
"yes",
)
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"

View file

@ -86,7 +86,8 @@ def solve_cloudflare() -> tuple[dict[str, str], str] | None:
log.info(
"Cloudflare solved — got %d cookies, UA: %s",
len(cookies), user_agent[:60],
len(cookies),
user_agent[:60],
)
flaresolverr_attempts_total.labels(result="success").inc()
return cookies, user_agent
@ -129,11 +130,13 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session:
Uses Chrome TLS impersonation so cf_clearance cookies (which are bound
to Chrome's JA3 fingerprint from FlareSolverr) remain valid."""
session = Session(impersonate="chrome")
session.headers.update({
"User-Agent": user_agent,
"Accept": "application/json, text/plain, */*",
"x-requested-with": "XMLHttpRequest",
})
session.headers.update(
{
"User-Agent": user_agent,
"Accept": "application/json, text/plain, */*",
"x-requested-with": "XMLHttpRequest",
}
)
# Laravel CSRF: the XSRF-TOKEN cookie value must also be sent as the
# X-XSRF-TOKEN request header (URL-decoded). Without this header, the
# server rejects every request with 419/403.
@ -165,7 +168,11 @@ def fetch_page(
return resp.json()
except json.JSONDecodeError:
homecouk_errors_total.labels(type="json_decode").inc()
log.error("Non-JSON response from %s (got %s)", url, resp.headers.get("content-type", "?"))
log.error(
"Non-JSON response from %s (got %s)",
url,
resp.headers.get("content-type", "?"),
)
return None
if resp.status_code == 403:
raise CookiesExpiredError("HTTP 403 — cookies likely expired")
@ -173,7 +180,11 @@ def fetch_page(
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
log.warning(
"HTTP %d from %s, retry %d/%d in %.1fs",
resp.status_code, url, attempt + 1, max_retries, delay,
resp.status_code,
url,
attempt + 1,
max_retries,
delay,
)
time.sleep(delay)
continue
@ -186,7 +197,11 @@ def fetch_page(
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
log.warning(
"%s from %s, retry %d/%d in %.1fs",
type(e).__name__, url, attempt + 1, max_retries, delay,
type(e).__name__,
url,
attempt + 1,
max_retries,
delay,
)
time.sleep(delay)
homecouk_errors_total.labels(type="retry_exhausted").inc()
@ -218,7 +233,12 @@ def map_property_type(raw_type: str | None) -> str:
# Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc.
# Try common patterns
lower = raw_type.lower()
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
if (
"flat" in lower
or "apartment" in lower
or "maisonette" in lower
or "studio" in lower
):
return "Flats/Maisonettes"
if "detached" in lower and "semi" not in lower:
return "Detached"
@ -231,7 +251,9 @@ def map_property_type(raw_type: str | None) -> str:
def transform_property(
prop: dict, channel: str, pc_index: PostcodeSpatialIndex,
prop: dict,
channel: str,
pc_index: PostcodeSpatialIndex,
) -> dict | None:
"""Transform a raw home.co.uk property dict into our output schema."""
lat = prop.get("latitude")

View file

@ -11,7 +11,9 @@ from metrics import http_errors_total, http_requests_total, ip_rotations_total
log = logging.getLogger("rightmove")
_ua = UserAgent(browsers=["Chrome", "Edge"], os=["Windows", "Mac OS X"], min_version=120.0)
_ua = UserAgent(
browsers=["Chrome", "Edge"], os=["Windows", "Mac OS X"], min_version=120.0
)
def _endpoint_label(url: str) -> str:
@ -27,6 +29,7 @@ def _status_label(code: int) -> str:
return "5xx"
return str(code)
# Gluetun control API — runs on port 8000 inside the gluetun container.
# Since finder uses network_mode: service:gluetun, localhost IS gluetun.
GLUETUN_API = "http://127.0.0.1:8000"
@ -42,17 +45,25 @@ def rotate_ip() -> bool:
# Get current IP
with httpx.Client(timeout=10) as ctl:
old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
old_ip = old_ip_resp.json().get("public_ip", "unknown") if old_ip_resp.status_code == 200 else "unknown"
old_ip = (
old_ip_resp.json().get("public_ip", "unknown")
if old_ip_resp.status_code == 200
else "unknown"
)
log.info("Current IP: %s", old_ip)
# Trigger server change — PUT with empty JSON body picks a random server
resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"})
resp = ctl.put(
f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"}
)
if resp.status_code != 200:
log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text)
return False
time.sleep(2)
resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"})
resp = ctl.put(
f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"}
)
if resp.status_code != 200:
log.error("Failed to start VPN: %d %s", resp.status_code, resp.text)
return False
@ -99,7 +110,9 @@ def fetch_with_retry(
for attempt in range(MAX_RETRIES):
try:
resp = client.get(url, params=params)
http_requests_total.labels(status=_status_label(resp.status_code), endpoint=endpoint).inc()
http_requests_total.labels(
status=_status_label(resp.status_code), endpoint=endpoint
).inc()
if resp.status_code == 200:
return resp.json()
if resp.status_code == 403 and on_403:
@ -111,15 +124,34 @@ def fetch_with_retry(
return None
if resp.status_code in (429, 500, 502, 503, 504):
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
log.warning("HTTP %d from %s, retry %d/%d in %.1fs", resp.status_code, url, attempt + 1, MAX_RETRIES, delay)
log.warning(
"HTTP %d from %s, retry %d/%d in %.1fs",
resp.status_code,
url,
attempt + 1,
MAX_RETRIES,
delay,
)
time.sleep(delay)
continue
log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
return None
except (httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout, httpx.PoolTimeout) as e:
except (
httpx.ConnectError,
httpx.ReadTimeout,
httpx.WriteTimeout,
httpx.PoolTimeout,
) as e:
http_errors_total.labels(type=type(e).__name__).inc()
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
log.warning("%s from %s, retry %d/%d in %.1fs", type(e).__name__, url, attempt + 1, MAX_RETRIES, delay)
log.warning(
"%s from %s, retry %d/%d in %.1fs",
type(e).__name__,
url,
attempt + 1,
MAX_RETRIES,
delay,
)
time.sleep(delay)
http_errors_total.labels(type="retry_exhausted").inc()
log.error("All %d retries exhausted for %s", MAX_RETRIES, url)

View file

@ -7,7 +7,14 @@ from pathlib import Path
from flask import Flask, Response, jsonify, send_from_directory
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE
from constants import (
DATA_DIR,
RUN_ON_STARTUP,
SCHEDULE_HOUR,
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
SCRAPE_RIGHTMOVE,
)
from homecouk import load_cookies as load_homecouk_cookies
from openrent import load_cookies as load_openrent_cookies
from rightmove import outcode_cache
@ -49,8 +56,13 @@ log.info("Loading arcgis data...")
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()
PC_COORDS = build_postcode_coords() if SCRAPE_OPENRENT else None
log.info("Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)",
len(OUTCODES), SCRAPE_RIGHTMOVE, SCRAPE_HOMECOUK, SCRAPE_OPENRENT)
log.info(
"Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)",
len(OUTCODES),
SCRAPE_RIGHTMOVE,
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
)
# ---------------------------------------------------------------------------
# Scheduler
@ -63,7 +75,9 @@ def _start_scrape() -> bool:
if status.state == "running":
return False
status.state = "running"
thread = threading.Thread(target=run_scrape, args=(OUTCODES, PC_INDEX, PC_COORDS), daemon=True)
thread = threading.Thread(
target=run_scrape, args=(OUTCODES, PC_INDEX, PC_COORDS), daemon=True
)
thread.start()
return True
@ -82,7 +96,9 @@ def _scheduler_loop() -> None:
log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR)
while True:
wait = _seconds_until(SCHEDULE_HOUR)
log.info("Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600)
log.info(
"Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600
)
time.sleep(wait)
log.info("Scheduled scrape triggered")
if not _start_scrape():
@ -144,15 +160,17 @@ def get_status():
def get_debug():
hk_cookies = load_homecouk_cookies() if SCRAPE_HOMECOUK else None
or_cookies = load_openrent_cookies() if SCRAPE_OPENRENT else None
return jsonify({
"outcode_cache_size": len(outcode_cache),
"outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
"scrape_rightmove": SCRAPE_RIGHTMOVE,
"scrape_homecouk": SCRAPE_HOMECOUK,
"scrape_openrent": SCRAPE_OPENRENT,
"homecouk_cookies_available": hk_cookies is not None,
"openrent_cookies_available": or_cookies is not None,
})
return jsonify(
{
"outcode_cache_size": len(outcode_cache),
"outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
"scrape_rightmove": SCRAPE_RIGHTMOVE,
"scrape_homecouk": SCRAPE_HOMECOUK,
"scrape_openrent": SCRAPE_OPENRENT,
"homecouk_cookies_available": hk_cookies is not None,
"openrent_cookies_available": or_cookies is not None,
}
)
@app.route("/metrics")

View file

@ -79,7 +79,8 @@ def solve_waf() -> tuple[dict[str, str], str] | None:
if "AwsWafIntegration" in content:
log.info("Got WAF challenge page, waiting for resolution...")
page.wait_for_selector(
"a.pli, .pli, .search-property-card", timeout=30000,
"a.pli, .pli, .search-property-card",
timeout=30000,
)
raw_cookies = context.cookies()
@ -94,7 +95,8 @@ def solve_waf() -> tuple[dict[str, str], str] | None:
log.info(
"AWS WAF solved — got %d cookies, UA: %s",
len(cookies), user_agent[:60],
len(cookies),
user_agent[:60],
)
flaresolverr_attempts_total.labels(result="success").inc()
return cookies, user_agent
@ -130,11 +132,13 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session:
"""Create a curl_cffi Session configured for OpenRent.
Uses Chrome TLS impersonation so AWS WAF cookies remain valid."""
session = Session(impersonate="chrome")
session.headers.update({
"User-Agent": user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-GB,en;q=0.9",
})
session.headers.update(
{
"User-Agent": user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-GB,en;q=0.9",
}
)
for name, value in cookies.items():
session.cookies.set(name, value, domain="openrent.co.uk")
return session
@ -152,7 +156,9 @@ def _status_label(code: int) -> str:
def fetch_page(
client: Session, url: str, max_retries: int = 3,
client: Session,
url: str,
max_retries: int = 3,
) -> str | None:
"""GET HTML with retries on 429/5xx. Returns None on permanent failure.
WAF challenge (202 or 403 with challenge JS) raises WafChallengeError."""
@ -165,17 +171,25 @@ def fetch_page(
html = resp.text
# Detect WAF challenge page masquerading as 200
if "AwsWafIntegration" in html and "challenge.js" in html:
raise WafChallengeError("Got AWS WAF challenge page — cookies expired")
raise WafChallengeError(
"Got AWS WAF challenge page — cookies expired"
)
return html
if resp.status_code in (202, 403):
raise WafChallengeError(f"HTTP {resp.status_code} — cookies likely expired")
raise WafChallengeError(
f"HTTP {resp.status_code} — cookies likely expired"
)
if resp.status_code in (429, 500, 502, 503, 504):
delay = RETRY_BASE_DELAY * (2 ** attempt)
delay = RETRY_BASE_DELAY * (2**attempt)
log.warning(
"HTTP %d from %s, retry %d/%d in %.1fs",
resp.status_code, url, attempt + 1, max_retries, delay,
resp.status_code,
url,
attempt + 1,
max_retries,
delay,
)
time.sleep(delay)
continue
@ -187,10 +201,14 @@ def fetch_page(
raise
except RequestsError as e:
openrent_errors_total.labels(type=type(e).__name__).inc()
delay = RETRY_BASE_DELAY * (2 ** attempt)
delay = RETRY_BASE_DELAY * (2**attempt)
log.warning(
"%s from %s, retry %d/%d in %.1fs",
type(e).__name__, url, attempt + 1, max_retries, delay,
type(e).__name__,
url,
attempt + 1,
max_retries,
delay,
)
time.sleep(delay)
@ -247,7 +265,9 @@ def _extract_bedrooms_from_title(title: str) -> int | None:
return None
def _extract_beds_baths_from_features(feature_items: list) -> tuple[int | None, int | None]:
def _extract_beds_baths_from_features(
feature_items: list,
) -> tuple[int | None, int | None]:
"""Extract bedrooms and bathrooms from feature list items.
OpenRent search cards have <ul> with items like:
@ -442,11 +462,7 @@ def parse_search_results(html: str) -> list[dict]:
# --- Coordinates from data attributes (may not be present on cards) ---
for el in [card] + card.select("[data-lat], [data-latitude]"):
lat = el.get("data-lat") or el.get("data-latitude")
lng = (
el.get("data-lng")
or el.get("data-longitude")
or el.get("data-lon")
)
lng = el.get("data-lng") or el.get("data-longitude") or el.get("data-lon")
if lat and lng:
try:
prop["lat"] = float(lat)
@ -543,9 +559,7 @@ def parse_property_detail(html: str) -> dict:
break
# --- Description for floor area ---
desc_el = soup.select_one(
".description, [class*='description'], #description"
)
desc_el = soup.select_one(".description, [class*='description'], #description")
if desc_el:
details["description"] = desc_el.get_text(strip=True)
@ -567,7 +581,12 @@ def map_property_type(raw_type: str | None) -> str:
lower = raw_type.lower()
if "room" in lower or "shared" in lower:
return "Other"
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
if (
"flat" in lower
or "apartment" in lower
or "maisonette" in lower
or "studio" in lower
):
return "Flats/Maisonettes"
if "detached" in lower and "semi" not in lower:
return "Detached"
@ -647,7 +666,8 @@ def transform_property(
elif search_data.get("outcode"):
# No spatial index — try outcode lookup as fallback
outcode_pcs = _resolve_outcode_postcodes(
search_data["outcode"], pc_coords,
search_data["outcode"],
pc_coords,
)
if outcode_pcs:
postcode = outcode_pcs[0]
@ -708,7 +728,8 @@ def transform_property(
prop_id = search_data.get("id", "")
listing_url = search_data.get(
"url", f"{OPENRENT_BASE}/{prop_id}" if prop_id else "",
"url",
f"{OPENRENT_BASE}/{prop_id}" if prop_id else "",
)
description = detail.get("description") or search_data.get("description", "")
@ -775,7 +796,10 @@ def search_outcode(
time.sleep(DELAY_BETWEEN_PAGES * 0.5)
transformed = transform_property(
search_data, detail_data, pc_index, pc_coords,
search_data,
detail_data,
pc_index,
pc_coords,
)
if transformed:
properties.append(transformed)

View file

@ -24,7 +24,9 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
if outcode in outcode_cache:
return outcode_cache[outcode]
data = fetch_with_retry(client, TYPEAHEAD_URL, {"query": outcode, "limit": "10", "exclude": "STREET"})
data = fetch_with_retry(
client, TYPEAHEAD_URL, {"query": outcode, "limit": "10", "exclude": "STREET"}
)
if not data:
return None
@ -61,7 +63,12 @@ def search_outcode(
data = fetch_with_retry(client, SEARCH_URL, params)
if not data:
log.warning("Failed to fetch index %d for %s/%s", index, outcode, channel_cfg["channel"])
log.warning(
"Failed to fetch index %d for %s/%s",
index,
outcode,
channel_cfg["channel"],
)
break
raw_props = data.get("properties", [])

View file

@ -6,7 +6,16 @@ from dataclasses import dataclass, field
import polars as pl
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE, SEED
from constants import (
ARCGIS_PATH,
CHANNELS,
DATA_DIR,
DELAY_BETWEEN_OUTCODES,
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
SCRAPE_RIGHTMOVE,
SEED,
)
from homecouk import CookiesExpiredError
from homecouk import load_cookies as load_homecouk_cookies
from homecouk import make_client as make_homecouk_client
@ -64,13 +73,23 @@ def _sync_gauges() -> None:
scrape_outcodes_done.set(status.outcodes_done)
scrape_outcodes_total.set(status.outcodes_total)
# Total properties (both sources combined)
scrape_properties_total.labels(channel="buy", source="total").set(status.properties_buy)
scrape_properties_total.labels(channel="rent", source="total").set(status.properties_rent)
scrape_properties_total.labels(channel="buy", source="total").set(
status.properties_buy
)
scrape_properties_total.labels(channel="rent", source="total").set(
status.properties_rent
)
# Per-source breakdown for current channel
ch = "buy" if status.channel == "BUY" else "rent"
scrape_properties_total.labels(channel=ch, source="rightmove").set(status.rm_properties)
scrape_properties_total.labels(channel=ch, source="homecouk").set(status.hk_properties)
scrape_properties_total.labels(channel=ch, source="openrent").set(status.or_properties)
scrape_properties_total.labels(channel=ch, source="rightmove").set(
status.rm_properties
)
scrape_properties_total.labels(channel=ch, source="homecouk").set(
status.hk_properties
)
scrape_properties_total.labels(channel=ch, source="openrent").set(
status.or_properties
)
if status.started_at:
end = status.finished_at if status.finished_at else time.time()
scrape_elapsed_seconds.set(end - status.started_at)
@ -86,7 +105,9 @@ def load_outcodes() -> list[str]:
log.info("England postcodes: %d", len(england))
outcodes = (
england.select(pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode"))
england.select(
pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode")
)
.drop_nulls()
.get_column("outcode")
.unique()
@ -101,7 +122,9 @@ def build_postcode_index() -> PostcodeSpatialIndex:
"""Build spatial index from arcgis England postcodes."""
log.info("Building postcode spatial index from %s", ARCGIS_PATH)
df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(
subset=["lat", "long"]
)
return PostcodeSpatialIndex(
england.get_column("lat").to_list(),
england.get_column("long").to_list(),
@ -114,7 +137,9 @@ def build_postcode_coords() -> dict[str, tuple[float, float]]:
Used by OpenRent scraper to resolve coordinates from postcodes."""
log.info("Building postcode coords lookup from %s", ARCGIS_PATH)
df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(
subset=["lat", "long"]
)
coords: dict[str, tuple[float, float]] = {}
for pcd, lat, lng in zip(
england.get_column("pcd").to_list(),
@ -179,7 +204,9 @@ def run_scrape(
log.info("home.co.uk scraping ENABLED")
homecouk_enabled.set(1)
else:
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
log.info(
"home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)"
)
homecouk_enabled.set(0)
# OpenRent: must be enabled via SCRAPE_OPENRENT + cookies available
@ -195,7 +222,9 @@ def run_scrape(
log.info("OpenRent scraping ENABLED")
openrent_enabled.set(1)
else:
log.info("OpenRent scraping DISABLED (need FlareSolverr or OPENRENT_WAF_TOKEN)")
log.info(
"OpenRent scraping DISABLED (need FlareSolverr or OPENRENT_WAF_TOKEN)"
)
openrent_enabled.set(0)
# Build postcode coords if OpenRent is active and caller didn't provide them
@ -207,7 +236,9 @@ def run_scrape(
channel_name = channel_cfg["channel"]
file_suffix = "buy" if channel_name == "BUY" else "rent"
all_properties: dict[str, dict] = {} # dedup by id
seen_dedup_keys: set[tuple] = set() # cross-source dedup by (postcode, beds, price)
seen_dedup_keys: set[tuple] = (
set()
) # cross-source dedup by (postcode, beds, price)
rm_count = 0 # Rightmove properties this channel
hk_count = 0 # home.co.uk properties this channel
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
@ -222,15 +253,22 @@ def run_scrape(
status.hk_properties = 0
status.or_properties = 0
log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))
log.info(
"=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled)
)
for i, outcode in enumerate(shuffled):
with status_lock:
status.outcode = outcode
status.outcodes_done = i
log.debug("Outcode %s (%d/%d) — %d properties so far",
outcode, i + 1, len(shuffled), len(all_properties))
log.debug(
"Outcode %s (%d/%d) — %d properties so far",
outcode,
i + 1,
len(shuffled),
len(all_properties),
)
made_requests = False
@ -240,9 +278,13 @@ def run_scrape(
try:
outcode_id = resolve_outcode_id(client, outcode)
if not outcode_id:
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
log.debug(
"No Rightmove ID for outcode %s, skipping", outcode
)
else:
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
props = search_outcode(
client, outcode_id, outcode, channel_cfg, pc_index
)
for p in props:
pid = p["id"]
if pid not in all_properties:
@ -261,7 +303,10 @@ def run_scrape(
made_requests = True
try:
hk_props = homecouk_search_outcode(
hk_client, outcode, channel_name, pc_index,
hk_client,
outcode,
channel_name,
pc_index,
)
for p in hk_props:
pid = p["id"]
@ -276,9 +321,13 @@ def run_scrape(
seen_dedup_keys.add(key)
hk_count += 1
if hk_props:
log.info("home.co.uk %s: +%d properties", outcode, len(hk_props))
log.info(
"home.co.uk %s: +%d properties", outcode, len(hk_props)
)
except CookiesExpiredError:
log.warning("home.co.uk cookies expired — attempting refresh via FlareSolverr")
log.warning(
"home.co.uk cookies expired — attempting refresh via FlareSolverr"
)
hk_client.close()
hk_result = load_homecouk_cookies()
if hk_result:
@ -286,13 +335,17 @@ def run_scrape(
log.info("home.co.uk cookies refreshed, continuing")
cookie_refreshes_total.labels(result="success").inc()
else:
log.warning("Cookie refresh failed, disabling home.co.uk for rest of scrape")
log.warning(
"Cookie refresh failed, disabling home.co.uk for rest of scrape"
)
hk_client = None
hk_failed = True
homecouk_enabled.set(0)
cookie_refreshes_total.labels(result="failure").inc()
with status_lock:
status.errors.append("home.co.uk cookies expired and refresh failed")
status.errors.append(
"home.co.uk cookies expired and refresh failed"
)
except Exception as e:
msg = f"Error scraping home.co.uk {outcode}/{channel_name}: {e}"
log.error(msg)
@ -305,7 +358,10 @@ def run_scrape(
made_requests = True
try:
or_props = openrent_search_outcode(
or_client, outcode, pc_index, pc_coords,
or_client,
outcode,
pc_index,
pc_coords,
)
for p in or_props:
pid = p["id"]
@ -318,9 +374,13 @@ def run_scrape(
seen_dedup_keys.add(key)
or_count += 1
if or_props:
log.info("OpenRent %s: +%d properties", outcode, len(or_props))
log.info(
"OpenRent %s: +%d properties", outcode, len(or_props)
)
except WafChallengeError:
log.warning("OpenRent WAF cookies expired — attempting refresh via FlareSolverr")
log.warning(
"OpenRent WAF cookies expired — attempting refresh via FlareSolverr"
)
or_client.close()
or_result = load_openrent_cookies()
if or_result:
@ -328,13 +388,17 @@ def run_scrape(
log.info("OpenRent cookies refreshed, continuing")
cookie_refreshes_total.labels(result="success").inc()
else:
log.warning("Cookie refresh failed, disabling OpenRent for rest of scrape")
log.warning(
"Cookie refresh failed, disabling OpenRent for rest of scrape"
)
or_client = None
or_failed = True
openrent_enabled.set(0)
cookie_refreshes_total.labels(result="failure").inc()
with status_lock:
status.errors.append("OpenRent WAF cookies expired and refresh failed")
status.errors.append(
"OpenRent WAF cookies expired and refresh failed"
)
except Exception as e:
msg = f"Error scraping OpenRent {outcode}/{channel_name}: {e}"
log.error(msg)
@ -352,8 +416,14 @@ def run_scrape(
status.or_properties = or_count
_sync_gauges()
log.info("Outcode %s: total %d (rm: %d, hk: %d, or: %d)",
outcode, len(all_properties), rm_count, hk_count, or_count)
log.info(
"Outcode %s: total %d (rm: %d, hk: %d, or: %d)",
outcode,
len(all_properties),
rm_count,
hk_count,
or_count,
)
if made_requests and i < len(shuffled) - 1:
time.sleep(DELAY_BETWEEN_OUTCODES)
@ -373,7 +443,11 @@ def run_scrape(
log.info(
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===",
channel_name, len(deduped), rm_count, hk_count, or_count,
channel_name,
len(deduped),
rm_count,
hk_count,
or_count,
hk_dedup_count + or_dedup_count,
)
@ -382,8 +456,12 @@ def run_scrape(
status.finished_at = time.time()
_sync_gauges()
elapsed = status.finished_at - status.started_at
log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
elapsed, status.properties_buy, status.properties_rent)
log.info(
"Scrape complete in %.0fs — buy: %d, rent: %d",
elapsed,
status.properties_buy,
status.properties_rent,
)
except Exception as e:
log.exception("Fatal scrape error")

View file

@ -11,12 +11,16 @@ class PostcodeSpatialIndex:
"""Grid-based spatial index over arcgis postcodes for nearest-lookup."""
def __init__(self, lats: list[float], lngs: list[float], postcodes: list[str]):
self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(list)
self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(
list
)
for lat, lng, pcd in zip(lats, lngs, postcodes):
gx = int(math.floor(lng / GRID_CELL_SIZE))
gy = int(math.floor(lat / GRID_CELL_SIZE))
self.grid[(gx, gy)].append((lat, lng, pcd))
log.info("Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats))
log.info(
"Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats)
)
def nearest(self, lat: float, lng: float) -> str | None:
gx = int(math.floor(lng / GRID_CELL_SIZE))

View file

@ -60,9 +60,7 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
"Property type": [p["Property type"] for p in properties],
"Property sub-type": [p["Property sub-type"] for p in properties],
"Price qualifier": [p["Price qualifier"] for p in properties],
"Total floor area (sqm)": [
p["Total floor area (sqm)"] for p in properties
],
"Total floor area (sqm)": [p["Total floor area (sqm)"] for p in properties],
"Listing URL": [p["Listing URL"] for p in properties],
"Listing features": [p["Listing features"] for p in properties],
"Listing date": listing_dates,

View file

@ -51,9 +51,19 @@ def fix_coords(lat: float, lng: float) -> tuple[float, float]:
if 49 <= lat <= 56 and -7 <= lng <= 2:
return lat, lng
if 49 <= lng <= 56 and -7 <= lat <= 2:
log.debug("Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f", lat, lng, lng, lat)
log.debug(
"Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f",
lat,
lng,
lng,
lat,
)
return lng, lat
log.warning("Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f", lat, lng)
log.warning(
"Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f",
lat,
lng,
)
return lat, lng
@ -66,7 +76,9 @@ def normalize_price(amount: int, frequency: str) -> int:
return amount
def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex) -> dict | None:
def transform_property(
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
) -> dict | None:
"""Transform a raw Rightmove property dict into our output schema."""
loc = prop.get("location")
if not loc:
@ -86,13 +98,19 @@ def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex)
price = normalize_price(int(amount), frequency)
display_prices = price_obj.get("displayPrices", [])
price_qualifier = display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
price_qualifier = (
display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
)
sub_type = prop.get("propertySubType", "")
bedrooms = prop.get("bedrooms", 0) or 0
bathrooms = prop.get("bathrooms", 0) or 0
key_features = [kf.get("description", "") for kf in prop.get("keyFeatures", []) if kf.get("description")]
key_features = [
kf.get("description", "")
for kf in prop.get("keyFeatures", [])
if kf.get("description")
]
postcode = pc_index.nearest(lat, lng)
if not postcode: