Faster scraping

This commit is contained in:
Andras Schmelczer 2026-03-20 07:52:22 +00:00
parent 05b8ee06c1
commit 852bb3f3a7
4 changed files with 437 additions and 324 deletions

View file

@ -146,6 +146,12 @@ services:
# networks: # networks:
# - dev-network # - dev-network
# restart: unless-stopped # restart: unless-stopped
# healthcheck:
# test: ["CMD", "curl", "-f", "http://localhost:8191/health"]
# interval: 30s
# timeout: 5s
# retries: 3
# start_period: 30s
# finder: # finder:
# build: # build:
@ -161,8 +167,14 @@ services:
# gluetun: # gluetun:
# condition: service_healthy # condition: service_healthy
# flaresolverr: # flaresolverr:
# condition: service_started # condition: service_healthy
# restart: unless-stopped # restart: unless-stopped
# healthcheck:
# test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:1234/health')"]
# interval: 30s
# timeout: 5s
# retries: 3
# start_period: 60s
volumes: volumes:

View file

@ -4,8 +4,8 @@ from pathlib import Path
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet") ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
DATA_DIR = Path("/app/data") DATA_DIR = Path("/app/data")
PAGE_SIZE = 24 PAGE_SIZE = 24
DELAY_BETWEEN_PAGES = 1.0 DELAY_BETWEEN_PAGES = 0.5
DELAY_BETWEEN_OUTCODES = 2.0 DELAY_BETWEEN_OUTCODES = 1.0
MAX_RETRIES = 3 MAX_RETRIES = 3
RETRY_BASE_DELAY = 2.0 RETRY_BASE_DELAY = 2.0
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
@ -67,6 +67,7 @@ PROPERTY_TYPE_MAP = {
"Apartment": "Flats/Maisonettes", "Apartment": "Flats/Maisonettes",
"Penthouse": "Flats/Maisonettes", "Penthouse": "Flats/Maisonettes",
"Ground Flat": "Flats/Maisonettes", "Ground Flat": "Flats/Maisonettes",
"Duplex": "Flats/Maisonettes",
"Detached Bungalow": "Detached", "Detached Bungalow": "Detached",
"Semi-Detached Bungalow": "Semi-Detached", "Semi-Detached Bungalow": "Semi-Detached",
"Town House": "Terraced", "Town House": "Terraced",
@ -75,9 +76,15 @@ PROPERTY_TYPE_MAP = {
"Bungalow": "Other", "Bungalow": "Other",
"Cottage": "Other", "Cottage": "Other",
"Park Home": "Other", "Park Home": "Other",
"Mobile Home": "Other",
"Caravan": "Other",
"Lodge": "Other",
"Land": "Other", "Land": "Other",
"Farm / Barn": "Other", "Farm / Barn": "Other",
"Farm House": "Other",
"House": "Detached", "House": "Detached",
"House of Multiple Occupation": "Flats/Maisonettes",
"House Share": "Other",
"Not Specified": "Other", "Not Specified": "Other",
"Chalet": "Other", "Chalet": "Other",
"Barn Conversion": "Other", "Barn Conversion": "Other",
@ -85,9 +92,20 @@ PROPERTY_TYPE_MAP = {
"Character Property": "Other", "Character Property": "Other",
"Cluster House": "Other", "Cluster House": "Other",
"Retirement Property": "Flats/Maisonettes", "Retirement Property": "Flats/Maisonettes",
"Parking": "Other",
"Plot": "Other", "Plot": "Other",
"Garages": "Other", "Garages": "Other",
"Mews": "Terraced", "Mews": "Terraced",
"Property": "Other",
# Lowercase variants (from home.co.uk / Rightmove APIs)
"house": "Detached",
"bungalow": "Other",
"townhouse": "Terraced",
"land": "Other",
"other": "Other",
"not-specified": "Other",
"retirement-property": "Flats/Maisonettes",
"equestrian-facility": "Other",
} }
CHANNELS = [ CHANNELS = [

View file

@ -788,7 +788,24 @@ def search_outcode(
for search_data in search_results: for search_data in search_results:
detail_data = None detail_data = None
if fetch_details and search_data.get("url"): # Skip detail page if we already have coordinates or a resolvable postcode
has_coords = (
search_data.get("lat") is not None
and search_data.get("lng") is not None
)
has_resolvable_pc = (
search_data.get("postcode")
and pc_coords
and search_data["postcode"] in pc_coords
)
needs_detail = (
fetch_details
and search_data.get("url")
and not has_coords
and not has_resolvable_pc
)
if needs_detail:
detail_html = fetch_page(client, search_data["url"]) detail_html = fetch_page(client, search_data["url"])
if detail_html: if detail_html:
detail_data = parse_property_detail(detail_html) detail_data = parse_property_detail(detail_html)

View file

@ -61,7 +61,7 @@ class ScrapeStatus:
outcodes_total: int = 0 outcodes_total: int = 0
properties_buy: int = 0 properties_buy: int = 0
properties_rent: int = 0 properties_rent: int = 0
# Per-source counts for current channel # Per-source counts (combined across channels)
rm_properties: int = 0 rm_properties: int = 0
hk_properties: int = 0 hk_properties: int = 0
or_properties: int = 0 or_properties: int = 0
@ -81,27 +81,26 @@ def _sync_gauges() -> None:
scrape_state.labels(state=state).set(1 if status.state == state else 0) scrape_state.labels(state=state).set(1 if status.state == state else 0)
scrape_outcodes_done.set(status.outcodes_done) scrape_outcodes_done.set(status.outcodes_done)
scrape_outcodes_total.set(status.outcodes_total) scrape_outcodes_total.set(status.outcodes_total)
# Total properties (both sources combined)
scrape_properties_total.labels(channel="buy", source="total").set( scrape_properties_total.labels(channel="buy", source="total").set(
status.properties_buy status.properties_buy
) )
scrape_properties_total.labels(channel="rent", source="total").set( scrape_properties_total.labels(channel="rent", source="total").set(
status.properties_rent status.properties_rent
) )
# Per-source breakdown for current channel # Per-source totals (across both channels)
ch = "buy" if status.channel == "BUY" else "rent" for ch in ("buy", "rent"):
scrape_properties_total.labels(channel=ch, source="rightmove").set( scrape_properties_total.labels(channel=ch, source="rightmove").set(
status.rm_properties status.rm_properties
) )
scrape_properties_total.labels(channel=ch, source="homecouk").set( scrape_properties_total.labels(channel=ch, source="homecouk").set(
status.hk_properties status.hk_properties
) )
scrape_properties_total.labels(channel=ch, source="openrent").set( scrape_properties_total.labels(channel=ch, source="openrent").set(
status.or_properties status.or_properties
) )
scrape_properties_total.labels(channel=ch, source="zoopla").set( scrape_properties_total.labels(channel=ch, source="zoopla").set(
status.zp_properties status.zp_properties
) )
if status.started_at: if status.started_at:
end = status.finished_at if status.finished_at else time.time() end = status.finished_at if status.finished_at else time.time()
scrape_elapsed_seconds.set(end - status.started_at) scrape_elapsed_seconds.set(end - status.started_at)
@ -179,28 +178,89 @@ def _dedup_key(p: dict) -> tuple:
return (p.get("Postcode", ""), p.get("Bedrooms", 0), p.get("price", 0)) return (p.get("Postcode", ""), p.get("Bedrooms", 0), p.get("price", 0))
class _Progress:
"""Thread-safe progress tracker for parallel source workers."""
def __init__(self):
self._counts: dict[str, int] = {}
self._lock = threading.Lock()
def update(self, source: str, done: int) -> None:
with self._lock:
self._counts[source] = done
def snapshot(self) -> dict[str, int]:
with self._lock:
return dict(self._counts)
def _merge_channel(
rm_props: list[dict],
hk_props: list[dict],
or_props: list[dict],
zp_props: list[dict],
) -> tuple[dict[str, dict], dict[str, int], int]:
"""Merge properties from all sources for one channel with cross-source dedup.
Rightmove has priority; other sources are checked for duplicates.
Returns (all_properties_by_id, per_source_counts, total_dedup_count).
"""
all_properties: dict[str, dict] = {}
seen_keys: set[tuple] = set()
counts = {"rm": 0, "hk": 0, "or": 0, "zp": 0}
total_dedup = 0
# Rightmove first (priority source)
for p in rm_props:
pid = p["id"]
if pid not in all_properties:
all_properties[pid] = p
seen_keys.add(_dedup_key(p))
counts["rm"] += 1
# Other sources (check for cross-source duplicates)
for source, props in [("hk", hk_props), ("or", or_props), ("zp", zp_props)]:
for p in props:
pid = p["id"]
key = _dedup_key(p)
if pid in all_properties or key in seen_keys:
total_dedup += 1
continue
all_properties[pid] = p
seen_keys.add(key)
counts[source] += 1
return all_properties, counts, total_dedup
def run_scrape( def run_scrape(
outcodes: list[str], outcodes: list[str],
pc_index: PostcodeSpatialIndex, pc_index: PostcodeSpatialIndex,
pc_coords: dict[str, tuple[float, float]] | None = None, pc_coords: dict[str, tuple[float, float]] | None = None,
) -> None: ) -> None:
"""Main scrape loop — runs in background thread. """Main scrape orchestrator — runs all sources in parallel threads.
Scrapes Rightmove, home.co.uk, and OpenRent, merging into one dataset."""
Each source (Rightmove, home.co.uk, OpenRent, Zoopla) gets its own thread
that iterates all outcodes for both BUY and RENT channels. Results are
merged with cross-source deduplication after all workers complete.
"""
global status global status
with status_lock: with status_lock:
status.state = "running" status.state = "running"
status.started_at = time.time() status.started_at = time.time()
status.finished_at = 0.0
status.errors = [] status.errors = []
status.properties_buy = 0 status.properties_buy = 0
status.properties_rent = 0 status.properties_rent = 0
status.channel = ""
status.outcode = ""
_sync_gauges() _sync_gauges()
# Shuffle for geographic diversity
shuffled = list(outcodes) shuffled = list(outcodes)
random.seed(SEED) random.seed(SEED)
random.shuffle(shuffled) random.shuffle(shuffled)
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT and not SCRAPE_ZOOPLA: if not any([SCRAPE_RIGHTMOVE, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_ZOOPLA]):
log.warning("All scrapers disabled — nothing to do") log.warning("All scrapers disabled — nothing to do")
with status_lock: with status_lock:
status.state = "done" status.state = "done"
@ -208,373 +268,387 @@ def run_scrape(
_sync_gauges() _sync_gauges()
return return
client = make_client() if SCRAPE_RIGHTMOVE else None
if not SCRAPE_RIGHTMOVE: if not SCRAPE_RIGHTMOVE:
log.info("Rightmove scraping DISABLED (SCRAPE_RIGHTMOVE=false)") log.info("Rightmove scraping DISABLED (SCRAPE_RIGHTMOVE=false)")
# home.co.uk: must be enabled via SCRAPE_HOMECOUK + cookies available
hk_client = None
hk_failed = False
if not SCRAPE_HOMECOUK: if not SCRAPE_HOMECOUK:
log.info("home.co.uk scraping DISABLED (SCRAPE_HOMECOUK=false)") log.info("home.co.uk scraping DISABLED (SCRAPE_HOMECOUK=false)")
homecouk_enabled.set(0) homecouk_enabled.set(0)
else:
hk_result = load_homecouk_cookies()
hk_client = make_homecouk_client(*hk_result) if hk_result else None
if hk_client:
log.info("home.co.uk scraping ENABLED")
homecouk_enabled.set(1)
else:
log.info(
"home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)"
)
homecouk_enabled.set(0)
# OpenRent: must be enabled via SCRAPE_OPENRENT + cookies available
or_client = None
or_failed = False
if not SCRAPE_OPENRENT: if not SCRAPE_OPENRENT:
log.info("OpenRent scraping DISABLED (SCRAPE_OPENRENT=false)") log.info("OpenRent scraping DISABLED (SCRAPE_OPENRENT=false)")
openrent_enabled.set(0) openrent_enabled.set(0)
else:
or_result = load_openrent_cookies()
or_client = make_openrent_client(*or_result) if or_result else None
if or_client:
log.info("OpenRent scraping ENABLED")
openrent_enabled.set(1)
else:
log.info(
"OpenRent scraping DISABLED (need FlareSolverr or OPENRENT_WAF_TOKEN)"
)
openrent_enabled.set(0)
# Zoopla: uses Camoufox browser (no cookies/client pattern)
zp_browser = None
zp_page = None
zp_failed = False
if not SCRAPE_ZOOPLA: if not SCRAPE_ZOOPLA:
log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)") log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)")
zoopla_enabled.set(0) zoopla_enabled.set(0)
else:
try:
zp_browser, zp_page = launch_zoopla_browser()
log.info("Zoopla scraping ENABLED (Camoufox browser launched)")
zoopla_enabled.set(1)
except TurnstileError:
log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla")
zoopla_enabled.set(0)
except Exception as e:
log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e)
zoopla_enabled.set(0)
# Build postcode coords if OpenRent/Zoopla is active and caller didn't provide them # Build postcode coords if needed for OpenRent/Zoopla
if (or_client or zp_page) and pc_coords is None: if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) and pc_coords is None:
pc_coords = build_postcode_coords() pc_coords = build_postcode_coords()
try: # Per-source result containers: {channel_name: [properties]}
for channel_cfg in CHANNELS: # Each list is only written by its owning source thread.
channel_name = channel_cfg["channel"] rm_results: dict[str, list] = {"BUY": [], "RENT": []}
file_suffix = "buy" if channel_name == "BUY" else "rent" hk_results: dict[str, list] = {"BUY": [], "RENT": []}
all_properties: dict[str, dict] = {} # dedup by id or_results: dict[str, list] = {"BUY": [], "RENT": []}
seen_dedup_keys: set[tuple] = ( zp_results: dict[str, list] = {"BUY": [], "RENT": []}
set()
) # cross-source dedup by (postcode, beds, price)
rm_count = 0 # Rightmove properties this channel
hk_count = 0 # home.co.uk properties this channel
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
or_count = 0 # OpenRent properties this channel
or_dedup_count = 0 # OpenRent skipped as cross-source duplicates
zp_count = 0 # Zoopla properties this channel
zp_dedup_count = 0 # Zoopla skipped as cross-source duplicates
with status_lock: progress = _Progress()
status.channel = channel_name
status.outcodes_done = 0
status.outcodes_total = len(shuffled)
status.rm_properties = 0
status.hk_properties = 0
status.or_properties = 0
status.zp_properties = 0
channel_start = time.time() # --- Source worker closures ---
prev_prop_milestone = 0 # last 10k milestone we logged # Each worker owns its client lifecycle and iterates all outcodes for both
log.info( # channels. On auth failure, it refreshes cookies and continues. On fatal
"=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled) # failure, it marks itself as done and returns partial results.
)
def rm_worker():
client = make_client()
try:
for i, outcode in enumerate(shuffled): for i, outcode in enumerate(shuffled):
with status_lock: try:
status.outcode = outcode outcode_id = resolve_outcode_id(client, outcode)
status.outcodes_done = i except Exception as e:
log.error("Rightmove %s ID lookup: %s", outcode, e)
scrape_errors_total.labels(source="rightmove").inc()
progress.update("rm", i + 1)
time.sleep(DELAY_BETWEEN_OUTCODES)
continue
made_requests = False if not outcode_id:
log.debug("No Rightmove ID for %s, skipping", outcode)
progress.update("rm", i + 1)
time.sleep(DELAY_BETWEEN_OUTCODES)
continue
# --- Rightmove --- for ch_cfg in CHANNELS:
if SCRAPE_RIGHTMOVE: ch = ch_cfg["channel"]
made_requests = True
try: try:
outcode_id = resolve_outcode_id(client, outcode) props = search_outcode(
if not outcode_id: client, outcode_id, outcode, ch_cfg, pc_index
log.debug(
"No Rightmove ID for outcode %s, skipping", outcode
)
else:
props = search_outcode(
client, outcode_id, outcode, channel_cfg, pc_index
)
for p in props:
pid = p["id"]
if pid not in all_properties:
all_properties[pid] = p
seen_dedup_keys.add(_dedup_key(p))
rm_count += 1
except Exception as e:
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
log.error(msg)
scrape_errors_total.labels(source="rightmove").inc()
with status_lock:
status.errors.append(msg)
# --- home.co.uk ---
if hk_client and not hk_failed:
made_requests = True
try:
hk_props = homecouk_search_outcode(
hk_client,
outcode,
channel_name,
pc_index,
) )
for p in hk_props: rm_results[ch].extend(props)
pid = p["id"] except Exception as e:
key = _dedup_key(p) log.error("Rightmove %s/%s: %s", outcode, ch, e)
if pid in all_properties or key in seen_dedup_keys: scrape_errors_total.labels(source="rightmove").inc()
hk_dedup_count += 1
cross_source_dedup_total.labels( progress.update("rm", i + 1)
channel="buy" if channel_name == "BUY" else "rent", time.sleep(DELAY_BETWEEN_OUTCODES)
).inc() except Exception as e:
continue log.exception("Fatal Rightmove error: %s", e)
all_properties[pid] = p with status_lock:
seen_dedup_keys.add(key) status.errors.append(f"Fatal Rightmove: {e}")
hk_count += 1 finally:
if hk_props: client.close()
log.info(
"home.co.uk %s: +%d properties", outcode, len(hk_props) def hk_worker():
) hk_result = load_homecouk_cookies()
if not hk_result:
log.info("home.co.uk DISABLED (no cookies available)")
homecouk_enabled.set(0)
progress.update("hk", len(shuffled))
return
client = make_homecouk_client(*hk_result)
log.info("home.co.uk scraping ENABLED")
homecouk_enabled.set(1)
try:
for i, outcode in enumerate(shuffled):
for ch_cfg in CHANNELS:
ch = ch_cfg["channel"]
try:
props = homecouk_search_outcode(
client, outcode, ch, pc_index
)
hk_results[ch].extend(props)
if props:
log.info("home.co.uk %s: +%d properties", outcode, len(props))
except CookiesExpiredError: except CookiesExpiredError:
log.warning( log.warning(
"home.co.uk cookies expired — attempting refresh via FlareSolverr" "home.co.uk cookies expired — attempting refresh"
) )
hk_client.close() client.close()
hk_result = load_homecouk_cookies() hk_new = load_homecouk_cookies()
if hk_result: if hk_new:
hk_client = make_homecouk_client(*hk_result) client = make_homecouk_client(*hk_new)
log.info("home.co.uk cookies refreshed, continuing") log.info("home.co.uk cookies refreshed, continuing")
cookie_refreshes_total.labels(result="success").inc() cookie_refreshes_total.labels(result="success").inc()
else: else:
log.warning( log.warning(
"Cookie refresh failed, disabling home.co.uk for rest of scrape" "Cookie refresh failed, disabling home.co.uk"
) )
hk_client = None
hk_failed = True
homecouk_enabled.set(0) homecouk_enabled.set(0)
cookie_refreshes_total.labels(result="failure").inc() cookie_refreshes_total.labels(result="failure").inc()
with status_lock: with status_lock:
status.errors.append( status.errors.append(
"home.co.uk cookies expired and refresh failed" "home.co.uk cookies expired and refresh failed"
) )
progress.update("hk", len(shuffled))
return
except Exception as e: except Exception as e:
msg = f"Error scraping home.co.uk {outcode}/{channel_name}: {e}" log.error("home.co.uk %s/%s: %s", outcode, ch, e)
log.error(msg)
scrape_errors_total.labels(source="homecouk").inc() scrape_errors_total.labels(source="homecouk").inc()
with status_lock:
status.errors.append(msg)
# --- OpenRent (RENT channel only) --- progress.update("hk", i + 1)
if or_client and not or_failed and channel_name == "RENT": time.sleep(DELAY_BETWEEN_OUTCODES)
made_requests = True except Exception as e:
try: log.exception("Fatal home.co.uk error: %s", e)
or_props = openrent_search_outcode( with status_lock:
or_client, status.errors.append(f"Fatal home.co.uk: {e}")
outcode, finally:
pc_index, try:
pc_coords, client.close()
) except Exception:
for p in or_props: pass
pid = p["id"]
key = _dedup_key(p) def or_worker():
if pid in all_properties or key in seen_dedup_keys: or_result = load_openrent_cookies()
or_dedup_count += 1 if not or_result:
cross_source_dedup_total.labels(channel="rent").inc() log.info("OpenRent DISABLED (no cookies available)")
continue openrent_enabled.set(0)
all_properties[pid] = p progress.update("or", len(shuffled))
seen_dedup_keys.add(key) return
or_count += 1 client = make_openrent_client(*or_result)
if or_props: log.info("OpenRent scraping ENABLED")
log.info( openrent_enabled.set(1)
"OpenRent %s: +%d properties", outcode, len(or_props) try:
) for i, outcode in enumerate(shuffled):
except WafChallengeError: # OpenRent is RENT-only
try:
props = openrent_search_outcode(
client, outcode, pc_index, pc_coords
)
or_results["RENT"].extend(props)
if props:
log.info("OpenRent %s: +%d properties", outcode, len(props))
except WafChallengeError:
log.warning(
"OpenRent WAF cookies expired — attempting refresh"
)
client.close()
or_new = load_openrent_cookies()
if or_new:
client = make_openrent_client(*or_new)
log.info("OpenRent cookies refreshed, continuing")
cookie_refreshes_total.labels(result="success").inc()
else:
log.warning( log.warning(
"OpenRent WAF cookies expired — attempting refresh via FlareSolverr" "Cookie refresh failed, disabling OpenRent"
) )
or_client.close() openrent_enabled.set(0)
or_result = load_openrent_cookies() cookie_refreshes_total.labels(result="failure").inc()
if or_result:
or_client = make_openrent_client(*or_result)
log.info("OpenRent cookies refreshed, continuing")
cookie_refreshes_total.labels(result="success").inc()
else:
log.warning(
"Cookie refresh failed, disabling OpenRent for rest of scrape"
)
or_client = None
or_failed = True
openrent_enabled.set(0)
cookie_refreshes_total.labels(result="failure").inc()
with status_lock:
status.errors.append(
"OpenRent WAF cookies expired and refresh failed"
)
except Exception as e:
msg = f"Error scraping OpenRent {outcode}/{channel_name}: {e}"
log.error(msg)
scrape_errors_total.labels(source="openrent").inc()
with status_lock: with status_lock:
status.errors.append(msg) status.errors.append(
"OpenRent WAF cookies expired and refresh failed"
# --- Zoopla ---
if zp_page and not zp_failed:
made_requests = True
try:
zp_props = zoopla_search_outcode(
zp_page,
outcode,
channel_name,
pc_index,
pc_coords,
)
for p in zp_props:
pid = p["id"]
key = _dedup_key(p)
if pid in all_properties or key in seen_dedup_keys:
zp_dedup_count += 1
cross_source_dedup_total.labels(
channel="buy" if channel_name == "BUY" else "rent",
).inc()
continue
all_properties[pid] = p
seen_dedup_keys.add(key)
zp_count += 1
if zp_props:
log.info(
"Zoopla %s: +%d properties", outcode, len(zp_props)
) )
progress.update("or", len(shuffled))
return
except Exception as e:
log.error("OpenRent %s: %s", outcode, e)
scrape_errors_total.labels(source="openrent").inc()
progress.update("or", i + 1)
time.sleep(DELAY_BETWEEN_OUTCODES)
except Exception as e:
log.exception("Fatal OpenRent error: %s", e)
with status_lock:
status.errors.append(f"Fatal OpenRent: {e}")
finally:
try:
client.close()
except Exception:
pass
def zp_worker():
try:
browser, page = launch_zoopla_browser()
log.info("Zoopla scraping ENABLED (Camoufox browser launched)")
zoopla_enabled.set(1)
except TurnstileError:
log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla")
zoopla_enabled.set(0)
progress.update("zp", len(shuffled))
return
except Exception as e:
log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e)
zoopla_enabled.set(0)
progress.update("zp", len(shuffled))
return
try:
for i, outcode in enumerate(shuffled):
for ch_cfg in CHANNELS:
ch = ch_cfg["channel"]
try:
props = zoopla_search_outcode(
page, outcode, ch, pc_index, pc_coords
)
zp_results[ch].extend(props)
if props:
log.info("Zoopla %s: +%d properties", outcode, len(props))
except TurnstileError: except TurnstileError:
log.warning( log.warning(
"Zoopla Cloudflare challenge failed — attempting browser relaunch" "Zoopla Turnstile challenge — relaunching browser"
) )
try: try:
zp_browser.close() browser.close()
except Exception: except Exception:
pass pass
try: try:
zp_browser, zp_page = launch_zoopla_browser() browser, page = launch_zoopla_browser()
log.info("Zoopla browser relaunched, continuing") log.info("Zoopla browser relaunched, continuing")
except Exception: except Exception:
log.warning( log.warning(
"Browser relaunch failed, disabling Zoopla for rest of scrape" "Browser relaunch failed, disabling Zoopla"
) )
zp_page = None
zp_browser = None
zp_failed = True
zoopla_enabled.set(0) zoopla_enabled.set(0)
with status_lock: with status_lock:
status.errors.append( status.errors.append(
"Zoopla Cloudflare challenge failed and browser relaunch failed" "Zoopla Cloudflare challenge failed and relaunch failed"
) )
progress.update("zp", len(shuffled))
return
except Exception as e: except Exception as e:
msg = f"Error scraping Zoopla {outcode}/{channel_name}: {e}" log.error("Zoopla %s/%s: %s", outcode, ch, e)
log.error(msg)
scrape_errors_total.labels(source="zoopla").inc() scrape_errors_total.labels(source="zoopla").inc()
with status_lock:
status.errors.append(msg)
with status_lock: progress.update("zp", i + 1)
if channel_name == "BUY": time.sleep(DELAY_BETWEEN_OUTCODES)
status.properties_buy = len(all_properties) except Exception as e:
else: log.exception("Fatal Zoopla error: %s", e)
status.properties_rent = len(all_properties) with status_lock:
status.rm_properties = rm_count status.errors.append(f"Fatal Zoopla: {e}")
status.hk_properties = hk_count finally:
status.or_properties = or_count try:
status.zp_properties = zp_count browser.close()
_sync_gauges() except Exception:
pass
# Log progress every 100 outcodes # --- Launch worker threads ---
done = i + 1
elapsed = time.time() - channel_start
if done % 100 == 0 or done == len(shuffled):
pct = done * 100 // len(shuffled)
rate = done / elapsed if elapsed > 0 else 0
log.info(
"%s %d/%d (%d%%) — %d props, %s elapsed, %.1f outcodes/min",
channel_name,
done,
len(shuffled),
pct,
len(all_properties),
_fmt_elapsed(elapsed),
rate * 60,
)
# Log when crossing a 10k property milestone active_sources: list[str] = []
current_milestone = len(all_properties) // 10_000 threads: list[threading.Thread] = []
if current_milestone > prev_prop_milestone:
prev_prop_milestone = current_milestone
log.info(
"%s %dk properties (rm: %d, hk: %d, or: %d, zp: %d) at outcode %d/%d [%s]",
channel_name,
current_milestone * 10,
rm_count,
hk_count,
or_count,
zp_count,
done,
len(shuffled),
_fmt_elapsed(elapsed),
)
if made_requests and i < len(shuffled) - 1: if SCRAPE_RIGHTMOVE:
time.sleep(DELAY_BETWEEN_OUTCODES) threads.append(threading.Thread(target=rm_worker, name="scrape-rm", daemon=True))
active_sources.append("rm")
if SCRAPE_HOMECOUK:
threads.append(threading.Thread(target=hk_worker, name="scrape-hk", daemon=True))
active_sources.append("hk")
if SCRAPE_OPENRENT:
threads.append(threading.Thread(target=or_worker, name="scrape-or", daemon=True))
active_sources.append("or")
if SCRAPE_ZOOPLA:
threads.append(threading.Thread(target=zp_worker, name="scrape-zp", daemon=True))
active_sources.append("zp")
# Write parquet log.info(
deduped = list(all_properties.values()) "=== Starting scrape: %d outcodes, sources: %s ===",
len(shuffled),
", ".join(active_sources),
)
for t in threads:
t.start()
# --- Monitor progress while workers run ---
scrape_start = time.time()
last_log = 0.0
try:
while any(t.is_alive() for t in threads):
snap = progress.snapshot()
min_done = min(
(snap.get(s, 0) for s in active_sources), default=0
)
# Count properties across sources (safe: only one thread writes each list)
total_buy = sum(
len(r["BUY"]) for r in [rm_results, hk_results, or_results, zp_results]
)
total_rent = sum(
len(r["RENT"]) for r in [rm_results, hk_results, or_results, zp_results]
)
with status_lock:
status.outcodes_done = min_done
status.outcodes_total = len(shuffled)
status.properties_buy = total_buy
status.properties_rent = total_rent
status.rm_properties = len(rm_results["BUY"]) + len(rm_results["RENT"])
status.hk_properties = len(hk_results["BUY"]) + len(hk_results["RENT"])
status.or_properties = len(or_results["RENT"])
status.zp_properties = len(zp_results["BUY"]) + len(zp_results["RENT"])
_sync_gauges()
# Log progress every 30 seconds
now = time.time()
if now - last_log >= 30:
elapsed = now - scrape_start
per_source = ", ".join(
f"{s}:{snap.get(s, 0)}" for s in active_sources
)
log.info(
"Progress: %d/%d outcodes (%s), %d buy + %d rent props, %s elapsed",
min_done,
len(shuffled),
per_source,
total_buy,
total_rent,
_fmt_elapsed(elapsed),
)
last_log = now
time.sleep(5)
except Exception as e:
log.exception("Monitor loop error: %s", e)
for t in threads:
t.join()
log.info("All source workers completed")
# --- Merge results per channel and write parquet ---
try:
for ch_cfg in CHANNELS:
ch = ch_cfg["channel"]
file_suffix = "buy" if ch == "BUY" else "rent"
merged, counts, total_dedup = _merge_channel(
rm_results[ch],
hk_results[ch],
or_results[ch],
zp_results[ch],
)
# Update cross-source dedup counter
ch_label = "buy" if ch == "BUY" else "rent"
if total_dedup:
cross_source_dedup_total.labels(channel=ch_label).inc(total_dedup)
deduped = list(merged.values())
output_path = DATA_DIR / f"online_listings_{file_suffix}.parquet" output_path = DATA_DIR / f"online_listings_{file_suffix}.parquet"
write_parquet(deduped, output_path, channel=file_suffix) write_parquet(deduped, output_path, channel=file_suffix)
with status_lock: with status_lock:
if channel_name == "BUY": if ch == "BUY":
status.properties_buy = len(deduped) status.properties_buy = len(deduped)
else: else:
status.properties_rent = len(deduped) status.properties_rent = len(deduped)
status.outcodes_done = len(shuffled)
_sync_gauges() _sync_gauges()
log.info( log.info(
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, zp: %d, cross-dedup: %d) ===", "=== %s complete: %d unique (rm:%d hk:%d or:%d zp:%d, cross-dedup:%d) ===",
channel_name, ch,
len(deduped), len(deduped),
rm_count, counts["rm"],
hk_count, counts["hk"],
or_count, counts["or"],
zp_count, counts["zp"],
hk_dedup_count + or_dedup_count + zp_dedup_count, total_dedup,
) )
with status_lock: with status_lock:
status.state = "done" status.state = "done"
status.finished_at = time.time() status.finished_at = time.time()
status.outcodes_done = len(shuffled)
_sync_gauges() _sync_gauges()
elapsed = status.finished_at - status.started_at elapsed = status.finished_at - status.started_at
log.info( log.info(
@ -598,26 +672,18 @@ def run_scrape(
body.get("elapsed_ms", 0), body.get("elapsed_ms", 0),
) )
else: else:
log.warning("Server reload failed (%d): %s", resp.status_code, resp.text[:200]) log.warning(
"Server reload failed (%d): %s",
resp.status_code,
resp.text[:200],
)
except Exception as e: except Exception as e:
log.warning("Server reload request failed: %s", e) log.warning("Server reload request failed: %s", e)
except Exception as e: except Exception as e:
log.exception("Fatal scrape error") log.exception("Fatal scrape error during merge/write")
with status_lock: with status_lock:
status.state = "error" status.state = "error"
status.errors.append(f"Fatal: {e}") status.errors.append(f"Fatal: {e}")
status.finished_at = time.time() status.finished_at = time.time()
_sync_gauges() _sync_gauges()
finally:
if client:
client.close()
if hk_client:
hk_client.close()
if or_client:
or_client.close()
if zp_browser:
try:
zp_browser.close()
except Exception:
pass