home.co.uk scraping
This commit is contained in:
parent
74d6dd7bf8
commit
f3e3c1ee49
6 changed files with 538 additions and 28 deletions
|
|
@ -7,8 +7,15 @@ from dataclasses import dataclass, field
|
|||
import polars as pl
|
||||
|
||||
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
|
||||
from homecouk import CookiesExpiredError
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from homecouk import make_client as make_homecouk_client
|
||||
from homecouk import search_outcode as homecouk_search_outcode
|
||||
from http_client import make_client
|
||||
from metrics import (
|
||||
cookie_refreshes_total,
|
||||
cross_source_dedup_total,
|
||||
homecouk_enabled,
|
||||
scrape_elapsed_seconds,
|
||||
scrape_errors_total,
|
||||
scrape_outcodes_done,
|
||||
|
|
@ -32,6 +39,9 @@ class ScrapeStatus:
|
|||
outcodes_total: int = 0
|
||||
properties_buy: int = 0
|
||||
properties_rent: int = 0
|
||||
# Per-source counts for current channel
|
||||
rm_properties: int = 0
|
||||
hk_properties: int = 0
|
||||
errors: list[str] = field(default_factory=list)
|
||||
started_at: float = 0.0
|
||||
finished_at: float = 0.0
|
||||
|
|
@ -47,8 +57,13 @@ def _sync_gauges() -> None:
|
|||
scrape_state.labels(state=state).set(1 if status.state == state else 0)
|
||||
scrape_outcodes_done.set(status.outcodes_done)
|
||||
scrape_outcodes_total.set(status.outcodes_total)
|
||||
scrape_properties_total.labels(channel="buy").set(status.properties_buy)
|
||||
scrape_properties_total.labels(channel="rent").set(status.properties_rent)
|
||||
# Total properties (both sources combined)
|
||||
scrape_properties_total.labels(channel="buy", source="total").set(status.properties_buy)
|
||||
scrape_properties_total.labels(channel="rent", source="total").set(status.properties_rent)
|
||||
# Per-source breakdown for current channel
|
||||
ch = "buy" if status.channel == "BUY" else "rent"
|
||||
scrape_properties_total.labels(channel=ch, source="rightmove").set(status.rm_properties)
|
||||
scrape_properties_total.labels(channel=ch, source="homecouk").set(status.hk_properties)
|
||||
if status.started_at:
|
||||
end = status.finished_at if status.finished_at else time.time()
|
||||
scrape_elapsed_seconds.set(end - status.started_at)
|
||||
|
|
@ -87,8 +102,16 @@ def build_postcode_index() -> PostcodeSpatialIndex:
|
|||
)
|
||||
|
||||
|
||||
def _dedup_key(p: dict) -> tuple:
|
||||
"""Composite key for cross-source deduplication: (postcode, bedrooms, price).
|
||||
Two listings on different portals for the same physical property will share
|
||||
these attributes even though their IDs differ."""
|
||||
return (p.get("Postcode", ""), p.get("Bedrooms", 0), p.get("price", 0))
|
||||
|
||||
|
||||
def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
||||
"""Main scrape loop — runs in background thread."""
|
||||
"""Main scrape loop — runs in background thread.
|
||||
Scrapes Rightmove and (if configured) home.co.uk, merging into one dataset."""
|
||||
global status
|
||||
with status_lock:
|
||||
status.state = "running"
|
||||
|
|
@ -105,16 +128,33 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
|||
|
||||
client = make_client()
|
||||
|
||||
# home.co.uk: optional, enabled when cookies are available (via FlareSolverr or env vars)
|
||||
hk_result = load_homecouk_cookies()
|
||||
hk_client = make_homecouk_client(*hk_result) if hk_result else None
|
||||
if hk_client:
|
||||
log.info("home.co.uk scraping ENABLED")
|
||||
homecouk_enabled.set(1)
|
||||
else:
|
||||
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
|
||||
homecouk_enabled.set(0)
|
||||
hk_failed = False # set to True on 403 to skip remaining outcodes
|
||||
|
||||
try:
|
||||
for channel_cfg in CHANNELS:
|
||||
channel_name = channel_cfg["channel"]
|
||||
file_suffix = "buy" if channel_name == "BUY" else "rent"
|
||||
all_properties: dict[int, dict] = {} # dedup by id
|
||||
all_properties: dict[str, dict] = {} # dedup by id
|
||||
seen_dedup_keys: set[tuple] = set() # cross-source dedup by (postcode, beds, price)
|
||||
rm_count = 0 # Rightmove properties this channel
|
||||
hk_count = 0 # home.co.uk properties this channel
|
||||
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
|
||||
|
||||
with status_lock:
|
||||
status.channel = channel_name
|
||||
status.outcodes_done = 0
|
||||
status.outcodes_total = len(shuffled)
|
||||
status.rm_properties = 0
|
||||
status.hk_properties = 0
|
||||
|
||||
log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))
|
||||
|
||||
|
|
@ -126,34 +166,81 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
|||
log.debug("Outcode %s (%d/%d) — %d properties so far",
|
||||
outcode, i + 1, len(shuffled), len(all_properties))
|
||||
|
||||
# --- Rightmove ---
|
||||
try:
|
||||
outcode_id = resolve_outcode_id(client, outcode)
|
||||
if not outcode_id:
|
||||
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
|
||||
continue
|
||||
|
||||
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
|
||||
for p in props:
|
||||
pid = p["id"]
|
||||
if pid not in all_properties:
|
||||
all_properties[pid] = p
|
||||
|
||||
with status_lock:
|
||||
if channel_name == "BUY":
|
||||
status.properties_buy = len(all_properties)
|
||||
else:
|
||||
status.properties_rent = len(all_properties)
|
||||
_sync_gauges()
|
||||
|
||||
log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))
|
||||
|
||||
else:
|
||||
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
|
||||
for p in props:
|
||||
pid = p["id"]
|
||||
if pid not in all_properties:
|
||||
all_properties[pid] = p
|
||||
seen_dedup_keys.add(_dedup_key(p))
|
||||
rm_count += 1
|
||||
except Exception as e:
|
||||
msg = f"Error scraping {outcode}/{channel_name}: {e}"
|
||||
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
scrape_errors_total.inc()
|
||||
scrape_errors_total.labels(source="rightmove").inc()
|
||||
with status_lock:
|
||||
status.errors.append(msg)
|
||||
|
||||
# --- home.co.uk ---
|
||||
if hk_client and not hk_failed:
|
||||
try:
|
||||
hk_props = homecouk_search_outcode(
|
||||
hk_client, outcode, channel_name, pc_index,
|
||||
)
|
||||
for p in hk_props:
|
||||
pid = p["id"]
|
||||
key = _dedup_key(p)
|
||||
if pid in all_properties or key in seen_dedup_keys:
|
||||
hk_dedup_count += 1
|
||||
cross_source_dedup_total.labels(
|
||||
channel="buy" if channel_name == "BUY" else "rent",
|
||||
).inc()
|
||||
continue
|
||||
all_properties[pid] = p
|
||||
seen_dedup_keys.add(key)
|
||||
hk_count += 1
|
||||
if hk_props:
|
||||
log.info("home.co.uk %s: +%d properties", outcode, len(hk_props))
|
||||
except CookiesExpiredError:
|
||||
log.warning("home.co.uk cookies expired — attempting refresh via FlareSolverr")
|
||||
hk_client.close()
|
||||
hk_result = load_homecouk_cookies()
|
||||
if hk_result:
|
||||
hk_client = make_homecouk_client(*hk_result)
|
||||
log.info("home.co.uk cookies refreshed, continuing")
|
||||
cookie_refreshes_total.labels(result="success").inc()
|
||||
else:
|
||||
log.warning("Cookie refresh failed, disabling home.co.uk for rest of scrape")
|
||||
hk_client = None
|
||||
hk_failed = True
|
||||
homecouk_enabled.set(0)
|
||||
cookie_refreshes_total.labels(result="failure").inc()
|
||||
with status_lock:
|
||||
status.errors.append("home.co.uk cookies expired and refresh failed")
|
||||
except Exception as e:
|
||||
msg = f"Error scraping home.co.uk {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
scrape_errors_total.labels(source="homecouk").inc()
|
||||
with status_lock:
|
||||
status.errors.append(msg)
|
||||
|
||||
with status_lock:
|
||||
if channel_name == "BUY":
|
||||
status.properties_buy = len(all_properties)
|
||||
else:
|
||||
status.properties_rent = len(all_properties)
|
||||
status.rm_properties = rm_count
|
||||
status.hk_properties = hk_count
|
||||
_sync_gauges()
|
||||
|
||||
log.info("Outcode %s: total %d (rm: %d, hk: %d)",
|
||||
outcode, len(all_properties), rm_count, hk_count)
|
||||
|
||||
if i < len(shuffled) - 1:
|
||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||
|
||||
|
|
@ -170,7 +257,8 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
|||
status.outcodes_done = len(shuffled)
|
||||
_sync_gauges()
|
||||
|
||||
log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))
|
||||
log.info("=== %s channel complete: %d unique (rm: %d, hk: %d, cross-dedup: %d) ===",
|
||||
channel_name, len(deduped), rm_count, hk_count, hk_dedup_count)
|
||||
|
||||
with status_lock:
|
||||
status.state = "done"
|
||||
|
|
@ -189,3 +277,5 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
|||
_sync_gauges()
|
||||
finally:
|
||||
client.close()
|
||||
if hk_client:
|
||||
hk_client.close()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue