This commit is contained in:
Andras Schmelczer 2026-03-12 22:11:00 +00:00
parent 14a3555cf1
commit 7e92bf112e
34 changed files with 1214437 additions and 224 deletions

View file

@ -6,7 +6,7 @@ from dataclasses import dataclass, field
import polars as pl
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SCRAPE_HOMECOUK, SCRAPE_RIGHTMOVE, SEED
from homecouk import CookiesExpiredError
from homecouk import load_cookies as load_homecouk_cookies
from homecouk import make_client as make_homecouk_client
@ -126,18 +126,33 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
random.seed(SEED)
random.shuffle(shuffled)
client = make_client()
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK:
log.warning("Both SCRAPE_RIGHTMOVE and SCRAPE_HOMECOUK are disabled — nothing to do")
with status_lock:
status.state = "done"
status.finished_at = time.time()
_sync_gauges()
return
# home.co.uk: optional, enabled when cookies are available (via FlareSolverr or env vars)
hk_result = load_homecouk_cookies()
hk_client = make_homecouk_client(*hk_result) if hk_result else None
if hk_client:
log.info("home.co.uk scraping ENABLED")
homecouk_enabled.set(1)
else:
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
client = make_client() if SCRAPE_RIGHTMOVE else None
if not SCRAPE_RIGHTMOVE:
log.info("Rightmove scraping DISABLED (SCRAPE_RIGHTMOVE=false)")
# home.co.uk: requires SCRAPE_HOMECOUK to be enabled and cookies to be available
hk_client = None
hk_failed = False
if not SCRAPE_HOMECOUK:
log.info("home.co.uk scraping DISABLED (SCRAPE_HOMECOUK=false)")
homecouk_enabled.set(0)
hk_failed = False # set to True on 403 to skip remaining outcodes
else:
hk_result = load_homecouk_cookies()
hk_client = make_homecouk_client(*hk_result) if hk_result else None
if hk_client:
log.info("home.co.uk scraping ENABLED")
homecouk_enabled.set(1)
else:
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
homecouk_enabled.set(0)
try:
for channel_cfg in CHANNELS:
@ -167,24 +182,25 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
outcode, i + 1, len(shuffled), len(all_properties))
# --- Rightmove ---
try:
outcode_id = resolve_outcode_id(client, outcode)
if not outcode_id:
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
else:
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
for p in props:
pid = p["id"]
if pid not in all_properties:
all_properties[pid] = p
seen_dedup_keys.add(_dedup_key(p))
rm_count += 1
except Exception as e:
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
log.error(msg)
scrape_errors_total.labels(source="rightmove").inc()
with status_lock:
status.errors.append(msg)
if SCRAPE_RIGHTMOVE:
try:
outcode_id = resolve_outcode_id(client, outcode)
if not outcode_id:
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
else:
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
for p in props:
pid = p["id"]
if pid not in all_properties:
all_properties[pid] = p
seen_dedup_keys.add(_dedup_key(p))
rm_count += 1
except Exception as e:
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
log.error(msg)
scrape_errors_total.labels(source="rightmove").inc()
with status_lock:
status.errors.append(msg)
# --- home.co.uk ---
if hk_client and not hk_failed:
@ -276,6 +292,7 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
status.finished_at = time.time()
_sync_gauges()
finally:
client.close()
if client:
client.close()
if hk_client:
hk_client.close()