Working
This commit is contained in:
parent
14a3555cf1
commit
7e92bf112e
34 changed files with 1214437 additions and 224 deletions
|
|
@ -6,7 +6,7 @@ from dataclasses import dataclass, field
|
|||
|
||||
import polars as pl
|
||||
|
||||
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
|
||||
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SCRAPE_HOMECOUK, SCRAPE_RIGHTMOVE, SEED
|
||||
from homecouk import CookiesExpiredError
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from homecouk import make_client as make_homecouk_client
|
||||
|
|
@ -126,18 +126,33 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
|||
random.seed(SEED)
|
||||
random.shuffle(shuffled)
|
||||
|
||||
client = make_client()
|
||||
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK:
|
||||
log.warning("Both SCRAPE_RIGHTMOVE and SCRAPE_HOMECOUK are disabled — nothing to do")
|
||||
with status_lock:
|
||||
status.state = "done"
|
||||
status.finished_at = time.time()
|
||||
_sync_gauges()
|
||||
return
|
||||
|
||||
# home.co.uk: optional, enabled when cookies are available (via FlareSolverr or env vars)
|
||||
hk_result = load_homecouk_cookies()
|
||||
hk_client = make_homecouk_client(*hk_result) if hk_result else None
|
||||
if hk_client:
|
||||
log.info("home.co.uk scraping ENABLED")
|
||||
homecouk_enabled.set(1)
|
||||
else:
|
||||
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
|
||||
client = make_client() if SCRAPE_RIGHTMOVE else None
|
||||
if not SCRAPE_RIGHTMOVE:
|
||||
log.info("Rightmove scraping DISABLED (SCRAPE_RIGHTMOVE=false)")
|
||||
|
||||
# home.co.uk: must be enabled via SCRAPE_HOMECOUK + cookies available
|
||||
hk_client = None
|
||||
hk_failed = False
|
||||
if not SCRAPE_HOMECOUK:
|
||||
log.info("home.co.uk scraping DISABLED (SCRAPE_HOMECOUK=false)")
|
||||
homecouk_enabled.set(0)
|
||||
hk_failed = False # set to True on 403 to skip remaining outcodes
|
||||
else:
|
||||
hk_result = load_homecouk_cookies()
|
||||
hk_client = make_homecouk_client(*hk_result) if hk_result else None
|
||||
if hk_client:
|
||||
log.info("home.co.uk scraping ENABLED")
|
||||
homecouk_enabled.set(1)
|
||||
else:
|
||||
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
|
||||
homecouk_enabled.set(0)
|
||||
|
||||
try:
|
||||
for channel_cfg in CHANNELS:
|
||||
|
|
@ -167,24 +182,25 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
|||
outcode, i + 1, len(shuffled), len(all_properties))
|
||||
|
||||
# --- Rightmove ---
|
||||
try:
|
||||
outcode_id = resolve_outcode_id(client, outcode)
|
||||
if not outcode_id:
|
||||
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
|
||||
else:
|
||||
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
|
||||
for p in props:
|
||||
pid = p["id"]
|
||||
if pid not in all_properties:
|
||||
all_properties[pid] = p
|
||||
seen_dedup_keys.add(_dedup_key(p))
|
||||
rm_count += 1
|
||||
except Exception as e:
|
||||
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
scrape_errors_total.labels(source="rightmove").inc()
|
||||
with status_lock:
|
||||
status.errors.append(msg)
|
||||
if SCRAPE_RIGHTMOVE:
|
||||
try:
|
||||
outcode_id = resolve_outcode_id(client, outcode)
|
||||
if not outcode_id:
|
||||
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
|
||||
else:
|
||||
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
|
||||
for p in props:
|
||||
pid = p["id"]
|
||||
if pid not in all_properties:
|
||||
all_properties[pid] = p
|
||||
seen_dedup_keys.add(_dedup_key(p))
|
||||
rm_count += 1
|
||||
except Exception as e:
|
||||
msg = f"Error scraping Rightmove {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
scrape_errors_total.labels(source="rightmove").inc()
|
||||
with status_lock:
|
||||
status.errors.append(msg)
|
||||
|
||||
# --- home.co.uk ---
|
||||
if hk_client and not hk_failed:
|
||||
|
|
@ -276,6 +292,7 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
|||
status.finished_at = time.time()
|
||||
_sync_gauges()
|
||||
finally:
|
||||
client.close()
|
||||
if client:
|
||||
client.close()
|
||||
if hk_client:
|
||||
hk_client.close()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue