Fmt
This commit is contained in:
parent
479ef92236
commit
c38d654ac7
44 changed files with 2526 additions and 701 deletions
|
|
@ -6,7 +6,16 @@ from dataclasses import dataclass, field
|
|||
|
||||
import polars as pl
|
||||
|
||||
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE, SEED
|
||||
from constants import (
|
||||
ARCGIS_PATH,
|
||||
CHANNELS,
|
||||
DATA_DIR,
|
||||
DELAY_BETWEEN_OUTCODES,
|
||||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
SCRAPE_RIGHTMOVE,
|
||||
SEED,
|
||||
)
|
||||
from homecouk import CookiesExpiredError
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from homecouk import make_client as make_homecouk_client
|
||||
|
|
@ -64,13 +73,23 @@ def _sync_gauges() -> None:
|
|||
scrape_outcodes_done.set(status.outcodes_done)
|
||||
scrape_outcodes_total.set(status.outcodes_total)
|
||||
# Total properties (all sources combined)
|
||||
scrape_properties_total.labels(channel="buy", source="total").set(status.properties_buy)
|
||||
scrape_properties_total.labels(channel="rent", source="total").set(status.properties_rent)
|
||||
scrape_properties_total.labels(channel="buy", source="total").set(
|
||||
status.properties_buy
|
||||
)
|
||||
scrape_properties_total.labels(channel="rent", source="total").set(
|
||||
status.properties_rent
|
||||
)
|
||||
# Per-source breakdown for current channel
|
||||
ch = "buy" if status.channel == "BUY" else "rent"
|
||||
scrape_properties_total.labels(channel=ch, source="rightmove").set(status.rm_properties)
|
||||
scrape_properties_total.labels(channel=ch, source="homecouk").set(status.hk_properties)
|
||||
scrape_properties_total.labels(channel=ch, source="openrent").set(status.or_properties)
|
||||
scrape_properties_total.labels(channel=ch, source="rightmove").set(
|
||||
status.rm_properties
|
||||
)
|
||||
scrape_properties_total.labels(channel=ch, source="homecouk").set(
|
||||
status.hk_properties
|
||||
)
|
||||
scrape_properties_total.labels(channel=ch, source="openrent").set(
|
||||
status.or_properties
|
||||
)
|
||||
if status.started_at:
|
||||
end = status.finished_at if status.finished_at else time.time()
|
||||
scrape_elapsed_seconds.set(end - status.started_at)
|
||||
|
|
@ -86,7 +105,9 @@ def load_outcodes() -> list[str]:
|
|||
log.info("England postcodes: %d", len(england))
|
||||
|
||||
outcodes = (
|
||||
england.select(pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode"))
|
||||
england.select(
|
||||
pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode")
|
||||
)
|
||||
.drop_nulls()
|
||||
.get_column("outcode")
|
||||
.unique()
|
||||
|
|
@ -101,7 +122,9 @@ def build_postcode_index() -> PostcodeSpatialIndex:
|
|||
"""Build spatial index from arcgis England postcodes."""
|
||||
log.info("Building postcode spatial index from %s", ARCGIS_PATH)
|
||||
df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
|
||||
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])
|
||||
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(
|
||||
subset=["lat", "long"]
|
||||
)
|
||||
return PostcodeSpatialIndex(
|
||||
england.get_column("lat").to_list(),
|
||||
england.get_column("long").to_list(),
|
||||
|
|
@ -114,7 +137,9 @@ def build_postcode_coords() -> dict[str, tuple[float, float]]:
|
|||
Used by OpenRent scraper to resolve coordinates from postcodes."""
|
||||
log.info("Building postcode coords lookup from %s", ARCGIS_PATH)
|
||||
df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
|
||||
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])
|
||||
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(
|
||||
subset=["lat", "long"]
|
||||
)
|
||||
coords: dict[str, tuple[float, float]] = {}
|
||||
for pcd, lat, lng in zip(
|
||||
england.get_column("pcd").to_list(),
|
||||
|
|
@ -179,7 +204,9 @@ def run_scrape(
|
|||
log.info("home.co.uk scraping ENABLED")
|
||||
homecouk_enabled.set(1)
|
||||
else:
|
||||
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
|
||||
log.info(
|
||||
"home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)"
|
||||
)
|
||||
homecouk_enabled.set(0)
|
||||
|
||||
# OpenRent: must be enabled via SCRAPE_OPENRENT + cookies available
|
||||
|
|
@ -195,7 +222,9 @@ def run_scrape(
|
|||
log.info("OpenRent scraping ENABLED")
|
||||
openrent_enabled.set(1)
|
||||
else:
|
||||
log.info("OpenRent scraping DISABLED (need FlareSolverr or OPENRENT_WAF_TOKEN)")
|
||||
log.info(
|
||||
"OpenRent scraping DISABLED (need FlareSolverr or OPENRENT_WAF_TOKEN)"
|
||||
)
|
||||
openrent_enabled.set(0)
|
||||
|
||||
# Build postcode coords if OpenRent is active and caller didn't provide them
|
||||
|
|
@ -207,7 +236,9 @@ def run_scrape(
|
|||
channel_name = channel_cfg["channel"]
|
||||
file_suffix = "buy" if channel_name == "BUY" else "rent"
|
||||
all_properties: dict[str, dict] = {} # dedup by id
|
||||
seen_dedup_keys: set[tuple] = set() # cross-source dedup by (postcode, beds, price)
|
||||
seen_dedup_keys: set[tuple] = (
|
||||
set()
|
||||
) # cross-source dedup by (postcode, beds, price)
|
||||
rm_count = 0 # Rightmove properties this channel
|
||||
hk_count = 0 # home.co.uk properties this channel
|
||||
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
|
||||
|
|
@ -222,15 +253,22 @@ def run_scrape(
|
|||
status.hk_properties = 0
|
||||
status.or_properties = 0
|
||||
|
||||
log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))
|
||||
log.info(
|
||||
"=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled)
|
||||
)
|
||||
|
||||
for i, outcode in enumerate(shuffled):
|
||||
with status_lock:
|
||||
status.outcode = outcode
|
||||
status.outcodes_done = i
|
||||
|
||||
log.debug("Outcode %s (%d/%d) — %d properties so far",
|
||||
outcode, i + 1, len(shuffled), len(all_properties))
|
||||
log.debug(
|
||||
"Outcode %s (%d/%d) — %d properties so far",
|
||||
outcode,
|
||||
i + 1,
|
||||
len(shuffled),
|
||||
len(all_properties),
|
||||
)
|
||||
|
||||
made_requests = False
|
||||
|
||||
|
|
@ -240,9 +278,13 @@ def run_scrape(
|
|||
try:
|
||||
outcode_id = resolve_outcode_id(client, outcode)
|
||||
if not outcode_id:
|
||||
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
|
||||
log.debug(
|
||||
"No Rightmove ID for outcode %s, skipping", outcode
|
||||
)
|
||||
else:
|
||||
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
|
||||
props = search_outcode(
|
||||
client, outcode_id, outcode, channel_cfg, pc_index
|
||||
)
|
||||
for p in props:
|
||||
pid = p["id"]
|
||||
if pid not in all_properties:
|
||||
|
|
@ -261,7 +303,10 @@ def run_scrape(
|
|||
made_requests = True
|
||||
try:
|
||||
hk_props = homecouk_search_outcode(
|
||||
hk_client, outcode, channel_name, pc_index,
|
||||
hk_client,
|
||||
outcode,
|
||||
channel_name,
|
||||
pc_index,
|
||||
)
|
||||
for p in hk_props:
|
||||
pid = p["id"]
|
||||
|
|
@ -276,9 +321,13 @@ def run_scrape(
|
|||
seen_dedup_keys.add(key)
|
||||
hk_count += 1
|
||||
if hk_props:
|
||||
log.info("home.co.uk %s: +%d properties", outcode, len(hk_props))
|
||||
log.info(
|
||||
"home.co.uk %s: +%d properties", outcode, len(hk_props)
|
||||
)
|
||||
except CookiesExpiredError:
|
||||
log.warning("home.co.uk cookies expired — attempting refresh via FlareSolverr")
|
||||
log.warning(
|
||||
"home.co.uk cookies expired — attempting refresh via FlareSolverr"
|
||||
)
|
||||
hk_client.close()
|
||||
hk_result = load_homecouk_cookies()
|
||||
if hk_result:
|
||||
|
|
@ -286,13 +335,17 @@ def run_scrape(
|
|||
log.info("home.co.uk cookies refreshed, continuing")
|
||||
cookie_refreshes_total.labels(result="success").inc()
|
||||
else:
|
||||
log.warning("Cookie refresh failed, disabling home.co.uk for rest of scrape")
|
||||
log.warning(
|
||||
"Cookie refresh failed, disabling home.co.uk for rest of scrape"
|
||||
)
|
||||
hk_client = None
|
||||
hk_failed = True
|
||||
homecouk_enabled.set(0)
|
||||
cookie_refreshes_total.labels(result="failure").inc()
|
||||
with status_lock:
|
||||
status.errors.append("home.co.uk cookies expired and refresh failed")
|
||||
status.errors.append(
|
||||
"home.co.uk cookies expired and refresh failed"
|
||||
)
|
||||
except Exception as e:
|
||||
msg = f"Error scraping home.co.uk {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
|
|
@ -305,7 +358,10 @@ def run_scrape(
|
|||
made_requests = True
|
||||
try:
|
||||
or_props = openrent_search_outcode(
|
||||
or_client, outcode, pc_index, pc_coords,
|
||||
or_client,
|
||||
outcode,
|
||||
pc_index,
|
||||
pc_coords,
|
||||
)
|
||||
for p in or_props:
|
||||
pid = p["id"]
|
||||
|
|
@ -318,9 +374,13 @@ def run_scrape(
|
|||
seen_dedup_keys.add(key)
|
||||
or_count += 1
|
||||
if or_props:
|
||||
log.info("OpenRent %s: +%d properties", outcode, len(or_props))
|
||||
log.info(
|
||||
"OpenRent %s: +%d properties", outcode, len(or_props)
|
||||
)
|
||||
except WafChallengeError:
|
||||
log.warning("OpenRent WAF cookies expired — attempting refresh via FlareSolverr")
|
||||
log.warning(
|
||||
"OpenRent WAF cookies expired — attempting refresh via FlareSolverr"
|
||||
)
|
||||
or_client.close()
|
||||
or_result = load_openrent_cookies()
|
||||
if or_result:
|
||||
|
|
@ -328,13 +388,17 @@ def run_scrape(
|
|||
log.info("OpenRent cookies refreshed, continuing")
|
||||
cookie_refreshes_total.labels(result="success").inc()
|
||||
else:
|
||||
log.warning("Cookie refresh failed, disabling OpenRent for rest of scrape")
|
||||
log.warning(
|
||||
"Cookie refresh failed, disabling OpenRent for rest of scrape"
|
||||
)
|
||||
or_client = None
|
||||
or_failed = True
|
||||
openrent_enabled.set(0)
|
||||
cookie_refreshes_total.labels(result="failure").inc()
|
||||
with status_lock:
|
||||
status.errors.append("OpenRent WAF cookies expired and refresh failed")
|
||||
status.errors.append(
|
||||
"OpenRent WAF cookies expired and refresh failed"
|
||||
)
|
||||
except Exception as e:
|
||||
msg = f"Error scraping OpenRent {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
|
|
@ -352,8 +416,14 @@ def run_scrape(
|
|||
status.or_properties = or_count
|
||||
_sync_gauges()
|
||||
|
||||
log.info("Outcode %s: total %d (rm: %d, hk: %d, or: %d)",
|
||||
outcode, len(all_properties), rm_count, hk_count, or_count)
|
||||
log.info(
|
||||
"Outcode %s: total %d (rm: %d, hk: %d, or: %d)",
|
||||
outcode,
|
||||
len(all_properties),
|
||||
rm_count,
|
||||
hk_count,
|
||||
or_count,
|
||||
)
|
||||
|
||||
if made_requests and i < len(shuffled) - 1:
|
||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||
|
|
@ -373,7 +443,11 @@ def run_scrape(
|
|||
|
||||
log.info(
|
||||
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===",
|
||||
channel_name, len(deduped), rm_count, hk_count, or_count,
|
||||
channel_name,
|
||||
len(deduped),
|
||||
rm_count,
|
||||
hk_count,
|
||||
or_count,
|
||||
hk_dedup_count + or_dedup_count,
|
||||
)
|
||||
|
||||
|
|
@ -382,8 +456,12 @@ def run_scrape(
|
|||
status.finished_at = time.time()
|
||||
_sync_gauges()
|
||||
elapsed = status.finished_at - status.started_at
|
||||
log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
|
||||
elapsed, status.properties_buy, status.properties_rent)
|
||||
log.info(
|
||||
"Scrape complete in %.0fs — buy: %d, rent: %d",
|
||||
elapsed,
|
||||
status.properties_buy,
|
||||
status.properties_rent,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
log.exception("Fatal scrape error")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue