# Source listing metadata: 191 lines, 6.6 KiB, Python.
import logging
|
|
import random
|
|
import threading
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
|
|
import polars as pl
|
|
|
|
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
|
|
from http_client import make_client
|
|
from metrics import (
|
|
scrape_elapsed_seconds,
|
|
scrape_errors_total,
|
|
scrape_outcodes_done,
|
|
scrape_outcodes_total,
|
|
scrape_properties_total,
|
|
scrape_state,
|
|
)
|
|
from rightmove import resolve_outcode_id, search_outcode
|
|
from spatial import PostcodeSpatialIndex
|
|
from storage import write_parquet
|
|
|
|
# Module-level logger; the functions in this file log through the "rightmove" logger.
log = logging.getLogger("rightmove")
|
|
|
|
|
|
@dataclass
class ScrapeStatus:
    """Mutable record of how far a scrape run has progressed.

    A single module-level instance is updated by ``run_scrape`` and mirrored
    into the Prometheus gauges by ``_sync_gauges``.
    """

    # Lifecycle marker for the run.
    state: str = "idle"  # idle | running | done | error
    # Name of the channel currently being scraped (taken from the channel config).
    channel: str = ""
    # Outcode currently being processed.
    outcode: str = ""
    # Progress through the outcode list for the current channel.
    outcodes_done: int = 0
    outcodes_total: int = 0
    # Number of unique properties collected so far, per channel.
    properties_buy: int = 0
    properties_rent: int = 0
    # Human-readable error messages collected during the run.
    errors: list[str] = field(default_factory=list)
    # time.time() timestamps; 0.0 is treated as "not set" by _sync_gauges.
    started_at: float = 0.0
    finished_at: float = 0.0
|
|
|
|
|
|
# Shared scrape progress for this module. run_scrape takes status_lock around
# every update, and _sync_gauges expects its caller to hold the same lock.
status_lock = threading.Lock()
status = ScrapeStatus()
|
|
|
|
|
|
def _sync_gauges() -> None:
    """Copy the current ScrapeStatus values into the Prometheus gauges.

    The caller is expected to hold ``status_lock`` while this runs.
    """
    # One gauge sample per known state: 1 for the active state, 0 for the rest.
    current_state = status.state
    for name in ("idle", "running", "done", "error"):
        flag = 1 if current_state == name else 0
        scrape_state.labels(state=name).set(flag)

    scrape_outcodes_done.set(status.outcodes_done)
    scrape_outcodes_total.set(status.outcodes_total)

    per_channel = (("buy", status.properties_buy), ("rent", status.properties_rent))
    for label, count in per_channel:
        scrape_properties_total.labels(channel=label).set(count)

    # No start time recorded: report zero elapsed.
    if not status.started_at:
        scrape_elapsed_seconds.set(0)
        return

    # While the run is in progress (no finish time yet) measure against "now".
    end = status.finished_at or time.time()
    scrape_elapsed_seconds.set(end - status.started_at)
|
|
|
|
|
|
def load_outcodes() -> list[str]:
    """Load the England-only outcodes from the arcgis parquet.

    Rows are kept when ``ctry`` equals ``E92000001``. The outcode is extracted
    from the start of ``pcd`` with a regex (one or two capital letters, a
    digit, then an optional capital letter or digit); postcodes that do not
    match are dropped.

    Returns:
        The unique outcodes, sorted ascending.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    # Only the postcode and country columns are used here, so the coordinate
    # columns are not read from the file.
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry"])
    england = df.filter(pl.col("ctry") == "E92000001")
    log.info("England postcodes: %d", len(england))

    outcodes = (
        england.select(pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode"))
        .drop_nulls()
        .get_column("outcode")
        .unique()
        .sort()
        .to_list()
    )
    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes
|
|
|
|
|
|
def build_postcode_index() -> PostcodeSpatialIndex:
    """Create a PostcodeSpatialIndex from the England postcodes in the arcgis parquet.

    Rows are kept when ``ctry`` equals ``E92000001`` and both ``lat`` and
    ``long`` are present.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    frame = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])

    is_england = pl.col("ctry") == "E92000001"
    located = frame.filter(is_england).drop_nulls(subset=["lat", "long"])

    # The index is built from three parallel lists: latitudes, longitudes, postcodes.
    latitudes = located.get_column("lat").to_list()
    longitudes = located.get_column("long").to_list()
    postcodes = located.get_column("pcd").to_list()
    return PostcodeSpatialIndex(latitudes, longitudes, postcodes)
|
|
|
|
|
|
def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Main scrape loop — runs in background thread.

    For each entry in ``CHANNELS`` every outcode is resolved and searched, the
    results are de-duplicated by property id, and one parquet file per channel
    is written to ``DATA_DIR``. Progress is published through the module-level
    ``status`` object (always under ``status_lock``) and mirrored into the
    Prometheus gauges.

    Args:
        outcodes: Outcodes to scrape. The list is copied before shuffling, so
            the caller's list is not modified.
        pc_index: Spatial index handed through to ``search_outcode``.

    Errors:
        A failure on a single outcode is logged, counted in
        ``scrape_errors_total`` and appended to ``status.errors``; the run
        continues. Any other exception is logged, recorded as a ``Fatal``
        entry and sets ``status.state`` to ``"error"``; it is not re-raised.
        The HTTP client is closed in every case.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        # Clear the previous run's finish time; otherwise _sync_gauges would
        # compute elapsed time against a stale timestamp on a re-run.
        status.finished_at = 0.0
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0
        _sync_gauges()

    # Shuffle for geographic diversity. A private Random seeded with SEED gives
    # the same order as seeding the module-level generator, without disturbing
    # the process-wide random state.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)
    total = len(shuffled)

    client = make_client()

    try:
        for channel_cfg in CHANNELS:
            channel_name = channel_cfg["channel"]
            is_buy = channel_name == "BUY"
            file_suffix = "buy" if is_buy else "rent"
            all_properties: dict[int, dict] = {}  # dedup by id

            with status_lock:
                status.channel = channel_name
                status.outcodes_done = 0
                status.outcodes_total = total

            log.info("=== Starting %s channel (%d outcodes) ===", channel_name, total)

            for i, outcode in enumerate(shuffled):
                with status_lock:
                    status.outcode = outcode
                    status.outcodes_done = i

                log.debug("Outcode %s (%d/%d) — %d properties so far",
                          outcode, i + 1, total, len(all_properties))

                try:
                    outcode_id = resolve_outcode_id(client, outcode)
                    if not outcode_id:
                        log.debug("No Rightmove ID for outcode %s, skipping", outcode)
                        # NOTE(review): this `continue` also skips the
                        # inter-outcode delay below — confirm that is intended.
                        continue

                    props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
                    for p in props:
                        # Keep the first occurrence of each property id.
                        all_properties.setdefault(p["id"], p)

                    with status_lock:
                        if is_buy:
                            status.properties_buy = len(all_properties)
                        else:
                            status.properties_rent = len(all_properties)
                        _sync_gauges()

                    log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))

                except Exception as e:
                    # Best effort: one bad outcode must not abort the whole run.
                    msg = f"Error scraping {outcode}/{channel_name}: {e}"
                    log.error(msg)
                    scrape_errors_total.inc()
                    with status_lock:
                        status.errors.append(msg)

                # No delay needed after the last outcode of a channel.
                if i < total - 1:
                    time.sleep(DELAY_BETWEEN_OUTCODES)

            # Write parquet
            deduped = list(all_properties.values())
            output_path = DATA_DIR / f"rightmove_{file_suffix}.parquet"
            write_parquet(deduped, output_path)

            with status_lock:
                if is_buy:
                    status.properties_buy = len(deduped)
                else:
                    status.properties_rent = len(deduped)
                status.outcodes_done = total
                _sync_gauges()

            log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))

        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            _sync_gauges()
            # Read the summary values while still holding the lock.
            elapsed = status.finished_at - status.started_at
            buy_count = status.properties_buy
            rent_count = status.properties_rent
        log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
                 elapsed, buy_count, rent_count)

    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
            _sync_gauges()
    finally:
        client.close()
|