perfect-postcode/finder/scraper.py
2026-02-18 21:22:15 +00:00

191 lines
6.7 KiB
Python

import logging
import random
import threading
import time
from dataclasses import dataclass, field
import polars as pl
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
from http_client import make_client
from metrics import (
scrape_elapsed_seconds,
scrape_errors_total,
scrape_outcodes_done,
scrape_outcodes_total,
scrape_properties_total,
scrape_state,
)
from rightmove import resolve_outcode_id, search_outcode
from spatial import PostcodeSpatialIndex
from storage import write_parquet
# Module logger. It uses the fixed name "rightmove" rather than this module's own name.
log = logging.getLogger("rightmove")
@dataclass
class ScrapeStatus:
    """Progress record for a scrape run.

    A single module-level instance of this class is mutated in place while a
    scrape runs and is mirrored into the metrics gauges.
    """

    # Lifecycle marker: idle | running | done | error
    state: str = "idle"
    # Channel currently being processed ("" before any channel has started).
    channel: str = ""
    # Outcode currently being processed.
    outcode: str = ""
    # Progress through the outcode list for the current channel.
    outcodes_done: int = 0
    outcodes_total: int = 0
    # Unique property counts collected per channel.
    properties_buy: int = 0
    properties_rent: int = 0
    # Human-readable error messages collected during the run.
    errors: list[str] = field(default_factory=list)
    # time.time() stamps; 0.0 is treated as "not set".
    started_at: float = 0.0
    finished_at: float = 0.0
# Module-level scrape state: mutated in place by run_scrape and read by _sync_gauges.
status = ScrapeStatus()
# Guards reads and writes of `status`; _sync_gauges expects its caller to hold this lock.
status_lock = threading.Lock()
def _sync_gauges() -> None:
    """Copy the current ``status`` values into the Prometheus gauges.

    The caller must already hold ``status_lock``; this function does not
    acquire it.
    """
    # One gauge sample per known state: 1 for the active state, 0 for the rest.
    active = status.state
    for name in ("idle", "running", "done", "error"):
        flag = 1 if name == active else 0
        scrape_state.labels(state=name).set(flag)

    scrape_outcodes_done.set(status.outcodes_done)
    scrape_outcodes_total.set(status.outcodes_total)

    per_channel = (
        ("buy", status.properties_buy),
        ("rent", status.properties_rent),
    )
    for channel, count in per_channel:
        scrape_properties_total.labels(channel=channel).set(count)

    # No start time recorded yet: report zero elapsed.
    if not status.started_at:
        scrape_elapsed_seconds.set(0)
        return

    # While no finish time is recorded, measure against the current time.
    end = status.finished_at or time.time()
    scrape_elapsed_seconds.set(end - status.started_at)
def load_outcodes() -> list[str]:
    """Load England-only outcodes from arcgis parquet.

    Reads the postcode (``pcd``) and country (``ctry``) columns from
    ``ARCGIS_PATH``, keeps rows whose country code is ``E92000001``, extracts
    the leading outcode portion of each postcode with a regex, and discards
    postcodes the regex does not match.

    Returns:
        The unique outcodes, sorted ascending.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    # Only the postcode and country columns are used below, so the coordinate
    # columns are not requested from the parquet file.
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry"])
    england = df.filter(pl.col("ctry") == "E92000001")
    log.info("England postcodes: %d", len(england))

    outcodes = (
        england.select(
            pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode")
        )
        .drop_nulls()  # postcodes that did not match the outcode pattern
        .get_column("outcode")
        .unique()
        .sort()
        .to_list()
    )
    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes
def build_postcode_index() -> PostcodeSpatialIndex:
    """Construct the postcode spatial index from the arcgis England rows.

    Rows are restricted to country code ``E92000001`` and rows missing either
    coordinate are dropped before the index is built.

    Returns:
        A ``PostcodeSpatialIndex`` built from the latitude, longitude and
        postcode columns (passed in that order).
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)

    wanted_columns = ["pcd", "ctry", "lat", "long"]
    frame = pl.read_parquet(ARCGIS_PATH, columns=wanted_columns)

    is_england = pl.col("ctry") == "E92000001"
    located = frame.filter(is_england).drop_nulls(subset=["lat", "long"])

    latitudes = located.get_column("lat").to_list()
    longitudes = located.get_column("long").to_list()
    postcodes = located.get_column("pcd").to_list()
    return PostcodeSpatialIndex(latitudes, longitudes, postcodes)
def _record_property_count(channel_name: str, count: int) -> None:
    """Store ``count`` in the status field for ``channel_name``. Must hold status_lock."""
    if channel_name == "BUY":
        status.properties_buy = count
    else:
        status.properties_rent = count


def _scrape_channel(client, channel_cfg: dict, shuffled: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Scrape every outcode for one channel and write that channel's parquet file.

    Per-outcode failures are logged, counted and appended to ``status.errors``
    without aborting the channel. Anything raised outside the per-outcode
    ``try`` (for example while writing the parquet file) propagates to the
    caller.
    """
    channel_name = channel_cfg["channel"]
    file_suffix = "buy" if channel_name == "BUY" else "rent"
    total = len(shuffled)
    all_properties: dict[int, dict] = {}  # dedup by id

    with status_lock:
        status.channel = channel_name
        status.outcodes_done = 0
        status.outcodes_total = total

    log.info("=== Starting %s channel (%d outcodes) ===", channel_name, total)

    for i, outcode in enumerate(shuffled):
        with status_lock:
            status.outcode = outcode
            status.outcodes_done = i

        log.debug("Outcode %s (%d/%d) — %d properties so far",
                  outcode, i + 1, total, len(all_properties))

        try:
            outcode_id = resolve_outcode_id(client, outcode)
            if not outcode_id:
                log.debug("No Rightmove ID for outcode %s, skipping", outcode)
                # As in the original flow, a skipped outcode also skips the delay below.
                continue

            props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
            for p in props:
                # Keep the first occurrence of each property id.
                all_properties.setdefault(p["id"], p)

            with status_lock:
                _record_property_count(channel_name, len(all_properties))
                _sync_gauges()

            log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))
        except Exception as e:
            # Best effort: one failing outcode must not stop the whole channel.
            msg = f"Error scraping {outcode}/{channel_name}: {e}"
            log.error(msg)
            scrape_errors_total.inc()
            with status_lock:
                status.errors.append(msg)

        # No need to wait after the final outcode.
        if i < total - 1:
            time.sleep(DELAY_BETWEEN_OUTCODES)

    # Write parquet
    deduped = list(all_properties.values())
    output_path = DATA_DIR / f"online_listings_{file_suffix}.parquet"
    write_parquet(deduped, output_path, channel=file_suffix)

    with status_lock:
        _record_property_count(channel_name, len(deduped))
        status.outcodes_done = total
        _sync_gauges()

    log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))


def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Main scrape loop — runs in background thread.

    Marks the shared ``status`` as running, shuffles a copy of ``outcodes``
    deterministically using ``SEED``, scrapes every configured channel in
    ``CHANNELS`` and finally marks the status as ``done``. Any exception that
    escapes the channel loop (including a failure to create the HTTP client)
    is logged and recorded as state ``error`` instead of being raised.

    Args:
        outcodes: Outcodes to scrape; the list itself is not modified.
        pc_index: Spatial index forwarded to ``search_outcode``.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        # Clear the previous run's end time; otherwise _sync_gauges would
        # compute elapsed time against a stale finish stamp on a re-run.
        status.finished_at = 0.0
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0
        _sync_gauges()

    # Shuffle for geographic diversity. A private Random seeded with SEED
    # yields the same order as seeding the module-level generator, without
    # resetting the process-wide random state.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)

    client = None
    try:
        # Created inside the try so a failure here is reported as an "error"
        # state rather than leaving the status stuck on "running".
        client = make_client()

        for channel_cfg in CHANNELS:
            _scrape_channel(client, channel_cfg, shuffled, pc_index)

        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            _sync_gauges()
            # Snapshot under the lock so the log line is self-consistent.
            elapsed = status.finished_at - status.started_at
            buy_count = status.properties_buy
            rent_count = status.properties_rent

        log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
                 elapsed, buy_count, rent_count)
    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
            _sync_gauges()
    finally:
        if client is not None:
            client.close()