More
This commit is contained in:
parent
128b3191e7
commit
03445188ea
54 changed files with 596953 additions and 3577 deletions
191
finder/scraper.py
Normal file
191
finder/scraper.py
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
import logging
|
||||
import random
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import polars as pl
|
||||
|
||||
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
|
||||
from http_client import make_client
|
||||
from metrics import (
|
||||
scrape_elapsed_seconds,
|
||||
scrape_errors_total,
|
||||
scrape_outcodes_done,
|
||||
scrape_outcodes_total,
|
||||
scrape_properties_total,
|
||||
scrape_state,
|
||||
)
|
||||
from rightmove import resolve_outcode_id, search_outcode
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from storage import write_parquet
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScrapeStatus:
|
||||
state: str = "idle" # idle | running | done | error
|
||||
channel: str = ""
|
||||
outcode: str = ""
|
||||
outcodes_done: int = 0
|
||||
outcodes_total: int = 0
|
||||
properties_buy: int = 0
|
||||
properties_rent: int = 0
|
||||
errors: list[str] = field(default_factory=list)
|
||||
started_at: float = 0.0
|
||||
finished_at: float = 0.0
|
||||
|
||||
|
||||
status = ScrapeStatus()
|
||||
status_lock = threading.Lock()
|
||||
|
||||
|
||||
def _sync_gauges() -> None:
|
||||
"""Push current ScrapeStatus values into Prometheus gauges. Must hold status_lock."""
|
||||
for state in ("idle", "running", "done", "error"):
|
||||
scrape_state.labels(state=state).set(1 if status.state == state else 0)
|
||||
scrape_outcodes_done.set(status.outcodes_done)
|
||||
scrape_outcodes_total.set(status.outcodes_total)
|
||||
scrape_properties_total.labels(channel="buy").set(status.properties_buy)
|
||||
scrape_properties_total.labels(channel="rent").set(status.properties_rent)
|
||||
if status.started_at:
|
||||
end = status.finished_at if status.finished_at else time.time()
|
||||
scrape_elapsed_seconds.set(end - status.started_at)
|
||||
else:
|
||||
scrape_elapsed_seconds.set(0)
|
||||
|
||||
|
||||
def load_outcodes() -> list[str]:
|
||||
"""Load England-only outcodes from arcgis parquet."""
|
||||
log.info("Loading outcodes from %s", ARCGIS_PATH)
|
||||
df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
|
||||
england = df.filter(pl.col("ctry") == "E92000001")
|
||||
log.info("England postcodes: %d", len(england))
|
||||
|
||||
outcodes = (
|
||||
england.select(pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode"))
|
||||
.drop_nulls()
|
||||
.get_column("outcode")
|
||||
.unique()
|
||||
.sort()
|
||||
.to_list()
|
||||
)
|
||||
log.info("Unique England outcodes: %d", len(outcodes))
|
||||
return outcodes
|
||||
|
||||
|
||||
def build_postcode_index() -> PostcodeSpatialIndex:
|
||||
"""Build spatial index from arcgis England postcodes."""
|
||||
log.info("Building postcode spatial index from %s", ARCGIS_PATH)
|
||||
df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
|
||||
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])
|
||||
return PostcodeSpatialIndex(
|
||||
england.get_column("lat").to_list(),
|
||||
england.get_column("long").to_list(),
|
||||
england.get_column("pcd").to_list(),
|
||||
)
|
||||
|
||||
|
||||
def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
||||
"""Main scrape loop — runs in background thread."""
|
||||
global status
|
||||
with status_lock:
|
||||
status.state = "running"
|
||||
status.started_at = time.time()
|
||||
status.errors = []
|
||||
status.properties_buy = 0
|
||||
status.properties_rent = 0
|
||||
_sync_gauges()
|
||||
|
||||
# Shuffle for geographic diversity
|
||||
shuffled = list(outcodes)
|
||||
random.seed(SEED)
|
||||
random.shuffle(shuffled)
|
||||
|
||||
client = make_client()
|
||||
|
||||
try:
|
||||
for channel_cfg in CHANNELS:
|
||||
channel_name = channel_cfg["channel"]
|
||||
file_suffix = "buy" if channel_name == "BUY" else "rent"
|
||||
all_properties: dict[int, dict] = {} # dedup by id
|
||||
|
||||
with status_lock:
|
||||
status.channel = channel_name
|
||||
status.outcodes_done = 0
|
||||
status.outcodes_total = len(shuffled)
|
||||
|
||||
log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))
|
||||
|
||||
for i, outcode in enumerate(shuffled):
|
||||
with status_lock:
|
||||
status.outcode = outcode
|
||||
status.outcodes_done = i
|
||||
|
||||
log.debug("Outcode %s (%d/%d) — %d properties so far",
|
||||
outcode, i + 1, len(shuffled), len(all_properties))
|
||||
|
||||
try:
|
||||
outcode_id = resolve_outcode_id(client, outcode)
|
||||
if not outcode_id:
|
||||
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
|
||||
continue
|
||||
|
||||
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
|
||||
for p in props:
|
||||
pid = p["id"]
|
||||
if pid not in all_properties:
|
||||
all_properties[pid] = p
|
||||
|
||||
with status_lock:
|
||||
if channel_name == "BUY":
|
||||
status.properties_buy = len(all_properties)
|
||||
else:
|
||||
status.properties_rent = len(all_properties)
|
||||
_sync_gauges()
|
||||
|
||||
log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))
|
||||
|
||||
except Exception as e:
|
||||
msg = f"Error scraping {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
scrape_errors_total.inc()
|
||||
with status_lock:
|
||||
status.errors.append(msg)
|
||||
|
||||
if i < len(shuffled) - 1:
|
||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||
|
||||
# Write parquet
|
||||
deduped = list(all_properties.values())
|
||||
output_path = DATA_DIR / f"rightmove_{file_suffix}.parquet"
|
||||
write_parquet(deduped, output_path)
|
||||
|
||||
with status_lock:
|
||||
if channel_name == "BUY":
|
||||
status.properties_buy = len(deduped)
|
||||
else:
|
||||
status.properties_rent = len(deduped)
|
||||
status.outcodes_done = len(shuffled)
|
||||
_sync_gauges()
|
||||
|
||||
log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))
|
||||
|
||||
with status_lock:
|
||||
status.state = "done"
|
||||
status.finished_at = time.time()
|
||||
_sync_gauges()
|
||||
elapsed = status.finished_at - status.started_at
|
||||
log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
|
||||
elapsed, status.properties_buy, status.properties_rent)
|
||||
|
||||
except Exception as e:
|
||||
log.exception("Fatal scrape error")
|
||||
with status_lock:
|
||||
status.state = "error"
|
||||
status.errors.append(f"Fatal: {e}")
|
||||
status.finished_at = time.time()
|
||||
_sync_gauges()
|
||||
finally:
|
||||
client.close()
|
||||
Loading…
Add table
Add a link
Reference in a new issue