Morning improvements

This commit is contained in:
Andras Schmelczer 2026-03-17 13:29:03 +00:00
parent 3e9fba5303
commit 53fff3efaa
41 changed files with 2438 additions and 637 deletions

View file

@ -6,11 +6,14 @@ from dataclasses import dataclass, field
import polars as pl
import httpx
from constants import (
ARCGIS_PATH,
CHANNELS,
DATA_DIR,
DELAY_BETWEEN_OUTCODES,
RELOAD_URL,
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
SCRAPE_RIGHTMOVE,
@ -151,6 +154,15 @@ def build_postcode_coords() -> dict[str, tuple[float, float]]:
return coords
def _fmt_elapsed(seconds: float) -> str:
"""Format seconds as e.g. '2h13m' or '5m32s'."""
h, rem = divmod(int(seconds), 3600)
m, s = divmod(rem, 60)
if h:
return f"{h}h{m:02d}m"
return f"{m}m{s:02d}s"
def _dedup_key(p: dict) -> tuple:
"""Composite key for cross-source deduplication: (postcode, bedrooms, price).
Two listings on different portals for the same physical property will share
@ -253,6 +265,8 @@ def run_scrape(
status.hk_properties = 0
status.or_properties = 0
channel_start = time.time()
prev_prop_milestone = 0 # last 10k milestone we logged
log.info(
"=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled)
)
@ -262,14 +276,6 @@ def run_scrape(
status.outcode = outcode
status.outcodes_done = i
log.debug(
"Outcode %s (%d/%d) — %d properties so far",
outcode,
i + 1,
len(shuffled),
len(all_properties),
)
made_requests = False
# --- Rightmove ---
@ -416,14 +422,38 @@ def run_scrape(
status.or_properties = or_count
_sync_gauges()
log.info(
"Outcode %s: total %d (rm: %d, hk: %d, or: %d)",
outcode,
len(all_properties),
rm_count,
hk_count,
or_count,
)
# Log progress every 100 outcodes
done = i + 1
elapsed = time.time() - channel_start
if done % 100 == 0 or done == len(shuffled):
pct = done * 100 // len(shuffled)
rate = done / elapsed if elapsed > 0 else 0
log.info(
"%s %d/%d (%d%%) — %d props, %s elapsed, %.1f outcodes/min",
channel_name,
done,
len(shuffled),
pct,
len(all_properties),
_fmt_elapsed(elapsed),
rate * 60,
)
# Log when crossing a 10k property milestone
current_milestone = len(all_properties) // 10_000
if current_milestone > prev_prop_milestone:
prev_prop_milestone = current_milestone
log.info(
"%s %dk properties (rm: %d, hk: %d, or: %d) at outcode %d/%d [%s]",
channel_name,
current_milestone * 10,
rm_count,
hk_count,
or_count,
done,
len(shuffled),
_fmt_elapsed(elapsed),
)
if made_requests and i < len(shuffled) - 1:
time.sleep(DELAY_BETWEEN_OUTCODES)
@ -457,12 +487,30 @@ def run_scrape(
_sync_gauges()
elapsed = status.finished_at - status.started_at
log.info(
"Scrape complete in %.0fs — buy: %d, rent: %d",
elapsed,
"Scrape complete in %s — buy: %d, rent: %d",
_fmt_elapsed(elapsed),
status.properties_buy,
status.properties_rent,
)
# Trigger server data reload
if RELOAD_URL:
try:
log.info("Triggering server reload at %s", RELOAD_URL)
resp = httpx.post(RELOAD_URL, timeout=300)
if resp.is_success:
body = resp.json()
log.info(
"Server reload complete: %d rows, %d features, %dms",
body.get("rows", 0),
body.get("features", 0),
body.get("elapsed_ms", 0),
)
else:
log.warning("Server reload failed (%d): %s", resp.status_code, resp.text[:200])
except Exception as e:
log.warning("Server reload request failed: %s", e)
except Exception as e:
log.exception("Fatal scrape error")
with status_lock: