Morning improvements
This commit is contained in:
parent
3e9fba5303
commit
53fff3efaa
41 changed files with 2438 additions and 637 deletions
|
|
@ -6,11 +6,14 @@ from dataclasses import dataclass, field
|
|||
|
||||
import polars as pl
|
||||
|
||||
import httpx
|
||||
|
||||
from constants import (
|
||||
ARCGIS_PATH,
|
||||
CHANNELS,
|
||||
DATA_DIR,
|
||||
DELAY_BETWEEN_OUTCODES,
|
||||
RELOAD_URL,
|
||||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
SCRAPE_RIGHTMOVE,
|
||||
|
|
@ -151,6 +154,15 @@ def build_postcode_coords() -> dict[str, tuple[float, float]]:
|
|||
return coords
|
||||
|
||||
|
||||
def _fmt_elapsed(seconds: float) -> str:
|
||||
"""Format seconds as e.g. '2h13m' or '5m32s'."""
|
||||
h, rem = divmod(int(seconds), 3600)
|
||||
m, s = divmod(rem, 60)
|
||||
if h:
|
||||
return f"{h}h{m:02d}m"
|
||||
return f"{m}m{s:02d}s"
|
||||
|
||||
|
||||
def _dedup_key(p: dict) -> tuple:
|
||||
"""Composite key for cross-source deduplication: (postcode, bedrooms, price).
|
||||
Two listings on different portals for the same physical property will share
|
||||
|
|
@ -253,6 +265,8 @@ def run_scrape(
|
|||
status.hk_properties = 0
|
||||
status.or_properties = 0
|
||||
|
||||
channel_start = time.time()
|
||||
prev_prop_milestone = 0 # last 10k milestone we logged
|
||||
log.info(
|
||||
"=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled)
|
||||
)
|
||||
|
|
@ -262,14 +276,6 @@ def run_scrape(
|
|||
status.outcode = outcode
|
||||
status.outcodes_done = i
|
||||
|
||||
log.debug(
|
||||
"Outcode %s (%d/%d) — %d properties so far",
|
||||
outcode,
|
||||
i + 1,
|
||||
len(shuffled),
|
||||
len(all_properties),
|
||||
)
|
||||
|
||||
made_requests = False
|
||||
|
||||
# --- Rightmove ---
|
||||
|
|
@ -416,14 +422,38 @@ def run_scrape(
|
|||
status.or_properties = or_count
|
||||
_sync_gauges()
|
||||
|
||||
log.info(
|
||||
"Outcode %s: total %d (rm: %d, hk: %d, or: %d)",
|
||||
outcode,
|
||||
len(all_properties),
|
||||
rm_count,
|
||||
hk_count,
|
||||
or_count,
|
||||
)
|
||||
# Log progress every 100 outcodes
|
||||
done = i + 1
|
||||
elapsed = time.time() - channel_start
|
||||
if done % 100 == 0 or done == len(shuffled):
|
||||
pct = done * 100 // len(shuffled)
|
||||
rate = done / elapsed if elapsed > 0 else 0
|
||||
log.info(
|
||||
"%s %d/%d (%d%%) — %d props, %s elapsed, %.1f outcodes/min",
|
||||
channel_name,
|
||||
done,
|
||||
len(shuffled),
|
||||
pct,
|
||||
len(all_properties),
|
||||
_fmt_elapsed(elapsed),
|
||||
rate * 60,
|
||||
)
|
||||
|
||||
# Log when crossing a 10k property milestone
|
||||
current_milestone = len(all_properties) // 10_000
|
||||
if current_milestone > prev_prop_milestone:
|
||||
prev_prop_milestone = current_milestone
|
||||
log.info(
|
||||
"%s %dk properties (rm: %d, hk: %d, or: %d) at outcode %d/%d [%s]",
|
||||
channel_name,
|
||||
current_milestone * 10,
|
||||
rm_count,
|
||||
hk_count,
|
||||
or_count,
|
||||
done,
|
||||
len(shuffled),
|
||||
_fmt_elapsed(elapsed),
|
||||
)
|
||||
|
||||
if made_requests and i < len(shuffled) - 1:
|
||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||
|
|
@ -457,12 +487,30 @@ def run_scrape(
|
|||
_sync_gauges()
|
||||
elapsed = status.finished_at - status.started_at
|
||||
log.info(
|
||||
"Scrape complete in %.0fs — buy: %d, rent: %d",
|
||||
elapsed,
|
||||
"Scrape complete in %s — buy: %d, rent: %d",
|
||||
_fmt_elapsed(elapsed),
|
||||
status.properties_buy,
|
||||
status.properties_rent,
|
||||
)
|
||||
|
||||
# Trigger server data reload
|
||||
if RELOAD_URL:
|
||||
try:
|
||||
log.info("Triggering server reload at %s", RELOAD_URL)
|
||||
resp = httpx.post(RELOAD_URL, timeout=300)
|
||||
if resp.is_success:
|
||||
body = resp.json()
|
||||
log.info(
|
||||
"Server reload complete: %d rows, %d features, %dms",
|
||||
body.get("rows", 0),
|
||||
body.get("features", 0),
|
||||
body.get("elapsed_ms", 0),
|
||||
)
|
||||
else:
|
||||
log.warning("Server reload failed (%d): %s", resp.status_code, resp.text[:200])
|
||||
except Exception as e:
|
||||
log.warning("Server reload request failed: %s", e)
|
||||
|
||||
except Exception as e:
|
||||
log.exception("Fatal scrape error")
|
||||
with status_lock:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue