perfect-postcode/finder/main.py
2026-03-12 22:11:29 +00:00

173 lines
5.8 KiB
Python

import logging
import threading
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from flask import Flask, Response, jsonify, send_from_directory
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE
from homecouk import load_cookies as load_homecouk_cookies
from openrent import load_cookies as load_openrent_cookies
from rightmove import outcode_cache
from scraper import (
_sync_gauges,
build_postcode_coords,
build_postcode_index,
load_outcodes,
run_scrape,
status,
status_lock,
)
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Directory that holds the log file; created up front so the FileHandler
# below can open its file inside it.
LOG_DIR = Path("/app/data")
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Root logger configuration: DEBUG and above, emitted both to a stream
# handler and to a file under LOG_DIR.
_stream_handler = logging.StreamHandler()
_file_handler = logging.FileHandler(LOG_DIR / "rightmove.log")
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.DEBUG,
    handlers=[_stream_handler, _file_handler],
)

# Logger used throughout this module.
log = logging.getLogger("rightmove")
log.setLevel(logging.DEBUG)

# Cap the HTTP client loggers at WARNING so their DEBUG/INFO records are
# not emitted through the DEBUG-level root configuration above.
for _capped_logger in ("httpx", "httpcore"):
    logging.getLogger(_capped_logger).setLevel(logging.WARNING)
# ---------------------------------------------------------------------------
# Startup: load data
# ---------------------------------------------------------------------------
# Load the lookup data once at import time; the scrape thread receives
# these three objects as its arguments.
log.info("Loading arcgis data...")
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()

# Postcode coordinates are only built when the OpenRent scraper is enabled;
# otherwise the value stays None.
PC_COORDS = None
if SCRAPE_OPENRENT:
    PC_COORDS = build_postcode_coords()

log.info(
    "Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)",
    len(OUTCODES),
    SCRAPE_RIGHTMOVE,
    SCRAPE_HOMECOUK,
    SCRAPE_OPENRENT,
)
# ---------------------------------------------------------------------------
# Scheduler
# ---------------------------------------------------------------------------
def _start_scrape() -> bool:
    """Try to start a scrape in a background daemon thread.

    The "running" state is claimed while holding ``status_lock`` so two
    concurrent callers cannot both start a scrape.

    Returns:
        True if a new scrape thread was started, False if ``status.state``
        was already "running".

    Raises:
        RuntimeError: re-raised from ``Thread.start()`` when the thread
            cannot be started. The previous state is restored first.
    """
    with status_lock:
        if status.state == "running":
            return False
        previous_state = status.state
        status.state = "running"

    worker = threading.Thread(
        target=run_scrape,
        args=(OUTCODES, PC_INDEX, PC_COORDS),
        daemon=True,
    )
    try:
        worker.start()
    except RuntimeError:
        # The scrape thread never ran, so nothing else will ever clear the
        # "running" flag. Without this rollback every later trigger would be
        # rejected as "already running".
        with status_lock:
            status.state = previous_state
        raise
    return True
def _seconds_until(hour: int) -> float:
"""Seconds from now until the next occurrence of `hour`:00 UTC."""
now = datetime.now(timezone.utc)
target = now.replace(hour=hour, minute=0, second=0, microsecond=0)
if target <= now:
target += timedelta(days=1)
return (target - now).total_seconds()
def _scheduler_loop() -> None:
    """Run forever, triggering a scrape each day at SCHEDULE_HOUR UTC.

    Intended as the target of a background thread: it never returns. Each
    iteration sleeps until the next occurrence of SCHEDULE_HOUR and then
    attempts to start a scrape, logging a warning when one is already
    running.
    """
    log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR)
    while True:
        delay = _seconds_until(SCHEDULE_HOUR)
        log.info(
            "Next scheduled scrape in %.0f seconds (%.1f hours)",
            delay,
            delay / 3600,
        )
        time.sleep(delay)

        log.info("Scheduled scrape triggered")
        started = _start_scrape()
        if started:
            continue
        log.warning("Scheduled scrape skipped — already running")
# Optionally kick off one scrape immediately when the module is loaded.
if RUN_ON_STARTUP:
    log.info("RUN_ON_STARTUP=true — starting initial scrape")
    _start_scrape()

# The daily scheduler thread is only started for a non-negative
# SCHEDULE_HOUR; a negative value leaves scheduling disabled.
if SCHEDULE_HOUR >= 0:
    scheduler = threading.Thread(
        daemon=True,
        target=_scheduler_loop,
    )
    scheduler.start()
# ---------------------------------------------------------------------------
# Flask app
# ---------------------------------------------------------------------------
# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)
@app.route("/run", methods=["POST"])
def trigger_run():
    """Start a scrape on demand.

    Responds 200 with a confirmation message when a scrape was started, or
    409 with an error message when one is already running.
    """
    started = _start_scrape()
    if not started:
        return jsonify({"error": "Scrape already running"}), 409
    return jsonify({"message": "Scrape started"}), 200
@app.route("/status")
def get_status():
    """Report the current scrape status as JSON.

    The snapshot is read while holding ``status_lock``. Elapsed time is
    measured from ``started_at`` to ``finished_at``, or to the current time
    when no finish time is recorded; it is 0.0 when no start time is set.
    When the scheduler is enabled (SCHEDULE_HOUR >= 0) the response also
    carries the number of seconds until the next scheduled scrape.
    """
    with status_lock:
        if status.started_at:
            stop = status.finished_at or time.time()
            elapsed = stop - status.started_at
        else:
            elapsed = 0.0

        by_source = {
            "rightmove": status.rm_properties,
            "homecouk": status.hk_properties,
            "openrent": status.or_properties,
        }
        resp = {
            "state": status.state,
            "channel": status.channel,
            "outcode": status.outcode,
            "outcodes_done": status.outcodes_done,
            "outcodes_total": status.outcodes_total,
            "properties_buy": status.properties_buy,
            "properties_rent": status.properties_rent,
            "properties_by_source": by_source,
            # Only the 20 most recent errors are reported.
            "errors": status.errors[-20:],
            "elapsed_seconds": round(elapsed, 1),
        }

    if SCHEDULE_HOUR >= 0:
        resp["next_scrape_in_seconds"] = round(_seconds_until(SCHEDULE_HOUR))
    return jsonify(resp)
@app.route("/debug")
def get_debug():
    """Return diagnostic information as JSON.

    Includes the size of the Rightmove outcode cache and up to 20 of its
    entries, which scrapers are enabled, and whether cookies could be
    loaded for the home.co.uk and OpenRent scrapers. Cookies are only
    loaded for scrapers that are enabled.
    """
    hk_cookies = None
    if SCRAPE_HOMECOUK:
        hk_cookies = load_homecouk_cookies()

    or_cookies = None
    if SCRAPE_OPENRENT:
        or_cookies = load_openrent_cookies()

    cache_size = len(outcode_cache)
    # Snapshot the items into a list first, then keep the first 20.
    cache_sample = dict(list(outcode_cache.items())[:20])

    payload = {
        "outcode_cache_size": cache_size,
        "outcode_cache_sample": cache_sample,
        "scrape_rightmove": SCRAPE_RIGHTMOVE,
        "scrape_homecouk": SCRAPE_HOMECOUK,
        "scrape_openrent": SCRAPE_OPENRENT,
        "homecouk_cookies_available": hk_cookies is not None,
        "openrent_cookies_available": or_cookies is not None,
    }
    return jsonify(payload)
@app.route("/metrics")
def metrics():
    """Serve Prometheus metrics.

    The gauges are synchronised under ``status_lock`` before the latest
    metrics are rendered with the Prometheus content type.
    """
    with status_lock:
        _sync_gauges()
    body = generate_latest()
    return Response(body, mimetype=CONTENT_TYPE_LATEST)
@app.route("/data/<filename>")
def serve_data(filename):
    """Serve a parquet file from DATA_DIR.

    Any filename that does not end in ".parquet" is rejected with a 400
    JSON error.
    """
    if filename.endswith(".parquet"):
        return send_from_directory(DATA_DIR, filename)
    return jsonify({"error": "Only parquet files served"}), 400
if __name__ == "__main__":
    # Direct execution: serve on all interfaces, port 1234, with Flask's
    # debug mode off.
    app.run(
        host="0.0.0.0",
        port=1234,
        debug=False,
    )