import logging
import threading
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path

from flask import Flask, Response, jsonify, send_from_directory
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST

from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE
from homecouk import load_cookies as load_homecouk_cookies
from openrent import load_cookies as load_openrent_cookies
from rightmove import outcode_cache
from scraper import (
    _sync_gauges,
    build_postcode_coords,
    build_postcode_index,
    load_outcodes,
    run_scrape,
    status,
    status_lock,
)

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

LOG_DIR = Path("/app/data")
LOG_DIR.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(LOG_DIR / "rightmove.log"),
    ],
)
log = logging.getLogger("rightmove")
log.setLevel(logging.DEBUG)

# Keep the HTTP client libraries quiet; only their warnings and above are logged.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)

# ---------------------------------------------------------------------------
# Startup: load data
# ---------------------------------------------------------------------------

log.info("Loading arcgis data...")
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()
# Postcode coordinates are only built when the OpenRent scraper is enabled.
PC_COORDS = build_postcode_coords() if SCRAPE_OPENRENT else None
log.info("Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)",
         len(OUTCODES), SCRAPE_RIGHTMOVE, SCRAPE_HOMECOUK, SCRAPE_OPENRENT)

# ---------------------------------------------------------------------------
# Scheduler
# ---------------------------------------------------------------------------


def _start_scrape() -> bool:
    """Try to start a scrape in a background daemon thread.

    The "running" state is claimed under ``status_lock`` so that two
    concurrent callers cannot both start a scrape.

    Returns:
        True if a scrape was started, False if one was already running.
    """
    with status_lock:
        if status.state == "running":
            return False
        status.state = "running"
    thread = threading.Thread(
        target=run_scrape,
        args=(OUTCODES, PC_INDEX, PC_COORDS),
        daemon=True,
    )
    thread.start()
    return True


def _seconds_until(hour: int) -> float:
    """Seconds from now until the next occurrence of `hour`:00 UTC."""
    now = datetime.now(timezone.utc)
    target = now.replace(hour=hour, minute=0, second=0, microsecond=0)
    if target <= now:
        # Today's slot has already passed (or is exactly now): use tomorrow's.
        target += timedelta(days=1)
    return (target - now).total_seconds()


def _scheduler_loop() -> None:
    """Background thread that triggers a daily scrape at SCHEDULE_HOUR UTC."""
    log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR)
    while True:
        wait = _seconds_until(SCHEDULE_HOUR)
        log.info("Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600)
        time.sleep(wait)
        log.info("Scheduled scrape triggered")
        if not _start_scrape():
            log.warning("Scheduled scrape skipped — already running")


if RUN_ON_STARTUP:
    log.info("RUN_ON_STARTUP=true — starting initial scrape")
    _start_scrape()

# A negative SCHEDULE_HOUR disables the daily scheduler thread.
if SCHEDULE_HOUR >= 0:
    scheduler = threading.Thread(target=_scheduler_loop, daemon=True)
    scheduler.start()

# ---------------------------------------------------------------------------
# Flask app
# ---------------------------------------------------------------------------

app = Flask(__name__)


@app.route("/run", methods=["POST"])
def trigger_run():
    """Start a scrape on demand; respond 409 if one is already running."""
    if _start_scrape():
        return jsonify({"message": "Scrape started"}), 200
    return jsonify({"error": "Scrape already running"}), 409


@app.route("/status")
def get_status():
    """Return a JSON snapshot of the current scrape status."""
    with status_lock:
        elapsed = 0.0
        if status.started_at:
            # While the scrape is still in progress, measure up to "now".
            end = status.finished_at if status.finished_at else time.time()
            elapsed = end - status.started_at
        resp = {
            "state": status.state,
            "channel": status.channel,
            "outcode": status.outcode,
            "outcodes_done": status.outcodes_done,
            "outcodes_total": status.outcodes_total,
            "properties_buy": status.properties_buy,
            "properties_rent": status.properties_rent,
            "properties_by_source": {
                "rightmove": status.rm_properties,
                "homecouk": status.hk_properties,
                "openrent": status.or_properties,
            },
            "errors": status.errors[-20:],  # last 20 errors
            "elapsed_seconds": round(elapsed, 1),
        }
    if SCHEDULE_HOUR >= 0:
        resp["next_scrape_in_seconds"] = round(_seconds_until(SCHEDULE_HOUR))
    return jsonify(resp)


@app.route("/debug")
def get_debug():
    """Return JSON diagnostics: outcode cache size/sample, scraper flags, cookie availability."""
    hk_cookies = load_homecouk_cookies() if SCRAPE_HOMECOUK else None
    or_cookies = load_openrent_cookies() if SCRAPE_OPENRENT else None
    return jsonify({
        "outcode_cache_size": len(outcode_cache),
        "outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
        "scrape_rightmove": SCRAPE_RIGHTMOVE,
        "scrape_homecouk": SCRAPE_HOMECOUK,
        "scrape_openrent": SCRAPE_OPENRENT,
        "homecouk_cookies_available": hk_cookies is not None,
        "openrent_cookies_available": or_cookies is not None,
    })


@app.route("/metrics")
def metrics():
    """Expose Prometheus metrics, syncing the gauges under the status lock first."""
    with status_lock:
        _sync_gauges()
    return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST)


# The rule must declare the ``filename`` URL variable: without it Flask calls
# the view with no arguments and every request fails with a TypeError.
@app.route("/data/<path:filename>")
def serve_data(filename):
    """Serve a ``.parquet`` file from DATA_DIR.

    Args:
        filename: Path of the requested file, taken from the URL after ``/data/``.

    Returns:
        The file response, or a 400 JSON error when the name does not end
        in ``.parquet``.
    """
    if not filename.endswith(".parquet"):
        return jsonify({"error": "Only parquet files served"}), 400
    return send_from_directory(DATA_DIR, filename)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=1234, debug=False)