changes

2026-02-18 21:22:15 +00:00 · 2026-02-18 21:22:15 +00:00 · ffe080adef
commit ffe080adef
parent 524580eb25
82 changed files with 2652 additions and 2956 deletions
--- a/finder/main.py
+++ b/finder/main.py
@@ -1,12 +1,13 @@
 import logging
 import threading
 import time
+from datetime import datetime, timedelta, timezone
 from pathlib import Path

 from flask import Flask, Response, jsonify, send_from_directory
 from prometheus_client import generate_latest, CONTENT_TYPE_LATEST

-from constants import DATA_DIR
+from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR
 from rightmove import outcode_cache
 from scraper import (
    _sync_gauges,
@@ -46,6 +47,52 @@ OUTCODES = load_outcodes()
 PC_INDEX = build_postcode_index()
 log.info("Ready — %d outcodes, postcode index built", len(OUTCODES))

+# ---------------------------------------------------------------------------
+# Scheduler
+# ---------------------------------------------------------------------------
+
+
+def _start_scrape() -> bool:
+    """Try to start a scrape. Returns True if started, False if already running."""
+    with status_lock:
+        if status.state == "running":
+            return False
+        status.state = "running"
+    thread = threading.Thread(target=run_scrape, args=(OUTCODES, PC_INDEX), daemon=True)
+    thread.start()
+    return True
+
+
+def _seconds_until(hour: int) -> float:
+    """Seconds from now until the next occurrence of `hour`:00 UTC."""
+    now = datetime.now(timezone.utc)
+    target = now.replace(hour=hour, minute=0, second=0, microsecond=0)
+    if target <= now:
+        target += timedelta(days=1)
+    return (target - now).total_seconds()
+
+
+def _scheduler_loop() -> None:
+    """Background thread that triggers a daily scrape at SCHEDULE_HOUR UTC."""
+    log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR)
+    while True:
+        wait = _seconds_until(SCHEDULE_HOUR)
+        log.info("Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600)
+        time.sleep(wait)
+        log.info("Scheduled scrape triggered")
+        if not _start_scrape():
+            log.warning("Scheduled scrape skipped — already running")
+
+
+if RUN_ON_STARTUP:
+    log.info("RUN_ON_STARTUP=true — starting initial scrape")
+    _start_scrape()
+
+if SCHEDULE_HOUR >= 0:
+    scheduler = threading.Thread(target=_scheduler_loop, daemon=True)
+    scheduler.start()
+
+
 # ---------------------------------------------------------------------------
 # Flask app
 # ---------------------------------------------------------------------------
@@ -55,14 +102,9 @@ app = Flask(__name__)

 @app.route("/run", methods=["POST"])
 def trigger_run():
-    with status_lock:
-        if status.state == "running":
-            return jsonify({"error": "Scrape already running"}), 409
-        status.state = "running"
-
-    thread = threading.Thread(target=run_scrape, args=(OUTCODES, PC_INDEX), daemon=True)
-    thread.start()
-    return jsonify({"message": "Scrape started"}), 200
+    if _start_scrape():
+        return jsonify({"message": "Scrape started"}), 200
+    return jsonify({"error": "Scrape already running"}), 409


 @app.route("/status")
@@ -72,7 +114,7 @@ def get_status():
        if status.started_at:
            end = status.finished_at if status.finished_at else time.time()
            elapsed = end - status.started_at
-        return jsonify({
+        resp = {
            "state": status.state,
            "channel": status.channel,
            "outcode": status.outcode,
@@ -82,7 +124,10 @@ def get_status():
            "properties_rent": status.properties_rent,
            "errors": status.errors[-20:],  # last 20 errors
            "elapsed_seconds": round(elapsed, 1),
-        })
+        }
+        if SCHEDULE_HOUR >= 0:
+            resp["next_scrape_in_seconds"] = round(_seconds_until(SCHEDULE_HOUR))
+        return jsonify(resp)


 @app.route("/debug")