This commit is contained in:
Andras Schmelczer 2026-02-18 21:22:15 +00:00
parent 524580eb25
commit ffe080adef
82 changed files with 2652 additions and 2956 deletions

View file

@ -1,12 +1,13 @@
import logging
import threading
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from flask import Flask, Response, jsonify, send_from_directory
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
from constants import DATA_DIR
from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR
from rightmove import outcode_cache
from scraper import (
_sync_gauges,
@ -46,6 +47,52 @@ OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()
log.info("Ready — %d outcodes, postcode index built", len(OUTCODES))
# ---------------------------------------------------------------------------
# Scheduler
# ---------------------------------------------------------------------------
def _start_scrape() -> bool:
    """Launch a background scrape unless one is already in progress.

    The check of `status.state` and the switch to "running" both happen
    while `status_lock` is held, so the decision and the state change are
    made as one step.

    Returns:
        True when a new scrape thread was started, False when the state was
        already "running".
    """
    with status_lock:
        already_running = status.state == "running"
        if not already_running:
            # Claim the slot before releasing the lock.
            status.state = "running"
    if already_running:
        return False

    worker = threading.Thread(
        target=run_scrape,
        args=(OUTCODES, PC_INDEX),
        daemon=True,
    )
    worker.start()
    return True
def _seconds_until(hour: int, now: "datetime | None" = None) -> float:
    """Return the seconds from `now` until the next `hour`:00 UTC.

    Args:
        hour: Hour of the day in UTC (0-23).
        now: Reference instant. Defaults to the current UTC time. An aware
            datetime in any zone is converted to UTC; a naive datetime is
            interpreted as system-local time by `astimezone`.

    Returns:
        A value in the range (0, 86400]. When `now` falls exactly on or
        after today's `hour`:00, the next day's occurrence is used, so the
        result is never zero or negative.

    Raises:
        ValueError: If `hour` is outside 0-23 (raised by `datetime.replace`).
    """
    if now is None:
        now = datetime.now(timezone.utc)
    else:
        # Normalise so that replace(hour=...) below operates on the UTC hour.
        now = now.astimezone(timezone.utc)
    target = now.replace(hour=hour, minute=0, second=0, microsecond=0)
    if target <= now:
        # Today's slot has already passed (or is right now): roll to tomorrow.
        target += timedelta(days=1)
    return (target - now).total_seconds()
def _scheduler_loop() -> None:
    """Run forever, attempting one scrape per day at SCHEDULE_HOUR:00 UTC.

    Each iteration sleeps until the next scheduled time and then tries to
    start a scrape; if one is already running, the attempt is logged and
    skipped. This function never returns.
    """
    log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR)
    while True:
        delay = _seconds_until(SCHEDULE_HOUR)
        log.info("Next scheduled scrape in %.0f seconds (%.1f hours)", delay, delay / 3600)
        time.sleep(delay)
        log.info("Scheduled scrape triggered")
        started = _start_scrape()
        if started:
            continue
        log.warning("Scheduled scrape skipped — already running")
# Optionally kick off a scrape as soon as the module is loaded.
if RUN_ON_STARTUP:
    log.info("RUN_ON_STARTUP=true — starting initial scrape")
    _start_scrape()

# The daily scheduler thread is only started for a non-negative SCHEDULE_HOUR.
if SCHEDULE_HOUR >= 0:
    scheduler = threading.Thread(
        target=_scheduler_loop,
        daemon=True,
    )
    scheduler.start()
# ---------------------------------------------------------------------------
# Flask app
# ---------------------------------------------------------------------------
@ -55,14 +102,9 @@ app = Flask(__name__)
@app.route("/run", methods=["POST"])
def trigger_run():
    """Handle POST /run by starting a scrape in the background.

    Delegates to `_start_scrape()` so that the HTTP endpoint and the
    scheduler share a single check-and-start code path instead of each
    carrying its own copy of the locking logic.

    Returns:
        A `(response, status)` tuple: 200 with a "Scrape started" message
        when a scrape was started, or 409 with an error message when one
        is already running.
    """
    if _start_scrape():
        return jsonify({"message": "Scrape started"}), 200
    return jsonify({"error": "Scrape already running"}), 409
@app.route("/status")
@ -72,7 +114,7 @@ def get_status():
if status.started_at:
end = status.finished_at if status.finished_at else time.time()
elapsed = end - status.started_at
return jsonify({
resp = {
"state": status.state,
"channel": status.channel,
"outcode": status.outcode,
@ -82,7 +124,10 @@ def get_status():
"properties_rent": status.properties_rent,
"errors": status.errors[-20:], # last 20 errors
"elapsed_seconds": round(elapsed, 1),
})
}
if SCHEDULE_HOUR >= 0:
resp["next_scrape_in_seconds"] = round(_seconds_until(SCHEDULE_HOUR))
return jsonify(resp)
@app.route("/debug")