Remove finder
This commit is contained in:
parent
55238f59aa
commit
cd778dd088
26 changed files with 0 additions and 57826 deletions
211
finder/main.py
211
finder/main.py
|
|
@ -1,211 +0,0 @@
|
|||
import logging
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from flask import Flask, Response, jsonify, send_from_directory
|
||||
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
|
||||
|
||||
from constants import (
|
||||
DATA_DIR,
|
||||
RUN_ON_STARTUP,
|
||||
SCHEDULE_HOUR,
|
||||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
SCRAPE_RIGHTMOVE,
|
||||
SCRAPE_ZOOPLA,
|
||||
)
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from openrent import load_cookies as load_openrent_cookies
|
||||
from rightmove import outcode_cache
|
||||
from scraper import (
|
||||
_sync_gauges,
|
||||
build_postcode_coords,
|
||||
build_postcode_index,
|
||||
load_outcodes,
|
||||
run_scrape,
|
||||
status,
|
||||
status_lock,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Log records go to the console and to a file inside LOG_DIR; the directory
# is created up front so the FileHandler can open its file.
LOG_DIR = Path("/app/data")
LOG_DIR.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.DEBUG,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(LOG_DIR / "rightmove.log"),
    ],
)

log = logging.getLogger("rightmove")
log.setLevel(logging.DEBUG)

# The HTTP client libraries are capped at WARNING while the root stays at DEBUG.
for _quiet_logger in ("httpx", "httpcore"):
    logging.getLogger(_quiet_logger).setLevel(logging.WARNING)
|
||||
|
||||
|
||||
# Suppress noisy /metrics and /health request logs from werkzeug
|
||||
class _NoiseFilter(logging.Filter):
    """Reject log records whose message mentions a /metrics or /health GET."""

    # Substrings that mark a record as noise.
    _MUTED_FRAGMENTS = ("GET /metrics", "GET /health")

    def filter(self, record):
        """Return False for muted records, True for everything else."""
        text = record.getMessage()
        return not any(fragment in text for fragment in self._MUTED_FRAGMENTS)


logging.getLogger("werkzeug").addFilter(_NoiseFilter())
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Startup: load data
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
log.info("Loading arcgis data...")
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()

# Postcode coordinates are only built when the OpenRent or Zoopla scraper is
# enabled; otherwise PC_COORDS stays None.
if SCRAPE_OPENRENT or SCRAPE_ZOOPLA:
    PC_COORDS = build_postcode_coords()
else:
    PC_COORDS = None

log.info(
    "Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s, zoopla=%s)",
    len(OUTCODES),
    SCRAPE_RIGHTMOVE,
    SCRAPE_HOMECOUK,
    SCRAPE_OPENRENT,
    SCRAPE_ZOOPLA,
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scheduler
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _start_scrape() -> bool:
    """Try to start a scrape in a background daemon thread.

    The shared status is flipped to "running" under `status_lock` so that
    concurrent callers cannot both start a scrape.

    Returns:
        True if a scrape thread was started, False if one is already running.

    Raises:
        Exception: whatever `Thread.start()` raises (e.g. RuntimeError when
            the thread cannot be started). The previous state is restored
            first, so a failed start does not leave the status stuck at
            "running" with no worker behind it.
    """
    with status_lock:
        if status.state == "running":
            return False
        previous_state = status.state
        status.state = "running"

    thread = threading.Thread(
        target=run_scrape, args=(OUTCODES, PC_INDEX, PC_COORDS), daemon=True
    )
    try:
        thread.start()
    except Exception:
        # No worker exists to ever leave the "running" state, so undo the
        # claim; otherwise every later attempt would be rejected.
        with status_lock:
            status.state = previous_state
        raise
    return True
|
||||
|
||||
|
||||
def _seconds_until(hour: int, now=None) -> float:
    """Return the seconds from `now` until the next occurrence of `hour`:00 UTC.

    Args:
        hour: Hour of the day in UTC.
        now: Optional reference datetime; defaults to the current UTC time.
            A naive value is taken to be UTC, an aware value is converted
            to UTC.

    Returns:
        A positive number of seconds, at most 86400. When `now` falls exactly
        on `hour`:00 the following day's occurrence is used.

    Raises:
        ValueError: if `hour` is not a valid hour (raised by
            `datetime.replace`).
    """
    if now is None:
        now = datetime.now(timezone.utc)
    elif now.tzinfo is None:
        now = now.replace(tzinfo=timezone.utc)
    else:
        now = now.astimezone(timezone.utc)

    target = now.replace(hour=hour, minute=0, second=0, microsecond=0)
    if target <= now:
        # Today's slot has already passed (or is right now): use tomorrow's.
        target += timedelta(days=1)
    return (target - now).total_seconds()
|
||||
|
||||
|
||||
def _scheduler_loop() -> None:
    """Trigger a scrape every day at SCHEDULE_HOUR UTC; never returns.

    Each cycle sleeps until the next SCHEDULE_HOUR:00 UTC and then tries to
    start a scrape. A failure to start is logged and the loop carries on, so
    a single error does not end scheduling for the rest of the process
    lifetime.
    """
    log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR)
    while True:
        wait = _seconds_until(SCHEDULE_HOUR)
        log.info(
            "Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600
        )
        time.sleep(wait)
        log.info("Scheduled scrape triggered")
        try:
            started = _start_scrape()
        except Exception:
            # An uncaught exception here would terminate this thread and
            # silently stop all future scheduled scrapes.
            log.exception("Scheduled scrape failed to start")
            continue
        if not started:
            log.warning("Scheduled scrape skipped — already running")
|
||||
|
||||
|
||||
# Optionally kick off one scrape as soon as the module is loaded.
if RUN_ON_STARTUP:
    log.info("RUN_ON_STARTUP=true — starting initial scrape")
    _start_scrape()

# The daily scheduler thread is only started for a non-negative SCHEDULE_HOUR.
if SCHEDULE_HOUR >= 0:
    scheduler = threading.Thread(daemon=True, target=_scheduler_loop)
    scheduler.start()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Flask app
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
@app.route("/health")
def health():
    """Liveness endpoint: always answers 200 with the body "ok"."""
    body, code = "ok", 200
    return body, code
|
||||
|
||||
|
||||
@app.route("/run", methods=["POST"])
def trigger_run():
    """Start a scrape on demand.

    Responds 200 when a scrape was started and 409 when one is already
    running.
    """
    if not _start_scrape():
        return jsonify({"error": "Scrape already running"}), 409
    return jsonify({"message": "Scrape started"}), 200
|
||||
|
||||
|
||||
@app.route("/status")
def get_status():
    """Return the current scrape status as JSON.

    The status fields are read while holding `status_lock`. When the
    scheduler is enabled (SCHEDULE_HOUR >= 0) the response also carries the
    number of seconds until the next scheduled scrape.
    """
    with status_lock:
        if status.started_at:
            # While the scrape has no finish time yet, measure up to now.
            stop = status.finished_at or time.time()
            elapsed = stop - status.started_at
        else:
            elapsed = 0.0

        by_source = {
            "rightmove": status.rm_properties,
            "homecouk": status.hk_properties,
            "openrent": status.or_properties,
            "zoopla": status.zp_properties,
        }
        resp = {
            "state": status.state,
            "channel": status.channel,
            "outcode": status.outcode,
            "outcodes_done": status.outcodes_done,
            "outcodes_total": status.outcodes_total,
            "properties_buy": status.properties_buy,
            "properties_rent": status.properties_rent,
            "properties_by_source": by_source,
            "errors": status.errors[-20:],  # only the 20 most recent errors
            "elapsed_seconds": round(elapsed, 1),
        }

    if SCHEDULE_HOUR >= 0:
        resp["next_scrape_in_seconds"] = round(_seconds_until(SCHEDULE_HOUR))
    return jsonify(resp)
|
||||
|
||||
|
||||
@app.route("/debug")
def get_debug():
    """Return diagnostic information as JSON.

    Reports the size of the outcode cache with a sample of up to 20 entries,
    which scrapers are enabled, and whether cookies could be loaded for the
    cookie-based scrapers (only attempted for enabled ones).
    """
    hk_cookies = None
    if SCRAPE_HOMECOUK:
        hk_cookies = load_homecouk_cookies()

    or_cookies = None
    if SCRAPE_OPENRENT:
        or_cookies = load_openrent_cookies()

    cache_size = len(outcode_cache)
    # Copy the items into a list first, then keep the first 20 of them.
    cache_sample = dict(list(outcode_cache.items())[:20])

    payload = {
        "outcode_cache_size": cache_size,
        "outcode_cache_sample": cache_sample,
        "scrape_rightmove": SCRAPE_RIGHTMOVE,
        "scrape_homecouk": SCRAPE_HOMECOUK,
        "scrape_openrent": SCRAPE_OPENRENT,
        "scrape_zoopla": SCRAPE_ZOOPLA,
        "homecouk_cookies_available": hk_cookies is not None,
        "openrent_cookies_available": or_cookies is not None,
        "zoopla_note": "browser-based (Camoufox), no cookies needed",
    }
    return jsonify(payload)
|
||||
|
||||
|
||||
@app.route("/metrics")
def metrics():
    """Expose Prometheus metrics.

    `_sync_gauges()` is called under `status_lock` before the registry is
    rendered with `generate_latest()`.
    """
    with status_lock:
        _sync_gauges()
    exposition = generate_latest()
    return Response(exposition, mimetype=CONTENT_TYPE_LATEST)
|
||||
|
||||
|
||||
@app.route("/data/<filename>")
def serve_data(filename):
    """Serve a parquet file from DATA_DIR.

    Any filename that does not end in ".parquet" is rejected with a 400
    JSON error.
    """
    if filename.endswith(".parquet"):
        return send_from_directory(DATA_DIR, filename)
    return jsonify({"error": "Only parquet files served"}), 400
|
||||
|
||||
|
||||
# Run the Flask server on all interfaces, port 1234, when executed as a script.
if __name__ == "__main__":
    app.run(debug=False, host="0.0.0.0", port=1234)
|
||||
Loading…
Add table
Add a link
Reference in a new issue