all good

2026-05-17 10:16:30 +01:00 · 2026-05-17 10:16:30 +01:00 · 017902b8e6
commit 017902b8e6
parent 47d89f6fad
82 changed files with 331466 additions and 54841 deletions
--- a/finder/main.py
+++ b/finder/main.py
@ -1,211 +1,166 @@
+import argparse
 import logging
-import threading
+import os
+import tempfile
 import time
-from datetime import datetime, timedelta, timezone
 from pathlib import Path

-from flask import Flask, Response, jsonify, send_from_directory
-from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
+from constants import DATA_DIR

-from constants import (
-    DATA_DIR,
-    RUN_ON_STARTUP,
-    SCHEDULE_HOUR,
-    SCRAPE_HOMECOUK,
-    SCRAPE_OPENRENT,
-    SCRAPE_RIGHTMOVE,
-    SCRAPE_ZOOPLA,
-)
-from homecouk import load_cookies as load_homecouk_cookies
-from openrent import load_cookies as load_openrent_cookies
-from rightmove import outcode_cache
-from scraper import (
-    _sync_gauges,
-    build_postcode_coords,
-    build_postcode_index,
-    load_outcodes,
-    run_scrape,
-    status,
-    status_lock,
+
+# Accepted values for --source; "all" is expanded to every individual portal.
+SOURCE_CHOICES = ("rightmove", "homecouk", "zoopla", "all")
+# Per-source listing cap used by --test when no explicit cap is given.
+TEST_MAX_PROPERTIES_PER_SOURCE = 100
+# Outcodes preferred for --test runs; only those present in the loaded
+# outcode list are used.
+TEST_OUTCODES = (
+    "E1",
+    "N1",
+    "NW1",
+    "SE1",
+    "SW1",
+    "W1",
+    "WC1",
+    "BR1",
+    "CR0",
+    "TW1",
 )

-# ---------------------------------------------------------------------------
-# Logging
-# ---------------------------------------------------------------------------
-
-LOG_DIR = Path("/app/data")
-LOG_DIR.mkdir(parents=True, exist_ok=True)
-
-logging.basicConfig(
-    level=logging.DEBUG,
-    format="%(asctime)s [%(levelname)s] %(message)s",
-    handlers=[
-        logging.StreamHandler(),
-        logging.FileHandler(LOG_DIR / "rightmove.log"),
-    ],
-)
-log = logging.getLogger("rightmove")
-log.setLevel(logging.DEBUG)
-logging.getLogger("httpx").setLevel(logging.WARNING)
-logging.getLogger("httpcore").setLevel(logging.WARNING)
+# Module-level logger used by this command-line entry point.
+log = logging.getLogger("finder")


-# Suppress noisy /metrics and /health request logs from werkzeug
-class _NoiseFilter(logging.Filter):
-    def filter(self, record):
-        msg = record.getMessage()
-        return "GET /metrics" not in msg and "GET /health" not in msg
+def configure_standalone_runtime() -> None:
+    """Point cache and temp locations at ``DATA_DIR/.runtime``.
+
+    Creates ``cache`` and ``tmp`` below ``DATA_DIR/.runtime``, fills in
+    ``XDG_CACHE_HOME`` and ``TMPDIR`` when they are not already set, and
+    makes ``tmp`` the default directory used by :mod:`tempfile`.
+    """
+    base = DATA_DIR / ".runtime"
+    locations = {"XDG_CACHE_HOME": base / "cache", "TMPDIR": base / "tmp"}
+
+    for folder in locations.values():
+        folder.mkdir(parents=True, exist_ok=True)
+
+    # setdefault keeps any value that is already present in the environment.
+    for variable, folder in locations.items():
+        os.environ.setdefault(variable, str(folder))
+
+    # Unlike the environment variables, this override is unconditional.
+    tempfile.tempdir = str(locations["TMPDIR"])


-logging.getLogger("werkzeug").addFilter(_NoiseFilter())
-
-# ---------------------------------------------------------------------------
-# Startup: load data
-# ---------------------------------------------------------------------------
-
-log.info("Loading arcgis data...")
-OUTCODES = load_outcodes()
-PC_INDEX = build_postcode_index()
-PC_COORDS = build_postcode_coords() if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) else None
-log.info(
-    "Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s, zoopla=%s)",
-    len(OUTCODES),
-    SCRAPE_RIGHTMOVE,
-    SCRAPE_HOMECOUK,
-    SCRAPE_OPENRENT,
-    SCRAPE_ZOOPLA,
-)
-
-# ---------------------------------------------------------------------------
-# Scheduler
-# ---------------------------------------------------------------------------
-
-
-def _start_scrape() -> bool:
-    """Try to start a scrape. Returns True if started, False if already running."""
-    with status_lock:
-        if status.state == "running":
-            return False
-        status.state = "running"
-    thread = threading.Thread(
-        target=run_scrape, args=(OUTCODES, PC_INDEX, PC_COORDS), daemon=True
+def parse_args() -> argparse.Namespace:
+    """Build the command-line parser and parse the process arguments.
+
+    Returns:
+        The namespace from ``ArgumentParser.parse_args()`` with the
+        attributes ``source``, ``output_dir``, ``limit_outcodes``,
+        ``max_properties_per_source`` and ``test``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Run a manual Greater London-ish property scrape."
     )
-    thread.start()
-    return True
+    parser.add_argument(
+        "--source",
+        choices=SOURCE_CHOICES,
+        default="all",
+        help="Portal to scrape. 'all' runs Rightmove, home.co.uk, and Zoopla.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=DATA_DIR,
+        help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
+    )
+    # The two integer options are only converted with int() here; no range
+    # check is applied by the parser itself.
+    parser.add_argument(
+        "--limit-outcodes",
+        type=int,
+        default=None,
+        help="Limit outcodes for a quick manual smoke test.",
+    )
+    parser.add_argument(
+        "--max-properties-per-source",
+        type=int,
+        default=None,
+        help="Stop each source after this many transformed listings.",
+    )
+    parser.add_argument(
+        "--test",
+        action="store_true",
+        help=(
+            "Run a small standalone smoke test: use likely London outcodes and "
+            f"fetch at most {TEST_MAX_PROPERTIES_PER_SOURCE} listings per source."
+        ),
+    )
+    return parser.parse_args()


-def _seconds_until(hour: int) -> float:
-    """Seconds from now until the next occurrence of `hour`:00 UTC."""
-    now = datetime.now(timezone.utc)
-    target = now.replace(hour=hour, minute=0, second=0, microsecond=0)
-    if target <= now:
-        target += timedelta(days=1)
-    return (target - now).total_seconds()
+def configure_logging() -> None:
+    """Configure root logging at INFO and cap the HTTP client loggers."""
+    message_format = "%(asctime)s [%(levelname)s] %(message)s"
+    logging.basicConfig(level=logging.INFO, format=message_format)
+
+    # The httpx and httpcore loggers are limited to WARNING and above.
+    for noisy_logger in ("httpx", "httpcore"):
+        logging.getLogger(noisy_logger).setLevel(logging.WARNING)


-def _scheduler_loop() -> None:
-    """Background thread that triggers a daily scrape at SCHEDULE_HOUR UTC."""
-    log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR)
-    while True:
-        wait = _seconds_until(SCHEDULE_HOUR)
-        log.info(
-            "Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600
-        )
-        time.sleep(wait)
-        log.info("Scheduled scrape triggered")
-        if not _start_scrape():
-            log.warning("Scheduled scrape skipped — already running")
+def selected_sources(source: str) -> list[str]:
+    """Expand a ``--source`` value into the list of portals to scrape.
+
+    ``"all"`` becomes every portal name; any other value is returned as a
+    single-element list.
+    """
+    return ["rightmove", "homecouk", "zoopla"] if source == "all" else [source]


-if RUN_ON_STARTUP:
-    log.info("RUN_ON_STARTUP=true — starting initial scrape")
-    _start_scrape()
+def main() -> int:
+    """Run one manual scrape from the command line.
+
+    Parses the arguments, prepares the runtime directories and logging,
+    selects the outcodes and sources, calls ``run_scrape`` and logs the
+    elapsed time and the result.
+
+    Returns:
+        0 when the run completes.
+
+    Raises:
+        SystemExit: if ``--limit-outcodes`` or ``--max-properties-per-source``
+            is below 1, if no outcodes remain to scrape, or if ``--test`` is
+            set and the result reports errors.
+    """
+    args = parse_args()
+    configure_standalone_runtime()
+    configure_logging()

-if SCHEDULE_HOUR >= 0:
-    scheduler = threading.Thread(target=_scheduler_loop, daemon=True)
-    scheduler.start()
+    if args.limit_outcodes is not None and args.limit_outcodes < 1:
+        raise SystemExit("--limit-outcodes must be greater than zero")
+    if (
+        args.max_properties_per_source is not None
+        and args.max_properties_per_source < 1
+    ):
+        raise SystemExit("--max-properties-per-source must be greater than zero")

+    output_dir = args.output_dir.expanduser().resolve()
+    # Test runs write to DATA_DIR/test when --output-dir equals DATA_DIR.
+    if args.test and args.output_dir == DATA_DIR:
+        output_dir = (DATA_DIR / "test").expanduser().resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)

-# ---------------------------------------------------------------------------
-# Flask app
-# ---------------------------------------------------------------------------
-
-app = Flask(__name__)
-
-
-@app.route("/health")
-def health():
-    return "ok", 200
-
-
-@app.route("/run", methods=["POST"])
-def trigger_run():
-    if _start_scrape():
-        return jsonify({"message": "Scrape started"}), 200
-    return jsonify({"error": "Scrape already running"}), 409
-
-
-@app.route("/status")
-def get_status():
-    with status_lock:
-        elapsed = 0.0
-        if status.started_at:
-            end = status.finished_at if status.finished_at else time.time()
-            elapsed = end - status.started_at
-        resp = {
-            "state": status.state,
-            "channel": status.channel,
-            "outcode": status.outcode,
-            "outcodes_done": status.outcodes_done,
-            "outcodes_total": status.outcodes_total,
-            "properties_buy": status.properties_buy,
-            "properties_rent": status.properties_rent,
-            "properties_by_source": {
-                "rightmove": status.rm_properties,
-                "homecouk": status.hk_properties,
-                "openrent": status.or_properties,
-                "zoopla": status.zp_properties,
-            },
-            "errors": status.errors[-20:],  # last 20 errors
-            "elapsed_seconds": round(elapsed, 1),
-        }
-        if SCHEDULE_HOUR >= 0:
-            resp["next_scrape_in_seconds"] = round(_seconds_until(SCHEDULE_HOUR))
-        return jsonify(resp)
-
-
-@app.route("/debug")
-def get_debug():
-    hk_cookies = load_homecouk_cookies() if SCRAPE_HOMECOUK else None
-    or_cookies = load_openrent_cookies() if SCRAPE_OPENRENT else None
-    return jsonify(
-        {
-            "outcode_cache_size": len(outcode_cache),
-            "outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
-            "scrape_rightmove": SCRAPE_RIGHTMOVE,
-            "scrape_homecouk": SCRAPE_HOMECOUK,
-            "scrape_openrent": SCRAPE_OPENRENT,
-            "scrape_zoopla": SCRAPE_ZOOPLA,
-            "homecouk_cookies_available": hk_cookies is not None,
-            "openrent_cookies_available": or_cookies is not None,
-            "zoopla_note": "browser-based (Camoufox), no cookies needed",
-        }
+    # NOTE(review): presumably imported here so the runtime and logging
+    # setup above is in effect before scraper is loaded; confirm.
+    from scraper import (
+        build_postcode_coords,
+        build_postcode_index,
+        load_outcodes,
+        run_scrape,
     )

+    outcodes = load_outcodes()
+    if args.test and args.limit_outcodes is None:
+        # Build the lookup set once instead of once per candidate outcode.
+        available = set(outcodes)
+        preferred = [outcode for outcode in TEST_OUTCODES if outcode in available]
+        # Keep the full list when none of the preferred outcodes is loaded.
+        if preferred:
+            outcodes = preferred
+    if args.limit_outcodes is not None:
+        outcodes = outcodes[: args.limit_outcodes]

-@app.route("/metrics")
-def metrics():
-    with status_lock:
-        _sync_gauges()
-    return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST)
+    if not outcodes:
+        raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")

+    sources = selected_sources(args.source)
+    max_properties_per_source = args.max_properties_per_source
+    # An explicit --max-properties-per-source wins over the --test default.
+    if args.test and max_properties_per_source is None:
+        max_properties_per_source = TEST_MAX_PROPERTIES_PER_SOURCE

-@app.route("/data/<filename>")
-def serve_data(filename):
-    if not filename.endswith(".parquet"):
-        return jsonify({"error": "Only parquet files served"}), 400
-    return send_from_directory(DATA_DIR, filename)
+    log.info(
+        "Starting sale scrape: source=%s outcodes=%d output_dir=%s test=%s",
+        args.source,
+        len(outcodes),
+        output_dir,
+        args.test,
+    )
+    started = time.monotonic()
+
+    pc_index = build_postcode_index()
+    # Postcode coordinates are only built when zoopla is among the sources.
+    pc_coords = build_postcode_coords() if "zoopla" in sources else None
+    result = run_scrape(
+        outcodes,
+        pc_index,
+        pc_coords=pc_coords,
+        sources=sources,
+        output_dir=output_dir,
+        max_properties_per_source=max_properties_per_source,
+    )
+
+    elapsed = time.monotonic() - started
+    log.info("Scrape finished in %.1fs", elapsed)
+    log.info("Result: %s", result)
+    # Assumes result is mapping-like with an optional "errors" entry; verify
+    # against run_scrape.
+    if args.test and result.get("errors"):
+        raise SystemExit("Test scrape failed; see errors in the result above.")
+    return 0


 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=1234, debug=False)
+    # Use main()'s return value as the process exit status.
+    raise SystemExit(main())