This commit is contained in:
Andras Schmelczer 2026-05-17 10:16:30 +01:00
parent 47d89f6fad
commit 017902b8e6
82 changed files with 331466 additions and 54841 deletions

View file

@@ -1,211 +1,166 @@
import argparse
import logging
import threading
import os
import tempfile
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from flask import Flask, Response, jsonify, send_from_directory
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
from constants import DATA_DIR
from constants import (
DATA_DIR,
RUN_ON_STARTUP,
SCHEDULE_HOUR,
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
SCRAPE_RIGHTMOVE,
SCRAPE_ZOOPLA,
)
from homecouk import load_cookies as load_homecouk_cookies
from openrent import load_cookies as load_openrent_cookies
from rightmove import outcode_cache
from scraper import (
_sync_gauges,
build_postcode_coords,
build_postcode_index,
load_outcodes,
run_scrape,
status,
status_lock,
# Values accepted by the --source command-line option; "all" expands to the
# three individual portals.
SOURCE_CHOICES = ("rightmove", "homecouk", "zoopla", "all")
# Per-source listing cap applied when --test is given without an explicit
# --max-properties-per-source.
TEST_MAX_PROPERTIES_PER_SOURCE = 100
# Outcodes preferred for --test runs; only the ones that are also present in
# the loaded outcode list are used.
TEST_OUTCODES = (
"E1",
"N1",
"NW1",
"SE1",
"SW1",
"W1",
"WC1",
"BR1",
"CR0",
"TW1",
)
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Log files are written under /app/data; create the directory if it is missing.
LOG_DIR = Path("/app/data")
LOG_DIR.mkdir(parents=True, exist_ok=True)
# Root logging at DEBUG, sent both to a StreamHandler and to rightmove.log.
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.StreamHandler(),
logging.FileHandler(LOG_DIR / "rightmove.log"),
],
)
log = logging.getLogger("rightmove")
log.setLevel(logging.DEBUG)
# Raise the httpx / httpcore loggers to WARNING so they emit less output.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
# NOTE(review): `log` is rebound here, so code below this line logs through
# the "finder" logger rather than "rightmove" — confirm which one is intended.
log = logging.getLogger("finder")
# Suppress noisy /metrics and /health request logs from werkzeug
class _NoiseFilter(logging.Filter):
def filter(self, record):
msg = record.getMessage()
return "GET /metrics" not in msg and "GET /health" not in msg
def configure_standalone_runtime() -> None:
    """Point cache and temp locations at ``DATA_DIR/.runtime`` for local runs.

    Creates ``.runtime/cache`` and ``.runtime/tmp`` under ``DATA_DIR``, offers
    them as defaults for ``XDG_CACHE_HOME`` and ``TMPDIR`` (values already set
    in the environment are left alone), and sets ``tempfile.tempdir`` to the
    temp directory unconditionally.
    """
    runtime_root = DATA_DIR / ".runtime"
    locations = {
        "XDG_CACHE_HOME": runtime_root / "cache",
        "TMPDIR": runtime_root / "tmp",
    }
    for folder in locations.values():
        folder.mkdir(parents=True, exist_ok=True)
    for variable, folder in locations.items():
        # setdefault: never override what the caller's environment provides.
        os.environ.setdefault(variable, str(folder))
    tempfile.tempdir = str(locations["TMPDIR"])
logging.getLogger("werkzeug").addFilter(_NoiseFilter())
# ---------------------------------------------------------------------------
# Startup: load data
# ---------------------------------------------------------------------------
# Module-level startup: load the outcode list and build the postcode index.
log.info("Loading arcgis data...")
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()
# Postcode coordinates are only built when OpenRent or Zoopla scraping is
# enabled; otherwise PC_COORDS is None.
PC_COORDS = build_postcode_coords() if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) else None
# Summarise what was loaded and which scrapers are switched on.
log.info(
"Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s, zoopla=%s)",
len(OUTCODES),
SCRAPE_RIGHTMOVE,
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
SCRAPE_ZOOPLA,
)
# ---------------------------------------------------------------------------
# Scheduler
# ---------------------------------------------------------------------------
def _start_scrape() -> bool:
    """Launch a scrape in a daemon thread unless one is already running.

    The check of ``status.state`` and its change to ``"running"`` happen
    under ``status_lock``.

    Returns:
        True if a new scrape thread was started, False if the state was
        already ``"running"``.
    """
    with status_lock:
        already_running = status.state == "running"
        if not already_running:
            status.state = "running"
    if already_running:
        return False
    # NOTE(review): the source view carries no indentation, so whether the
    # thread is started inside or outside the lock is reconstructed — confirm.
    worker = threading.Thread(
        target=run_scrape,
        args=(OUTCODES, PC_INDEX, PC_COORDS),
        daemon=True,
    )
    worker.start()
    return True


def parse_args() -> argparse.Namespace:
    """Parse the command-line options for a manual scrape run.

    Options: ``--source``, ``--output-dir``, ``--limit-outcodes``,
    ``--max-properties-per-source`` and the ``--test`` flag.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    cli = argparse.ArgumentParser(
        description="Run a manual Greater London-ish property scrape."
    )
    # Declared as a table so every option is registered the same way.
    options = (
        (
            "--source",
            {
                "choices": SOURCE_CHOICES,
                "default": "all",
                "help": "Portal to scrape. 'all' runs Rightmove, home.co.uk, and Zoopla.",
            },
        ),
        (
            "--output-dir",
            {
                "type": Path,
                "default": DATA_DIR,
                "help": f"Directory for parquet output. Defaults to {DATA_DIR}.",
            },
        ),
        (
            "--limit-outcodes",
            {
                "type": int,
                "default": None,
                "help": "Limit outcodes for a quick manual smoke test.",
            },
        ),
        (
            "--max-properties-per-source",
            {
                "type": int,
                "default": None,
                "help": "Stop each source after this many transformed listings.",
            },
        ),
        (
            "--test",
            {
                "action": "store_true",
                "help": (
                    "Run a small standalone smoke test: use likely London outcodes and "
                    f"fetch at most {TEST_MAX_PROPERTIES_PER_SOURCE} listings per source."
                ),
            },
        ),
    )
    for flag, settings in options:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def _seconds_until(hour: int) -> float:
"""Seconds from now until the next occurrence of `hour`:00 UTC."""
now = datetime.now(timezone.utc)
target = now.replace(hour=hour, minute=0, second=0, microsecond=0)
if target <= now:
target += timedelta(days=1)
return (target - now).total_seconds()
def configure_logging() -> None:
    """Configure root logging at INFO and set httpx/httpcore loggers to WARNING."""
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(message)s",
        level=logging.INFO,
    )
    for noisy in ("httpx", "httpcore"):
        logging.getLogger(noisy).setLevel(logging.WARNING)
def _scheduler_loop() -> None:
    """Loop forever, triggering one scrape per day at ``SCHEDULE_HOUR``:00 UTC.

    Each iteration sleeps until the next scheduled time and then tries to
    start a scrape; if one is already running the slot is skipped with a
    warning. The function never returns.
    """
    log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR)
    while True:
        delay = _seconds_until(SCHEDULE_HOUR)
        hours = delay / 3600
        log.info("Next scheduled scrape in %.0f seconds (%.1f hours)", delay, hours)
        time.sleep(delay)
        log.info("Scheduled scrape triggered")
        started = _start_scrape()
        if started:
            continue
        log.warning("Scheduled scrape skipped — already running")
def selected_sources(source: str) -> list[str]:
    """Expand a ``--source`` value into the list of portals to scrape.

    ``"all"`` becomes rightmove, homecouk and zoopla; any other value is
    returned as a single-element list.
    """
    every_portal = ["rightmove", "homecouk", "zoopla"]
    return every_portal if source == "all" else [source]
# When RUN_ON_STARTUP is set, try to start a scrape as soon as this
# module-level code runs.
if RUN_ON_STARTUP:
    log.info("RUN_ON_STARTUP=true — starting initial scrape")
    _start_scrape()
def main() -> int:
args = parse_args()
configure_standalone_runtime()
configure_logging()
if SCHEDULE_HOUR >= 0:
scheduler = threading.Thread(target=_scheduler_loop, daemon=True)
scheduler.start()
if args.limit_outcodes is not None and args.limit_outcodes < 1:
raise SystemExit("--limit-outcodes must be greater than zero")
if (
args.max_properties_per_source is not None
and args.max_properties_per_source < 1
):
raise SystemExit("--max-properties-per-source must be greater than zero")
output_dir = args.output_dir.expanduser().resolve()
if args.test and args.output_dir == DATA_DIR:
output_dir = (DATA_DIR / "test").expanduser().resolve()
output_dir.mkdir(parents=True, exist_ok=True)
# ---------------------------------------------------------------------------
# Flask app
# ---------------------------------------------------------------------------
app = Flask(__name__)
@app.route("/health")
def health():
    """Liveness endpoint: always answers ``ok`` with HTTP 200."""
    body, status_code = "ok", 200
    return body, status_code
@app.route("/run", methods=["POST"])
def trigger_run():
    """Start a scrape on demand.

    Responds 200 with a confirmation message when a scrape was started, or
    409 with an error message when one is already running.
    """
    if not _start_scrape():
        return jsonify({"error": "Scrape already running"}), 409
    return jsonify({"message": "Scrape started"}), 200
@app.route("/status")
def get_status():
    """Report the current scrape status as JSON.

    The fields of ``status`` are read under ``status_lock``. Elapsed time is
    measured from ``started_at`` to ``finished_at``, or to the current time
    while no finish time is recorded; it is 0.0 when no start time is set.
    When ``SCHEDULE_HOUR`` is non-negative the time to the next scheduled
    scrape is included as well.
    """
    with status_lock:
        if status.started_at:
            finished = status.finished_at or time.time()
            elapsed = finished - status.started_at
        else:
            elapsed = 0.0
        per_source = {
            "rightmove": status.rm_properties,
            "homecouk": status.hk_properties,
            "openrent": status.or_properties,
            "zoopla": status.zp_properties,
        }
        payload = {
            "state": status.state,
            "channel": status.channel,
            "outcode": status.outcode,
            "outcodes_done": status.outcodes_done,
            "outcodes_total": status.outcodes_total,
            "properties_buy": status.properties_buy,
            "properties_rent": status.properties_rent,
            "properties_by_source": per_source,
            "errors": status.errors[-20:],  # last 20 errors
            "elapsed_seconds": round(elapsed, 1),
        }
    # NOTE(review): the source view carries no indentation, so whether the
    # lines below ran inside the lock is reconstructed — confirm.
    if SCHEDULE_HOUR >= 0:
        payload["next_scrape_in_seconds"] = round(_seconds_until(SCHEDULE_HOUR))
    return jsonify(payload)
@app.route("/debug")
def get_debug():
    """Return diagnostic information as JSON.

    Includes the size of ``outcode_cache`` and its first 20 entries, the four
    scraper enable flags, and whether cookies could be loaded for home.co.uk
    and OpenRent (cookies are only loaded when that scraper is enabled).
    """
    homecouk_cookies = load_homecouk_cookies() if SCRAPE_HOMECOUK else None
    openrent_cookies = load_openrent_cookies() if SCRAPE_OPENRENT else None
    cache_sample = dict(list(outcode_cache.items())[:20])
    details = {
        "outcode_cache_size": len(outcode_cache),
        "outcode_cache_sample": cache_sample,
        "scrape_rightmove": SCRAPE_RIGHTMOVE,
        "scrape_homecouk": SCRAPE_HOMECOUK,
        "scrape_openrent": SCRAPE_OPENRENT,
        "scrape_zoopla": SCRAPE_ZOOPLA,
        "homecouk_cookies_available": homecouk_cookies is not None,
        "openrent_cookies_available": openrent_cookies is not None,
        "zoopla_note": "browser-based (Camoufox), no cookies needed",
    }
    return jsonify(details)
from scraper import (
build_postcode_coords,
build_postcode_index,
load_outcodes,
run_scrape,
)
outcodes = load_outcodes()
if args.test and args.limit_outcodes is None:
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
if preferred:
outcodes = preferred
if args.limit_outcodes is not None:
outcodes = outcodes[: args.limit_outcodes]
@app.route("/metrics")
def metrics():
    """Serve the Prometheus metrics after syncing the gauges under ``status_lock``."""
    with status_lock:
        _sync_gauges()
    # NOTE(review): the source view carries no indentation, so whether the
    # response was built inside the lock is reconstructed — confirm.
    payload = generate_latest()
    return Response(payload, mimetype=CONTENT_TYPE_LATEST)
if not outcodes:
raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")
sources = selected_sources(args.source)
max_properties_per_source = args.max_properties_per_source
if args.test and max_properties_per_source is None:
max_properties_per_source = TEST_MAX_PROPERTIES_PER_SOURCE
@app.route("/data/<filename>")
def serve_data(filename):
    """Serve a parquet file from ``DATA_DIR``.

    Any filename that does not end in ``.parquet`` is rejected with a 400
    JSON error.
    """
    is_parquet = filename.endswith(".parquet")
    if is_parquet:
        return send_from_directory(DATA_DIR, filename)
    return jsonify({"error": "Only parquet files served"}), 400
log.info(
"Starting sale scrape: source=%s outcodes=%d output_dir=%s test=%s",
args.source,
len(outcodes),
output_dir,
args.test,
)
started = time.monotonic()
pc_index = build_postcode_index()
pc_coords = build_postcode_coords() if "zoopla" in sources else None
result = run_scrape(
outcodes,
pc_index,
pc_coords=pc_coords,
sources=sources,
output_dir=output_dir,
max_properties_per_source=max_properties_per_source,
)
elapsed = time.monotonic() - started
log.info("Scrape finished in %.1fs", elapsed)
log.info("Result: %s", result)
if args.test and result.get("errors"):
raise SystemExit("Test scrape failed; see errors in the result above.")
return 0
if __name__ == "__main__":
app.run(host="0.0.0.0", port=1234, debug=False)
raise SystemExit(main())