fix zoopla bug

2026-03-24 22:30:49 +00:00 · 2026-03-24 22:30:49 +00:00 · 1dfa0e0009
commit 1dfa0e0009
parent 96dfdd7491
4 changed files with 215 additions and 4 deletions
--- a/finder/constants.py
+++ b/finder/constants.py
@ -10,6 +10,7 @@ MAX_RETRIES = 3
 RETRY_BASE_DELAY = 2.0
 GRID_CELL_SIZE = 0.01  # degrees for postcode spatial index
 SEED = 42
 CHECKPOINT_INTERVAL = int(os.environ.get("CHECKPOINT_INTERVAL", "900"))  # seconds
 # Schedule: hour of day (UTC) to auto-run scrape. Set to -1 to disable.
 SCHEDULE_HOUR = int(os.environ.get("SCHEDULE_HOUR", "3"))
--- a/finder/openrent.py
+++ b/finder/openrent.py
@ -624,9 +624,13 @@ def _resolve_outcode_postcodes(
    pc_coords: dict[str, tuple[float, float]],
 ) -> list[str]:
    """Get all postcodes for an outcode from the postcode coordinates lookup."""
    # ONSPD 7-char format: 4-char outcodes have no space before incode
    # (e.g., "BH191AB"), while shorter outcodes do (e.g., "E14 5AB").
    prefix = outcode + " "
-    # Also try without space for non-standard format (e.g. "SW1Y" matches "SW1Y 4AA")
+    results = [pcd for pcd in pc_coords if pcd.startswith(prefix)]
-    return [pcd for pcd in pc_coords if pcd.startswith(prefix)]
+    if not results and len(outcode) >= 4:
        results = [pcd for pcd in pc_coords if pcd.startswith(outcode) and len(pcd) > len(outcode)]
    return results
 def transform_property(
--- a/finder/scraper.py
+++ b/finder/scraper.py
@ -1,3 +1,4 @@
 import json
 import logging
 import random
 import threading
@ -11,6 +12,7 @@ import httpx
 from constants import (
    ARCGIS_PATH,
    CHANNELS,
    CHECKPOINT_INTERVAL,
    DATA_DIR,
    DELAY_BETWEEN_OUTCODES,
    RELOAD_URL,
@ -233,6 +235,135 @@ def _merge_channel(
    return all_properties, counts, total_dedup
 # ---------------------------------------------------------------------------
 # Checkpointing — save/resume partial results across crashes
 # ---------------------------------------------------------------------------
 def _checkpoint_meta_path():
    """Return the path of the checkpoint metadata file inside ``DATA_DIR``."""
    meta_file = DATA_DIR / "checkpoint.json"
    return meta_file
 def _checkpoint_results_path(source: str, channel: str):
    """Return the partial-results file for one source/channel pair.

    The file lives in ``DATA_DIR`` and is named
    ``checkpoint_<source>_<channel>.json``.
    """
    filename = "checkpoint_{}_{}.json".format(source, channel)
    return DATA_DIR / filename
 def _save_checkpoint(
    shuffled: list[str],
    progress: _Progress,
    source_results: dict[str, dict[str, list]],
    active_sources: list[str],
 ) -> None:
    """Save per-source progress indices and partial results to disk.

    The checkpoint is written in two phases:

    1. Every results file and the metadata file are staged as sibling
       ``.tmp`` files.
    2. Only once all of them have been written are they moved into place,
       with the metadata file last.

    If anything fails, the staged files are removed and the metadata of the
    previous checkpoint is left untouched, so progress indices on disk are
    never published for results that could not be written.

    Args:
        shuffled: The outcode list being scraped; only its length is stored,
            so a later load can detect a different run configuration.
        progress: Progress tracker; ``snapshot()`` is read as a mapping of
            source name to number of completed outcodes.
        source_results: Per-source dicts with ``"BUY"`` / ``"RENT"`` lists of
            properties collected so far.
        active_sources: Source names to include in this checkpoint.

    Returns:
        None. File-system and serialisation errors are logged as warnings and
        swallowed (best-effort); errors from ``progress.snapshot()`` propagate.
    """
    snap = progress.snapshot()
    completed = {s: snap.get(s, 0) for s in active_sources}
    meta = {
        "seed": SEED,
        "num_outcodes": len(shuffled),
        "sources": completed,
        "timestamp": time.time(),
    }

    staged: list[tuple] = []  # (temp path, final path), in publish order
    total = 0
    target = None  # file currently being handled, for the warning message
    try:
        # Phase 1: stage result files per source per channel.
        # NOTE(review): the lists may still be appended to by worker threads
        # while they are serialised, so results can run slightly ahead of the
        # snapshot — presumably absorbed by de-duplication at merge; confirm.
        for source in active_sources:
            results = source_results.get(source, {})
            for ch_key in ("BUY", "RENT"):
                props = results.get(ch_key, [])
                target = _checkpoint_results_path(source, ch_key.lower())
                tmp = target.with_suffix(".tmp")
                staged.append((tmp, target))
                with open(tmp, "w", encoding="utf-8") as f:
                    # default=str stringifies any value json cannot encode.
                    json.dump(props, f, default=str)
                total += len(props)

        target = _checkpoint_meta_path()
        tmp = target.with_suffix(".tmp")
        staged.append((tmp, target))  # metadata is published last
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(meta, f)

        # Phase 2: publish. Path.replace overwrites an existing target on
        # every platform; Path.rename raises on Windows if the target exists.
        for tmp, target in staged:
            tmp.replace(target)
    except Exception as e:
        log.warning(
            "Failed to write checkpoint %s: %s",
            target.name if target is not None else "?",
            e,
        )
        # Drop whatever was staged but not published.
        for tmp, _ in staged:
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass
        return

    log.info(
        "Checkpoint saved: %s (%d properties)",
        completed,
        total,
    )
 def _read_checkpoint_channel(source: str, channel: str, required: bool):
    """Read one results file; return its list, or None if it is unusable.

    A missing file counts as an empty list unless ``required`` is true.
    Unusable files are reported with a warning before None is returned.
    """
    rpath = _checkpoint_results_path(source, channel)
    try:
        with open(rpath, encoding="utf-8") as f:
            props = json.load(f)
    except FileNotFoundError:
        if not required:
            return []
        log.warning(
            "Checkpoint results for %s/%s missing, restarting %s",
            source, channel, source,
        )
        return None
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        props = None
    if not isinstance(props, list):
        log.warning(
            "Checkpoint results for %s/%s corrupt, restarting %s",
            source, channel, source,
        )
        return None
    return props


 def _load_checkpoint(
    shuffled: list[str],
 ) -> tuple[dict[str, int], dict[str, dict[str, list]]] | None:
    """Load checkpoint if it exists and matches the current outcode list.

    The checkpoint is discarded (and its files removed) when the metadata
    cannot be parsed, is not a JSON object, or was written with a different
    ``SEED`` or outcode count. A single source is restarted from index 0 with
    empty results when its progress index is not an integer within
    ``0..len(shuffled)``, or when one of its results files is unreadable, is
    not a JSON list, or is missing although progress was recorded.

    Args:
        shuffled: The outcode list of the current run; only its length is
            compared against the stored metadata.

    Returns:
        ``(start_indices, loaded_results)`` where ``start_indices`` maps
        source name to the number of outcodes already completed and
        ``loaded_results`` maps source name to ``{"BUY": [...], "RENT": [...]}``;
        or None if there is no valid checkpoint.
    """
    path = _checkpoint_meta_path()
    try:
        with open(path, encoding="utf-8") as f:
            meta = json.load(f)
    except FileNotFoundError:
        return None
    except (OSError, ValueError):
        meta = None

    sources = meta.get("sources", {}) if isinstance(meta, dict) else None
    if not isinstance(sources, dict):
        # Valid JSON of the wrong shape is as useless as unparsable JSON.
        log.warning("Checkpoint file corrupt, starting fresh")
        _clear_checkpoint()
        return None

    if meta.get("seed") != SEED or meta.get("num_outcodes") != len(shuffled):
        log.info("Checkpoint from different run configuration, discarding")
        _clear_checkpoint()
        return None

    start_indices: dict[str, int] = {}
    loaded_results: dict[str, dict[str, list]] = {}
    for source, completed in sources.items():
        results: dict[str, list] = {"BUY": [], "RENT": []}
        index_ok = (
            isinstance(completed, int)
            and not isinstance(completed, bool)
            and 0 <= completed <= len(shuffled)
        )
        if not index_ok:
            log.warning(
                "Checkpoint progress for %s invalid (%r), restarting %s",
                source, completed, source,
            )
            completed = 0
        else:
            for channel in ("buy", "rent"):
                # With recorded progress a missing file means lost results,
                # so the source must be scraped again from the start.
                props = _read_checkpoint_channel(
                    source, channel, required=completed > 0,
                )
                if props is None:
                    completed = 0
                    results = {"BUY": [], "RENT": []}
                    break
                results[channel.upper()] = props
        start_indices[source] = completed
        loaded_results[source] = results

    saved_at = meta.get("timestamp", 0)
    if isinstance(saved_at, bool) or not isinstance(saved_at, (int, float)):
        saved_at = 0
    elapsed_since = time.time() - saved_at
    log.info(
        "Resuming from checkpoint (saved %.0fm ago): %s",
        elapsed_since / 60,
        start_indices,
    )
    return start_indices, loaded_results
 def _clear_checkpoint() -> None:
    """Remove all checkpoint files (metadata, results and leftover temp files).

    Used after a scrape completes successfully and when an existing
    checkpoint turns out to be corrupt or from a different run configuration.

    Removal is best-effort: a file that has already disappeared is ignored,
    and any other failure is logged rather than raised. Failures are logged
    because a checkpoint left on disk would be picked up again by the next
    ``_load_checkpoint`` call.
    """
    for path in DATA_DIR.glob("checkpoint*"):
        try:
            path.unlink(missing_ok=True)
        except OSError as e:
            log.warning("Failed to remove checkpoint file %s: %s", path.name, e)
 def run_scrape(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
@ -293,15 +424,40 @@ def run_scrape(
    progress = _Progress()
    # --- Resume from checkpoint if available ---
    start_indices: dict[str, int] = {}
    checkpoint = _load_checkpoint(shuffled)
    if checkpoint:
        start_indices, loaded = checkpoint
        source_to_results = {"rm": rm_results, "hk": hk_results, "or": or_results, "zp": zp_results}
        for src, data in loaded.items():
            if src in source_to_results:
                for ch in ("BUY", "RENT"):
                    source_to_results[src][ch] = data.get(ch, [])
        # Reassign in case references changed
        rm_results = source_to_results["rm"]
        hk_results = source_to_results["hk"]
        or_results = source_to_results["or"]
        zp_results = source_to_results["zp"]
        # Pre-set progress for resumed sources
        for src, idx in start_indices.items():
            if idx > 0:
                progress.update(src, idx)
    # --- Source worker closures ---
    # Each worker owns its client lifecycle and iterates all outcodes for both
    # channels. On auth failure, it refreshes cookies and continues. On fatal
    # failure, it marks itself as done and returns partial results.
    def rm_worker():
        rm_start = start_indices.get("rm", 0)
        if rm_start > 0:
            log.info("Rightmove resuming from outcode %d/%d", rm_start, len(shuffled))
        client = make_client()
        try:
            for i, outcode in enumerate(shuffled):
                if i < rm_start:
                    continue
                try:
                    outcode_id = resolve_outcode_id(client, outcode)
                except Exception as e:
@ -344,11 +500,16 @@ def run_scrape(
            homecouk_enabled.set(0)
            progress.update("hk", len(shuffled))
            return
        hk_start = start_indices.get("hk", 0)
        if hk_start > 0:
            log.info("home.co.uk resuming from outcode %d/%d", hk_start, len(shuffled))
        client = make_homecouk_client(*hk_result)
        log.info("home.co.uk scraping ENABLED")
        homecouk_enabled.set(1)
        try:
            for i, outcode in enumerate(shuffled):
                if i < hk_start:
                    continue
                for ch_cfg in CHANNELS:
                    ch = ch_cfg["channel"]
                    try:
@ -403,11 +564,16 @@ def run_scrape(
            openrent_enabled.set(0)
            progress.update("or", len(shuffled))
            return
        or_start = start_indices.get("or", 0)
        if or_start > 0:
            log.info("OpenRent resuming from outcode %d/%d", or_start, len(shuffled))
        client = make_openrent_client(*or_result)
        log.info("OpenRent scraping ENABLED")
        openrent_enabled.set(1)
        try:
            for i, outcode in enumerate(shuffled):
                if i < or_start:
                    continue
                # OpenRent is RENT-only
                try:
                    props = openrent_search_outcode(
@ -470,8 +636,14 @@ def run_scrape(
            progress.update("zp", len(shuffled))
            return
        zp_start = start_indices.get("zp", 0)
        if zp_start > 0:
            log.info("Zoopla resuming from outcode %d/%d", zp_start, len(shuffled))
        try:
            for i, outcode in enumerate(shuffled):
                if i < zp_start:
                    continue
                search_url = None
                for ch_cfg in CHANNELS:
                    ch = ch_cfg["channel"]
@ -559,8 +731,15 @@ def run_scrape(
    # --- Monitor progress while workers run ---
    # Map source names to result dicts for checkpointing
    source_results_map = {
        "rm": rm_results, "hk": hk_results,
        "or": or_results, "zp": zp_results,
    }
    scrape_start = time.time()
    last_log = 0.0
    last_checkpoint = time.time()
    try:
        while any(t.is_alive() for t in threads):
@ -588,8 +767,9 @@ def run_scrape(
                status.zp_properties = len(zp_results["BUY"]) + len(zp_results["RENT"])
                _sync_gauges()
            # Log progress every 30 seconds
            now = time.time()
            # Log progress every 30 seconds
            if now - last_log >= 30:
                elapsed = now - scrape_start
                per_source = ", ".join(
@ -606,10 +786,26 @@ def run_scrape(
                )
                last_log = now
            # Save checkpoint periodically
            if now - last_checkpoint >= CHECKPOINT_INTERVAL:
                try:
                    _save_checkpoint(
                        shuffled, progress, source_results_map, active_sources,
                    )
                except Exception as e:
                    log.warning("Checkpoint save failed: %s", e)
                last_checkpoint = now
            time.sleep(5)
    except Exception as e:
        log.exception("Monitor loop error: %s", e)
    # Save final checkpoint before joining (in case merge/write fails)
    try:
        _save_checkpoint(shuffled, progress, source_results_map, active_sources)
    except Exception:
        pass
    for t in threads:
        t.join()
@ -656,6 +852,9 @@ def run_scrape(
                total_dedup,
            )
        # Scrape completed successfully — clear checkpoint
        _clear_checkpoint()
        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
--- a/finder/zoopla.py
+++ b/finder/zoopla.py
@ -626,9 +626,16 @@ def transform_property(
        # Try outcode-level fallback
        outcode = _extract_outcode(address)
        if outcode:
            # ONSPD 7-char format: 4-char outcodes have no space before incode
            # (e.g., "BH191AB"), while shorter outcodes do (e.g., "E14 5AB").
            # Check both formats to handle all outcode lengths.
            prefix = outcode + " "
            for pcd, coords in pc_coords.items():
-                if pcd.startswith(prefix):
+                if pcd.startswith(prefix) or (
                    len(outcode) >= 4
                    and pcd.startswith(outcode)
                    and len(pcd) > len(outcode)
                ):
                    postcode = pcd
                    lat, lng = coords
                    break