"""Rightmove listing scraper exposed through a small Flask control API.

On import the module loads England outcodes and a postcode spatial index from
the arcgis parquet at ``ARCGIS_PATH``.  ``POST /run`` starts a background
thread that walks every outcode for the BUY and RENT channels, and writes one
parquet file per channel into ``DATA_DIR``.  ``/status``, ``/debug`` and
``/data/<file>`` expose progress, diagnostics and the resulting files.
"""

import logging
import math
import os
import random
import re
import threading
import time
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path

import httpx
import polars as pl
from flask import Flask, jsonify, send_from_directory

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

LOG_DIR = Path("/app/data")
LOG_DIR.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(LOG_DIR / "rightmove.log"),
    ],
)
log = logging.getLogger("rightmove")
log.setLevel(logging.DEBUG)
# The HTTP libraries are very chatty at DEBUG; keep only their warnings.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
DATA_DIR = Path("/app/data")

PAGE_SIZE = 24
MAX_PAGES_PER_OUTCODE = 42  # 24*42 = 1008, safety cap per outcode
DELAY_BETWEEN_PAGES = 1.0
DELAY_BETWEEN_OUTCODES = 2.0
MAX_RETRIES = 3
RETRY_BASE_DELAY = 2.0
GRID_CELL_SIZE = 0.01  # degrees for postcode spatial index
SEED = 42

TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"

# Maps Rightmove's propertySubType to the canonical type stored in the output.
PROPERTY_TYPE_MAP = {
    "Detached": "Detached",
    "Semi-Detached": "Semi-Detached",
    "Terraced": "Terraced",
    "End of Terrace": "Terraced",
    "Mid Terrace": "Terraced",
    "Flat": "Flat",
    "Maisonette": "Flat",
    "Studio": "Flat",
    "Apartment": "Flat",
    "Penthouse": "Flat",
    "Ground Flat": "Flat",
    "Detached Bungalow": "Detached",
    "Semi-Detached Bungalow": "Semi-Detached",
    "Town House": "Terraced",
    "Link Detached": "Detached",
    "Link Detached House": "Detached",
    "Bungalow": "Other",
    "Cottage": "Other",
    "Park Home": "Other",
    "Land": "Other",
    "Farm / Barn": "Other",
    "House": "Detached",
    "Not Specified": "Other",
    "Chalet": "Other",
    "Barn Conversion": "Other",
    "Coach House": "Other",
    "Character Property": "Other",
    "Cluster House": "Other",
    "Retirement Property": "Flat",
    "Plot": "Other",
    "Garages": "Other",
    "Mews": "Terraced",
    # transform_property() flags this subtype via "house_share"; listing it
    # here avoids an "unknown subtype" warning for every such listing.
    "House Share": "Other",
}

CHANNELS = [
    {"channel": "BUY", "transactionType": "BUY", "sortType": "2"},
    {"channel": "RENT", "transactionType": "LETTING", "sortType": "6"},
]

# Column order and dtypes of the parquet files produced by write_parquet().
PARQUET_SCHEMA = {
    "id": pl.Int64,
    "bedrooms": pl.Int32,
    "bathrooms": pl.Int32,
    "total_rooms": pl.Int32,
    "longitude": pl.Float64,
    "latitude": pl.Float64,
    "postcode": pl.Utf8,
    "address": pl.Utf8,
    "tenure": pl.Utf8,
    "property_type": pl.Utf8,
    "property_sub_type": pl.Utf8,
    "price": pl.Int64,
    "price_frequency": pl.Utf8,
    "price_qualifier": pl.Utf8,
    "floorspace_sqm": pl.Float64,
    "url": pl.Utf8,
    "features": pl.List(pl.Utf8),
    "first_visible_date": pl.Utf8,
    "update_date": pl.Utf8,
    "outcode": pl.Utf8,
    "house_share": pl.Boolean,
}

# Transient transport failures worth retrying.  TimeoutException covers
# connect/read/write/pool timeouts; NetworkError covers connect/read/write
# errors; RemoteProtocolError covers servers dropping the connection mid-reply.
_RETRYABLE_ERRORS = (
    httpx.TimeoutException,
    httpx.NetworkError,
    httpx.RemoteProtocolError,
)


# ---------------------------------------------------------------------------
# Postcode spatial index
# ---------------------------------------------------------------------------


class PostcodeSpatialIndex:
    """Grid-based spatial index over arcgis postcodes for nearest-lookup."""

    def __init__(self, lats: list[float], lngs: list[float], postcodes: list[str]):
        """Bucket every (lat, lng, postcode) into a GRID_CELL_SIZE-degree cell."""
        self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(list)
        for lat, lng, pcd in zip(lats, lngs, postcodes):
            gx = int(math.floor(lng / GRID_CELL_SIZE))
            gy = int(math.floor(lat / GRID_CELL_SIZE))
            self.grid[(gx, gy)].append((lat, lng, pcd))
        log.info("Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats))

    def nearest(self, lat: float, lng: float) -> str | None:
        """Return the closest indexed postcode to (lat, lng).

        Only the cell containing the point and its eight neighbours are
        searched; returns None when none of them holds a postcode.
        """
        gx = int(math.floor(lng / GRID_CELL_SIZE))
        gy = int(math.floor(lat / GRID_CELL_SIZE))
        # A degree of longitude is shorter than a degree of latitude by a
        # factor of cos(lat); without this the comparison over-weights
        # east-west offsets and can pick a postcode that is not the nearest.
        cos_lat = math.cos(math.radians(lat))
        best_dist = float("inf")
        best_pcd = None
        for dx in range(-1, 2):
            for dy in range(-1, 2):
                # .get() rather than [] so the defaultdict is not grown by lookups.
                for plat, plng, pcd in self.grid.get((gx + dx, gy + dy), []):
                    d = (plat - lat) ** 2 + ((plng - lng) * cos_lat) ** 2
                    if d < best_dist:
                        best_dist = d
                        best_pcd = pcd
        return best_pcd


# ---------------------------------------------------------------------------
# Scrape status
# ---------------------------------------------------------------------------


@dataclass
class ScrapeStatus:
    """Progress snapshot of the background scrape, guarded by status_lock."""

    state: str = "idle"  # idle | running | done | error
    channel: str = ""
    outcode: str = ""
    outcodes_done: int = 0
    outcodes_total: int = 0
    properties_buy: int = 0
    properties_rent: int = 0
    errors: list[str] = field(default_factory=list)
    started_at: float = 0.0
    finished_at: float = 0.0


status = ScrapeStatus()
status_lock = threading.Lock()
debug_data: dict = {"last_response": None, "outcode_cache": {}}

# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)

# Gluetun control API — runs on port 8000 inside the gluetun container.
# Since finder uses network_mode: service:gluetun, localhost IS gluetun.
GLUETUN_API = "http://127.0.0.1:8000"

_ip_rotate_lock = threading.Lock()


def rotate_ip() -> bool:
    """Ask gluetun to reconnect to a different VPN server, getting a new IP.

    Stops and restarts the VPN through the control API, then polls the
    reported public IP for up to 30 attempts, 2 seconds apart.

    Returns:
        True if the reported public IP changed, False on any failure or
        when the poll times out.
    """
    with _ip_rotate_lock:
        log.info("Rotating VPN IP via gluetun...")
        try:
            with httpx.Client(timeout=10) as ctl:
                # Get current IP
                old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
                old_ip = (
                    old_ip_resp.json().get("public_ip", "unknown")
                    if old_ip_resp.status_code == 200
                    else "unknown"
                )
                log.info("Current IP: %s", old_ip)

                # Trigger a server change by stopping and restarting the VPN.
                resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"})
                if resp.status_code != 200:
                    log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text)
                    return False

                time.sleep(2)

                resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"})
                if resp.status_code != 200:
                    log.error("Failed to start VPN: %d %s", resp.status_code, resp.text)
                    return False

            # Wait for reconnection
            for _ in range(30):
                time.sleep(2)
                try:
                    with httpx.Client(timeout=10) as ctl:
                        new_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
                        if new_ip_resp.status_code == 200:
                            new_ip = new_ip_resp.json().get("public_ip", "")
                            if new_ip and new_ip != old_ip:
                                log.info("IP rotated: %s → %s", old_ip, new_ip)
                                return True
                except Exception as e:
                    # Best effort: the control API is expected to be flaky
                    # while the VPN is still reconnecting, so keep polling.
                    log.debug("Public IP poll failed (VPN still reconnecting?): %s", e)

            log.warning("IP rotation timed out (may still be same IP)")
            return False

        except Exception as e:
            log.error("IP rotation failed: %s", e)
            return False


def make_client() -> httpx.Client:
    """Create the HTTP client used for all Rightmove requests."""
    return httpx.Client(
        timeout=30,
        headers={"User-Agent": USER_AGENT, "Accept": "application/json"},
        follow_redirects=True,
    )


def fetch_with_retry(
    client: httpx.Client, url: str, params: dict | None = None, on_403: bool = True
) -> dict | None:
    """GET JSON with retries on 429/5xx and transient transport errors.

    Args:
        client: HTTP client to issue the request with.
        url: Target URL.
        params: Optional query parameters.
        on_403: When True, a 403 triggers one IP rotation followed by a fresh
            round of attempts in which 403 is treated as non-retryable.

    Returns:
        The decoded JSON body, or None on permanent failure.

    Raises:
        ValueError: if a 200 response body is not valid JSON.
    """
    for attempt in range(MAX_RETRIES):
        # Only the request itself sits in the try block, so that only
        # transport failures are treated as retryable.
        try:
            resp = client.get(url, params=params)
        except _RETRYABLE_ERRORS as e:
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning(
                "%s from %s, retry %d/%d in %.1fs",
                type(e).__name__, url, attempt + 1, MAX_RETRIES, delay,
            )
            time.sleep(delay)
            continue

        if resp.status_code == 200:
            return resp.json()

        if resp.status_code == 403 and on_403:
            log.warning("HTTP 403 — IP likely blocked, rotating...")
            if rotate_ip():
                # Retry once with new IP (but don't recurse on 403 again)
                return fetch_with_retry(client, url, params, on_403=False)
            log.error("IP rotation failed, giving up on %s", url)
            return None

        if resp.status_code in (429, 500, 502, 503, 504):
            # Exponential backoff with jitter.
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning(
                "HTTP %d from %s, retry %d/%d in %.1fs",
                resp.status_code, url, attempt + 1, MAX_RETRIES, delay,
            )
            time.sleep(delay)
            continue

        log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
        return None

    log.error("All %d retries exhausted for %s", MAX_RETRIES, url)
    return None


# ---------------------------------------------------------------------------
# Rightmove API
# ---------------------------------------------------------------------------


def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Look up Rightmove's internal ID for an outcode via typeahead API.

    Successful lookups are cached in ``debug_data["outcode_cache"]``; failed
    lookups are not cached, so they are retried on the next call.
    """
    cache = debug_data["outcode_cache"]
    if outcode in cache:
        return cache[outcode]

    data = fetch_with_retry(
        client, TYPEAHEAD_URL, {"query": outcode, "limit": "10", "exclude": "STREET"}
    )
    if not data:
        return None

    for match in data.get("matches", []):
        if match.get("type") == "OUTCODE" and match.get("displayName") == outcode:
            rid = str(match["id"])
            cache[outcode] = rid
            return rid

    log.debug("Outcode %s not found in typeahead results", outcode)
    return None


def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Paginate through search results for one outcode+channel.

    Stops at the first failed or empty page, once the reported result count
    has been reached, or after MAX_PAGES_PER_OUTCODE pages.

    Returns:
        The transformed properties collected so far (possibly empty).
    """
    properties = []
    index = 0

    for page in range(MAX_PAGES_PER_OUTCODE):
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }

        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning("Failed to fetch page %d for %s/%s", page, outcode, channel_cfg["channel"])
            break

        debug_data["last_response"] = data

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        for prop in raw_props:
            transformed = transform_property(prop, outcode, pc_index)
            if transformed:
                properties.append(transformed)

        # Check if there are more pages.  resultCount is handled as a string
        # with thousands separators, but an int or null must not crash here
        # and discard the pages already collected.
        raw_count = data.get("resultCount") or 0
        try:
            result_count = int(str(raw_count).replace(",", ""))
        except ValueError:
            log.warning("Unparseable resultCount %r for %s, stopping pagination", raw_count, outcode)
            result_count = 0

        index += PAGE_SIZE
        if index >= result_count:
            break

        if page < MAX_PAGES_PER_OUTCODE - 1:
            time.sleep(DELAY_BETWEEN_PAGES)

    return properties


# ---------------------------------------------------------------------------
# Property transformation
# ---------------------------------------------------------------------------


def parse_display_size(display_size: str | None) -> float | None:
    """Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm.

    Returns the area in square metres rounded to one decimal place, or None
    when the text is empty or matches neither unit.
    """
    if not display_size:
        return None

    # Try sq. ft. first
    m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
    if m:
        sqft = float(m.group(1).replace(",", ""))
        return round(sqft * 0.092903, 1)

    # Try sq. m.
    m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
    if m:
        return round(float(m.group(1).replace(",", "")), 1)

    return None


def map_property_type(sub_type: str | None) -> str:
    """Map propertySubType to canonical type ("Other" when unknown or empty)."""
    if not sub_type:
        return "Other"
    canonical = PROPERTY_TYPE_MAP.get(sub_type)
    if canonical:
        return canonical
    log.warning("Unknown propertySubType: %r — mapping to Other", sub_type)
    return "Other"


def extract_tenure(tenure_obj: dict | None) -> str | None:
    """Extract tenure string from tenure object.

    Returns "Freehold" or "Leasehold", or None for any other tenure type.
    """
    if not tenure_obj:
        return None
    tt = tenure_obj.get("tenureType", "")
    if tt == "FREEHOLD":
        return "Freehold"
    if tt == "LEASEHOLD":
        return "Leasehold"
    return None


def fix_coords(lat: float, lng: float) -> tuple[float, float]:
    """Swap lat/lng if they look reversed. England: lat ~49–56, lng ~-7–2.

    Coordinates outside the bounds in both orientations are returned
    unchanged after logging a warning.
    """
    if 49 <= lat <= 56 and -7 <= lng <= 2:
        return lat, lng
    if 49 <= lng <= 56 and -7 <= lat <= 2:
        log.debug(
            "Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f",
            lat, lng, lng, lat,
        )
        return lng, lat
    log.warning("Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f", lat, lng)
    return lat, lng


def normalize_price(amount: int, frequency: str) -> int:
    """Normalize price to monthly for rentals (weekly × 52/12, yearly ÷ 12).

    Any other frequency returns the amount unchanged.
    """
    if frequency == "weekly":
        return round(amount * 52 / 12)
    if frequency == "yearly":
        return round(amount / 12)
    return amount


def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex) -> dict | None:
    """Transform a raw Rightmove property dict into our output schema.

    Returns None when the listing has no location, no coordinates or no
    price amount.
    """
    loc = prop.get("location")
    if not loc:
        return None

    raw_lat = loc.get("latitude")
    raw_lng = loc.get("longitude")
    if raw_lat is None or raw_lng is None:
        return None

    lat, lng = fix_coords(raw_lat, raw_lng)

    # "or {}" / "or []" rather than .get(key, default): the default only
    # applies to a missing key, not to a key that is present with JSON null.
    price_obj = prop.get("price") or {}
    amount = price_obj.get("amount")
    if amount is None:
        return None

    frequency = price_obj.get("frequency", "")
    price = normalize_price(int(amount), frequency)

    display_prices = price_obj.get("displayPrices") or []
    price_qualifier = display_prices[0].get("displayPriceQualifier", "") if display_prices else ""

    sub_type = prop.get("propertySubType", "")
    bedrooms = prop.get("bedrooms", 0) or 0
    bathrooms = prop.get("bathrooms", 0) or 0

    key_features = [
        kf["description"] for kf in (prop.get("keyFeatures") or []) if kf.get("description")
    ]

    listing_update = prop.get("listingUpdate") or {}
    update_date = listing_update.get("listingUpdateDate", "")

    postcode = pc_index.nearest(lat, lng)

    return {
        "id": prop.get("id"),
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "total_rooms": bedrooms + bathrooms,
        "longitude": lng,
        "latitude": lat,
        "postcode": postcode,
        "address": prop.get("displayAddress", ""),
        "tenure": extract_tenure(prop.get("tenure")),
        "property_type": map_property_type(sub_type),
        "property_sub_type": sub_type or "Unknown",
        "price": price,
        "price_frequency": frequency,
        "price_qualifier": price_qualifier,
        "floorspace_sqm": parse_display_size(prop.get("displaySize")),
        "url": RIGHTMOVE_BASE + (prop.get("propertyUrl") or ""),
        "features": key_features,
        "first_visible_date": prop.get("firstVisibleDate", ""),
        "update_date": update_date,
        "outcode": outcode,
        "house_share": sub_type == "House Share",
    }


# ---------------------------------------------------------------------------
# Parquet writing
# ---------------------------------------------------------------------------


def write_parquet(properties: list[dict], path: Path) -> None:
    """Write properties list to parquet using Polars.

    Nothing is written (a warning is logged instead) when the list is empty.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    # Build the frame column-wise so the columns line up with PARQUET_SCHEMA.
    columns = {name: [p[name] for p in properties] for name in PARQUET_SCHEMA}
    df = pl.DataFrame(columns, schema=PARQUET_SCHEMA)
    df.write_parquet(path)
    log.info("Wrote %d properties to %s", len(df), path)


# ---------------------------------------------------------------------------
# Scrape orchestration
# ---------------------------------------------------------------------------


def load_outcodes() -> list[str]:
    """Load England-only outcodes from arcgis parquet (sorted, unique)."""
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    # Only the postcode and country columns are needed here.
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry"])
    england = df.filter(pl.col("ctry") == "E92000001")
    log.info("England postcodes: %d", len(england))

    outcodes = (
        england.select(pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode"))
        .drop_nulls()
        .get_column("outcode")
        .unique()
        .sort()
        .to_list()
    )
    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes


def build_postcode_index() -> PostcodeSpatialIndex:
    """Build spatial index from arcgis England postcodes."""
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])
    return PostcodeSpatialIndex(
        england.get_column("lat").to_list(),
        england.get_column("long").to_list(),
        england.get_column("pcd").to_list(),
    )


def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Main scrape loop — runs in background thread.

    For each channel, walks every outcode in a fixed shuffled order,
    de-duplicates listings by id and writes ``rightmove_<buy|rent>.parquet``
    into DATA_DIR.  Progress and errors are published through ``status``.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        # Clear the previous run's finish time; otherwise /status computes
        # elapsed from a stale value and reports a negative duration.
        status.finished_at = 0.0
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0

    # Shuffle for geographic diversity.  A private RNG gives the same
    # reproducible order as seeding the global one, without making the retry
    # jitter in fetch_with_retry() deterministic.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)

    client = make_client()

    try:
        for channel_cfg in CHANNELS:
            channel_name = channel_cfg["channel"]
            file_suffix = "buy" if channel_name == "BUY" else "rent"
            all_properties: dict[int, dict] = {}  # dedup by id

            with status_lock:
                status.channel = channel_name
                status.outcodes_done = 0
                status.outcodes_total = len(shuffled)

            log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))

            for i, outcode in enumerate(shuffled):
                with status_lock:
                    status.outcode = outcode
                    status.outcodes_done = i

                log.debug(
                    "Outcode %s (%d/%d) — %d properties so far",
                    outcode, i + 1, len(shuffled), len(all_properties),
                )

                try:
                    outcode_id = resolve_outcode_id(client, outcode)
                    if not outcode_id:
                        log.debug("No Rightmove ID for outcode %s, skipping", outcode)
                        # Also skips the inter-outcode delay below, since no
                        # search request was made.
                        continue

                    props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
                    for p in props:
                        # First occurrence wins.
                        all_properties.setdefault(p["id"], p)

                    with status_lock:
                        if channel_name == "BUY":
                            status.properties_buy = len(all_properties)
                        else:
                            status.properties_rent = len(all_properties)

                    log.info(
                        "Outcode %s: got %d properties (total: %d)",
                        outcode, len(props), len(all_properties),
                    )

                except Exception as e:
                    # One bad outcode must not abort the whole run.
                    msg = f"Error scraping {outcode}/{channel_name}: {e}"
                    log.error(msg)
                    with status_lock:
                        status.errors.append(msg)

                if i < len(shuffled) - 1:
                    time.sleep(DELAY_BETWEEN_OUTCODES)

            # Write parquet
            deduped = list(all_properties.values())
            output_path = DATA_DIR / f"rightmove_{file_suffix}.parquet"
            write_parquet(deduped, output_path)

            with status_lock:
                if channel_name == "BUY":
                    status.properties_buy = len(deduped)
                else:
                    status.properties_rent = len(deduped)
                status.outcodes_done = len(shuffled)

            log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))

        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            elapsed = status.finished_at - status.started_at
            log.info(
                "Scrape complete in %.0fs — buy: %d, rent: %d",
                elapsed, status.properties_buy, status.properties_rent,
            )

    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
    finally:
        client.close()


# ---------------------------------------------------------------------------
# Startup: load data
# ---------------------------------------------------------------------------

log.info("Loading arcgis data...")
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()
log.info("Ready — %d outcodes, postcode index built", len(OUTCODES))

# ---------------------------------------------------------------------------
# Flask app
# ---------------------------------------------------------------------------

app = Flask(__name__)


@app.route("/run", methods=["POST"])
def trigger_run():
    """Start a scrape in a background thread; 409 if one is already running."""
    with status_lock:
        if status.state == "running":
            return jsonify({"error": "Scrape already running"}), 409
        # Claim the "running" state under the lock so that two concurrent
        # requests cannot both start a scrape.
        status.state = "running"

    thread = threading.Thread(target=run_scrape, args=(OUTCODES, PC_INDEX), daemon=True)
    thread.start()
    return jsonify({"message": "Scrape started"}), 200


@app.route("/status")
def get_status():
    """Return the current scrape progress as JSON."""
    with status_lock:
        elapsed = 0.0
        if status.started_at:
            # While running, measure against the current time.
            end = status.finished_at if status.finished_at else time.time()
            elapsed = end - status.started_at

        return jsonify({
            "state": status.state,
            "channel": status.channel,
            "outcode": status.outcode,
            "outcodes_done": status.outcodes_done,
            "outcodes_total": status.outcodes_total,
            "properties_buy": status.properties_buy,
            "properties_rent": status.properties_rent,
            "errors": status.errors[-20:],  # last 20 errors
            "elapsed_seconds": round(elapsed, 1),
        })


@app.route("/debug")
def get_debug():
    """Return the last raw search response and a sample of the outcode cache."""
    return jsonify({
        "last_response": debug_data["last_response"],
        "outcode_cache_size": len(debug_data["outcode_cache"]),
        "outcode_cache_sample": dict(list(debug_data["outcode_cache"].items())[:20]),
    })


# The URL rule must declare the ``filename`` variable that the view takes;
# without it Flask calls serve_data() with no argument and raises TypeError.
@app.route("/data/<path:filename>")
def serve_data(filename):
    """Serve a parquet file from DATA_DIR; any other extension is a 400."""
    if not filename.endswith(".parquet"):
        return jsonify({"error": "Only parquet files served"}), 400
    return send_from_directory(DATA_DIR, filename)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=1234, debug=False)