perfect-postcode/finder/main.py
2026-02-15 22:39:53 +00:00

710 lines
25 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
import math
import os
import random
import re
import threading
import time
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
import httpx
import polars as pl
from flask import Flask, jsonify, send_from_directory
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Log records go to a stream handler and to a file under LOG_DIR. The directory
# is created here, at import time, before the FileHandler opens its file in it.
LOG_DIR = Path("/app/data")
LOG_DIR.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(LOG_DIR / "rightmove.log"),
    ],
)
# Module-wide logger used by every function in this file.
log = logging.getLogger("rightmove")
log.setLevel(logging.DEBUG)
# The root logger is configured at DEBUG above; raise the threshold of the
# httpx/httpcore loggers so only their WARNING-and-above records are emitted.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Postcode parquet read by load_outcodes() and build_postcode_index();
# overridable through the ARCGIS_PATH environment variable.
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
# Directory the output parquet files are written to and served from.
DATA_DIR = Path("/app/data")
# Amount the search "index" parameter advances per page in search_outcode().
PAGE_SIZE = 24
MAX_PAGES_PER_OUTCODE = 42  # 24*42 = 1008, safety cap per outcode
# Sleep durations in seconds (passed to time.sleep).
DELAY_BETWEEN_PAGES = 1.0
DELAY_BETWEEN_OUTCODES = 2.0
# Retry policy for fetch_with_retry(): the delay before the next attempt is
# RETRY_BASE_DELAY * 2**attempt plus up to one second of random jitter.
MAX_RETRIES = 3
RETRY_BASE_DELAY = 2.0
GRID_CELL_SIZE = 0.01  # degrees for postcode spatial index
# Seed for the outcode shuffle in run_scrape().
SEED = 42
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
# Prefix prepended to each listing's "propertyUrl" in transform_property().
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
# Maps a listing's "propertySubType" to the canonical "property_type" column.
# map_property_type() logs a warning for, and returns "Other" for, any sub-type
# missing from this table.
# NOTE(review): transform_property() tests for the sub-type "House Share", but
# it has no entry here, so such listings are logged as unknown — confirm
# whether an explicit entry is wanted.
PROPERTY_TYPE_MAP = {
    "Detached": "Detached",
    "Semi-Detached": "Semi-Detached",
    "Terraced": "Terraced",
    "End of Terrace": "Terraced",
    "Mid Terrace": "Terraced",
    "Flat": "Flat",
    "Maisonette": "Flat",
    "Studio": "Flat",
    "Apartment": "Flat",
    "Penthouse": "Flat",
    "Ground Flat": "Flat",
    "Detached Bungalow": "Detached",
    "Semi-Detached Bungalow": "Semi-Detached",
    "Town House": "Terraced",
    "Link Detached": "Detached",
    "Link Detached House": "Detached",
    "Bungalow": "Other",
    "Cottage": "Other",
    "Park Home": "Other",
    "Land": "Other",
    "Farm / Barn": "Other",
    "House": "Detached",
    "Not Specified": "Other",
    "Chalet": "Other",
    "Barn Conversion": "Other",
    "Coach House": "Other",
    "Character Property": "Other",
    "Cluster House": "Other",
    "Retirement Property": "Flat",
    "Plot": "Other",
    "Garages": "Other",
    "Mews": "Terraced",
}
# One entry per scraped channel; the three values are sent verbatim as search
# query parameters, and "channel" also selects the output file in run_scrape().
CHANNELS = [
    {"channel": "BUY", "transactionType": "BUY", "sortType": "2"},
    {"channel": "RENT", "transactionType": "LETTING", "sortType": "6"},
]
# ---------------------------------------------------------------------------
# Postcode spatial index
# ---------------------------------------------------------------------------
class PostcodeSpatialIndex:
"""Grid-based spatial index over arcgis postcodes for nearest-lookup."""
def __init__(self, lats: list[float], lngs: list[float], postcodes: list[str]):
self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(list)
for lat, lng, pcd in zip(lats, lngs, postcodes):
gx = int(math.floor(lng / GRID_CELL_SIZE))
gy = int(math.floor(lat / GRID_CELL_SIZE))
self.grid[(gx, gy)].append((lat, lng, pcd))
log.info("Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats))
def nearest(self, lat: float, lng: float) -> str | None:
gx = int(math.floor(lng / GRID_CELL_SIZE))
gy = int(math.floor(lat / GRID_CELL_SIZE))
best_dist = float("inf")
best_pcd = None
for dx in range(-1, 2):
for dy in range(-1, 2):
for plat, plng, pcd in self.grid.get((gx + dx, gy + dy), []):
d = (plat - lat) ** 2 + (plng - lng) ** 2
if d < best_dist:
best_dist = d
best_pcd = pcd
return best_pcd
# ---------------------------------------------------------------------------
# Scrape status
# ---------------------------------------------------------------------------
@dataclass
class ScrapeStatus:
    """Progress record for the background scrape, reported by ``/status``."""

    # Lifecycle marker: "idle" | "running" | "done" | "error".
    state: str = "idle"
    # Channel and outcode currently being processed.
    channel: str = ""
    outcode: str = ""
    # Progress through the outcode list for the current channel.
    outcodes_done: int = 0
    outcodes_total: int = 0
    # Unique properties collected so far, per channel.
    properties_buy: int = 0
    properties_rent: int = 0
    # Human-readable error messages accumulated during the run.
    errors: list[str] = field(default_factory=list)
    # time.time() stamps; 0.0 means "not set".
    started_at: float = 0.0
    finished_at: float = 0.0


# Single shared status record; the code in this file updates it while holding
# status_lock.
status = ScrapeStatus()
status_lock = threading.Lock()
# Diagnostics exposed by /debug: the most recent search response, plus the
# outcode -> Rightmove ID cache filled by resolve_outcode_id().
debug_data: dict = dict(last_response=None, outcode_cache={})
# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------
# Browser-style User-Agent header sent by the client built in make_client().
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)
# Gluetun control API — runs on port 8000 inside the gluetun container.
# Since finder uses network_mode: service:gluetun, localhost IS gluetun.
GLUETUN_API = "http://127.0.0.1:8000"
# Held for the whole of rotate_ip() so only one rotation runs at a time.
_ip_rotate_lock = threading.Lock()
def rotate_ip() -> bool:
    """Ask gluetun to restart the VPN and wait for a different public IP.

    The VPN is stopped and started again through the gluetun control API, then
    the public-IP endpoint is polled every 2 seconds (at most 30 times) until
    it reports an address different from the one seen before the restart.

    Returns:
        True once a new, different public IP is observed. False when the
        stop/start request is rejected, when polling times out, or when any
        exception is raised along the way.
    """
    with _ip_rotate_lock:
        log.info("Rotating VPN IP via gluetun...")
        try:
            with httpx.Client(timeout=10) as ctl:
                # Remember the current IP so the change can be detected later.
                old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
                old_ip = (
                    old_ip_resp.json().get("public_ip", "unknown")
                    if old_ip_resp.status_code == 200
                    else "unknown"
                )
                log.info("Current IP: %s", old_ip)
                # Trigger the server change: stop the VPN, pause, start it again.
                resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"})
                if resp.status_code != 200:
                    log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text)
                    return False
                time.sleep(2)
                resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"})
                if resp.status_code != 200:
                    log.error("Failed to start VPN: %d %s", resp.status_code, resp.text)
                    return False
            # Wait for reconnection.
            for _ in range(30):
                time.sleep(2)
                try:
                    with httpx.Client(timeout=10) as ctl:
                        new_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
                        if new_ip_resp.status_code == 200:
                            new_ip = new_ip_resp.json().get("public_ip", "")
                            if new_ip and new_ip != old_ip:
                                log.info("IP rotated: %s → %s", old_ip, new_ip)
                                return True
                except Exception as e:
                    # Best effort: failures are expected while the VPN is still
                    # reconnecting, so record them and keep polling.
                    log.debug("Public IP poll failed (VPN still reconnecting?): %s", e)
            log.warning("IP rotation timed out (may still be same IP)")
            return False
        except Exception as e:
            log.error("IP rotation failed: %s", e)
            return False
def make_client() -> httpx.Client:
    """Build the HTTP client used for all Rightmove requests.

    The client sends ``USER_AGENT`` and ``Accept: application/json`` on every
    request, follows redirects, and uses a 30-second timeout.
    """
    default_headers = {"User-Agent": USER_AGENT, "Accept": "application/json"}
    return httpx.Client(headers=default_headers, follow_redirects=True, timeout=30)
def fetch_with_retry(
    client: httpx.Client, url: str, params: dict | None = None, on_403: bool = True
) -> dict | None:
    """GET ``url`` and return the decoded JSON body, retrying transient failures.

    Up to ``MAX_RETRIES`` attempts are made. HTTP 429/500/502/503/504 and
    transport-level failures (timeouts, network errors, a broken server
    response) are retried with exponential backoff plus jitter.

    Args:
        client: HTTP client to issue the request with.
        url: Target URL.
        params: Optional query parameters.
        on_403: When True, an HTTP 403 triggers ``rotate_ip()`` followed by one
            fresh call with ``on_403=False``. When False, a 403 is treated as a
            non-retryable status.

    Returns:
        The parsed JSON body, or None on a non-retryable status, a failed IP
        rotation, a 200 response whose body is not valid JSON, or when every
        attempt has failed.
    """
    retryable_statuses = (429, 500, 502, 503, 504)
    # ConnectTimeout, ReadError and RemoteProtocolError are as transient as the
    # connect/read timeouts, so the whole families are retried.
    transient_errors = (
        httpx.TimeoutException,
        httpx.NetworkError,
        httpx.RemoteProtocolError,
    )
    for attempt in range(MAX_RETRIES):
        try:
            resp = client.get(url, params=params)
        except transient_errors as e:
            failure = type(e).__name__
        else:
            if resp.status_code == 200:
                try:
                    return resp.json()
                except ValueError:
                    # An HTML block page served with status 200 lands here.
                    log.error("HTTP 200 from %s but the body is not valid JSON", url)
                    return None
            if resp.status_code == 403 and on_403:
                log.warning("HTTP 403 — IP likely blocked, rotating...")
                if rotate_ip():
                    # Retry once with new IP (but don't recurse on 403 again)
                    return fetch_with_retry(client, url, params, on_403=False)
                log.error("IP rotation failed, giving up on %s", url)
                return None
            if resp.status_code not in retryable_statuses:
                log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
                return None
            failure = f"HTTP {resp.status_code}"
        if attempt == MAX_RETRIES - 1:
            # No further attempt follows, so do not sleep before giving up.
            log.warning("%s from %s, attempt %d/%d failed", failure, url, attempt + 1, MAX_RETRIES)
            break
        delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
        log.warning("%s from %s, retry %d/%d in %.1fs", failure, url, attempt + 1, MAX_RETRIES, delay)
        time.sleep(delay)
    log.error("All %d retries exhausted for %s", MAX_RETRIES, url)
    return None
# ---------------------------------------------------------------------------
# Rightmove API
# ---------------------------------------------------------------------------
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Return Rightmove's internal ID for ``outcode``, or None if unresolved.

    Successful lookups are memoised in ``debug_data["outcode_cache"]``; failed
    or empty lookups are not cached. A typeahead match counts only when its
    type is ``OUTCODE`` and its display name equals ``outcode`` exactly.
    """
    cache = debug_data["outcode_cache"]
    if outcode in cache:
        return cache[outcode]

    query = {"query": outcode, "limit": "10", "exclude": "STREET"}
    payload = fetch_with_retry(client, TYPEAHEAD_URL, query)
    if not payload:
        return None

    hit = next(
        (
            candidate
            for candidate in payload.get("matches", [])
            if candidate.get("type") == "OUTCODE" and candidate.get("displayName") == outcode
        ),
        None,
    )
    if hit is None:
        log.debug("Outcode %s not found in typeahead results", outcode)
        return None

    rightmove_id = str(hit["id"])
    cache[outcode] = rightmove_id
    return rightmove_id
def _parse_result_count(raw: object) -> int | None:
    """Convert a ``resultCount`` value such as ``"1,008"`` or ``1008`` to int.

    Returns None when the value cannot be read as a whole number.
    """
    try:
        return int(str(raw).replace(",", ""))
    except ValueError:
        return None


def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Paginate through search results for one outcode+channel.

    Pages are requested at offsets 0, PAGE_SIZE, 2*PAGE_SIZE, ... Paging stops
    when a request fails, a page has no properties, the next offset reaches
    the reported ``resultCount``, that count is unreadable, or
    ``MAX_PAGES_PER_OUTCODE`` pages have been fetched.

    Args:
        client: HTTP client to issue requests with.
        outcode_id: Rightmove's internal ID for the outcode.
        outcode: Outcode label, stored on each record and used in log messages.
        channel_cfg: One entry of ``CHANNELS``.
        pc_index: Spatial index used to assign a postcode to each property.

    Returns:
        The transformed properties collected before paging stopped.
    """
    properties: list[dict] = []
    for page in range(MAX_PAGES_PER_OUTCODE):
        index = page * PAGE_SIZE
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }
        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning("Failed to fetch page %d for %s/%s", page, outcode, channel_cfg["channel"])
            break
        debug_data["last_response"] = data
        raw_props = data.get("properties", [])
        if not raw_props:
            break
        for prop in raw_props:
            transformed = transform_property(prop, outcode, pc_index)
            if transformed:
                properties.append(transformed)
        # Check if there are more pages. An unreadable count must not throw
        # away the pages already collected, so stop paging instead of raising.
        result_count = _parse_result_count(data.get("resultCount", "0"))
        if result_count is None:
            log.warning(
                "Unreadable resultCount %r for %s/%s, stopping pagination",
                data.get("resultCount"), outcode, channel_cfg["channel"],
            )
            break
        if index + PAGE_SIZE >= result_count:
            break
        if page < MAX_PAGES_PER_OUTCODE - 1:
            time.sleep(DELAY_BETWEEN_PAGES)
    return properties
# ---------------------------------------------------------------------------
# Property transformation
# ---------------------------------------------------------------------------
def parse_display_size(display_size: str | None) -> float | None:
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
if not display_size:
return None
# Try sq. ft. first
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
if m:
sqft = float(m.group(1).replace(",", ""))
return round(sqft * 0.092903, 1)
# Try sq. m.
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
if m:
return round(float(m.group(1).replace(",", "")), 1)
return None
def map_property_type(sub_type: str | None) -> str:
"""Map propertySubType to canonical type."""
if not sub_type:
return "Other"
canonical = PROPERTY_TYPE_MAP.get(sub_type)
if canonical:
return canonical
log.warning("Unknown propertySubType: %r — mapping to Other", sub_type)
return "Other"
def extract_tenure(tenure_obj: dict | None) -> str | None:
"""Extract tenure string from tenure object."""
if not tenure_obj:
return None
tt = tenure_obj.get("tenureType", "")
if tt == "FREEHOLD":
return "Freehold"
if tt == "LEASEHOLD":
return "Leasehold"
return None
def fix_coords(lat: float, lng: float) -> tuple[float, float]:
    """Return ``(lat, lng)``, swapping the pair if it looks reversed.

    The plausibility box is latitude 49 to 56 and longitude -7 to 2
    (inclusive). A pair already inside the box is returned unchanged. A pair
    that fits only after swapping is swapped. Anything else is returned
    unchanged after logging a warning.
    """

    def plausible(candidate_lat: float, candidate_lng: float) -> bool:
        return 49 <= candidate_lat <= 56 and -7 <= candidate_lng <= 2

    if plausible(lat, lng):
        return lat, lng
    if plausible(lng, lat):
        log.debug("Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f", lat, lng, lng, lat)
        return lng, lat
    log.warning("Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f", lat, lng)
    return lat, lng
def normalize_price(amount: int, frequency: str) -> int:
"""Normalize price to monthly for rentals (weekly × 52/12, yearly ÷ 12)."""
if frequency == "weekly":
return round(amount * 52 / 12)
if frequency == "yearly":
return round(amount / 12)
return amount
def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex) -> dict | None:
    """Transform a raw Rightmove property dict into our output schema.

    Keys that are present but null in the source JSON ("price",
    "keyFeatures", "listingUpdate", "propertyUrl") are treated like missing
    keys, so one sparse listing does not raise and abort its whole outcode.

    Args:
        prop: One element of the search response's "properties" list.
        outcode: Outcode the listing was found under; copied to the record.
        pc_index: Spatial index used to assign the nearest postcode.

    Returns:
        A dict with the columns written by ``write_parquet``, or None when the
        listing has no location, no coordinates, or no price amount. "price"
        is the monthly-normalised value from ``normalize_price``, while
        "price_frequency" keeps the frequency reported by the source.
    """
    loc = prop.get("location")
    if not loc:
        return None
    raw_lat = loc.get("latitude")
    raw_lng = loc.get("longitude")
    if raw_lat is None or raw_lng is None:
        return None
    lat, lng = fix_coords(raw_lat, raw_lng)

    # `or {}` / `or []` / `or ""` also cover keys whose JSON value is null.
    price_obj = prop.get("price") or {}
    amount = price_obj.get("amount")
    if amount is None:
        return None
    frequency = price_obj.get("frequency", "")
    price = normalize_price(int(amount), frequency)
    display_prices = price_obj.get("displayPrices", [])
    price_qualifier = display_prices[0].get("displayPriceQualifier", "") if display_prices else ""

    sub_type = prop.get("propertySubType", "")
    bedrooms = prop.get("bedrooms", 0) or 0
    bathrooms = prop.get("bathrooms", 0) or 0
    key_features = [
        kf["description"]
        for kf in prop.get("keyFeatures") or []
        if kf and kf.get("description")
    ]
    listing_update = prop.get("listingUpdate") or {}
    update_date = listing_update.get("listingUpdateDate", "")
    postcode = pc_index.nearest(lat, lng)

    return {
        "id": prop.get("id"),
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "total_rooms": bedrooms + bathrooms,
        "longitude": lng,
        "latitude": lat,
        "postcode": postcode,
        "address": prop.get("displayAddress", ""),
        "tenure": extract_tenure(prop.get("tenure")),
        "property_type": map_property_type(sub_type),
        "property_sub_type": sub_type or "Unknown",
        "price": price,
        "price_frequency": frequency,
        "price_qualifier": price_qualifier,
        "floorspace_sqm": parse_display_size(prop.get("displaySize")),
        "url": RIGHTMOVE_BASE + (prop.get("propertyUrl") or ""),
        "features": key_features,
        "first_visible_date": prop.get("firstVisibleDate", ""),
        "update_date": update_date,
        "outcode": outcode,
        "house_share": sub_type == "House Share",
    }
# ---------------------------------------------------------------------------
# Parquet writing
# ---------------------------------------------------------------------------
# Output column order and dtypes. Each column name is also the key read from
# every property dict, so this table is the single source of truth.
_PARQUET_SCHEMA = {
    "id": pl.Int64,
    "bedrooms": pl.Int32,
    "bathrooms": pl.Int32,
    "total_rooms": pl.Int32,
    "longitude": pl.Float64,
    "latitude": pl.Float64,
    "postcode": pl.Utf8,
    "address": pl.Utf8,
    "tenure": pl.Utf8,
    "property_type": pl.Utf8,
    "property_sub_type": pl.Utf8,
    "price": pl.Int64,
    "price_frequency": pl.Utf8,
    "price_qualifier": pl.Utf8,
    "floorspace_sqm": pl.Float64,
    "url": pl.Utf8,
    "features": pl.List(pl.Utf8),
    "first_visible_date": pl.Utf8,
    "update_date": pl.Utf8,
    "outcode": pl.Utf8,
    "house_share": pl.Boolean,
}


def write_parquet(properties: list[dict], path: Path) -> None:
    """Write properties list to parquet using Polars.

    Nothing is written (and a warning is logged) when ``properties`` is empty,
    so an existing file at ``path`` is left in place. Otherwise the data goes
    to a sibling ``<name>.tmp`` file that is then renamed over ``path``, so a
    reader of ``path`` never sees a partially written file.

    Args:
        properties: Records as produced by ``transform_property``; each must
            contain every key of ``_PARQUET_SCHEMA``.
        path: Destination parquet file.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return
    columns = {name: [p[name] for p in properties] for name in _PARQUET_SCHEMA}
    df = pl.DataFrame(columns, schema=_PARQUET_SCHEMA)
    target = Path(path)
    # The temp file sits in the same directory so the final rename stays on
    # one filesystem.
    tmp_path = target.with_name(target.name + ".tmp")
    df.write_parquet(tmp_path)
    tmp_path.replace(target)
    log.info("Wrote %d properties to %s", len(df), path)
# ---------------------------------------------------------------------------
# Scrape orchestration
# ---------------------------------------------------------------------------
def load_outcodes() -> list[str]:
    """Return the sorted, de-duplicated outcodes of rows with England's code.

    Rows of the arcgis parquet are kept when ``ctry == "E92000001"``. The
    outcode is the leading part of ``pcd`` captured by the regex below; rows
    where the regex does not match are dropped.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    postcodes = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    england = postcodes.filter(pl.col("ctry") == "E92000001")
    log.info("England postcodes: %d", len(england))

    outcode_expr = pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode")
    outcode_series = england.select(outcode_expr).drop_nulls().get_column("outcode")
    outcodes = outcode_series.unique().sort().to_list()

    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes
def build_postcode_index() -> PostcodeSpatialIndex:
    """Create the nearest-postcode index from the arcgis parquet.

    Only rows with ``ctry == "E92000001"`` and non-null ``lat``/``long`` are
    indexed.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    frame = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    located = frame.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])
    lats, lngs, pcds = (located.get_column(name).to_list() for name in ("lat", "long", "pcd"))
    return PostcodeSpatialIndex(lats, lngs, pcds)
def _set_channel_count(channel_name: str, count: int) -> None:
    """Store ``count`` as the property total for a channel (caller holds status_lock)."""
    if channel_name == "BUY":
        status.properties_buy = count
    else:
        status.properties_rent = count


def _scrape_channel(
    client: httpx.Client,
    channel_cfg: dict,
    shuffled: list[str],
    pc_index: PostcodeSpatialIndex,
) -> None:
    """Scrape every outcode for one channel and write that channel's parquet file."""
    channel_name = channel_cfg["channel"]
    file_suffix = "buy" if channel_name == "BUY" else "rent"
    all_properties: dict[int, dict] = {}  # dedup by id
    total = len(shuffled)
    with status_lock:
        status.channel = channel_name
        status.outcodes_done = 0
        status.outcodes_total = total
    log.info("=== Starting %s channel (%d outcodes) ===", channel_name, total)

    for i, outcode in enumerate(shuffled):
        with status_lock:
            status.outcode = outcode
            status.outcodes_done = i
        log.debug("Outcode %s (%d/%d) — %d properties so far",
                  outcode, i + 1, total, len(all_properties))
        try:
            outcode_id = resolve_outcode_id(client, outcode)
            if not outcode_id:
                log.debug("No Rightmove ID for outcode %s, skipping", outcode)
                continue
            props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
            for p in props:
                # Keep the first occurrence of each listing ID.
                all_properties.setdefault(p["id"], p)
            with status_lock:
                _set_channel_count(channel_name, len(all_properties))
            log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))
        except Exception as e:
            # One failing outcode is recorded and must not abort the channel.
            msg = f"Error scraping {outcode}/{channel_name}: {e}"
            log.error(msg)
            with status_lock:
                status.errors.append(msg)
        if i < total - 1:
            time.sleep(DELAY_BETWEEN_OUTCODES)

    # Write parquet
    deduped = list(all_properties.values())
    write_parquet(deduped, DATA_DIR / f"rightmove_{file_suffix}.parquet")
    with status_lock:
        _set_channel_count(channel_name, len(deduped))
        status.outcodes_done = total
    log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))


def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Main scrape loop — runs in background thread.

    Resets the shared ``status`` record, then scrapes every channel in
    ``CHANNELS`` over a shuffled copy of ``outcodes`` and writes one parquet
    file per channel. On completion ``status.state`` becomes "done". Any
    exception escaping the per-outcode handling sets it to "error" and
    records a "Fatal: ..." message.

    Args:
        outcodes: Outcodes to scrape; the list itself is not modified.
        pc_index: Spatial index used to assign postcodes to listings.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        # Clear the previous run's end time; otherwise /status would compute
        # the elapsed time of a re-run from a stale finished_at.
        status.finished_at = 0.0
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0

    # Shuffle for geographic diversity. A private generator gives the same
    # order as seeding the module-level one, without reseeding the shared RNG
    # that supplies the retry jitter.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)

    try:
        # Creating the client inside the try means a construction failure is
        # reported as an error state instead of leaving state at "running".
        with make_client() as client:
            for channel_cfg in CHANNELS:
                _scrape_channel(client, channel_cfg, shuffled, pc_index)
        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            elapsed = status.finished_at - status.started_at
            buy_count = status.properties_buy
            rent_count = status.properties_rent
        log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
                 elapsed, buy_count, rent_count)
    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
# ---------------------------------------------------------------------------
# Startup: load data
# ---------------------------------------------------------------------------
# Both loaders read ARCGIS_PATH while this module is being imported, so any
# exception they raise propagates out of the import itself.
log.info("Loading arcgis data...")
# Outcode list and postcode index handed to run_scrape() by the /run endpoint.
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()
log.info("Ready — %d outcodes, postcode index built", len(OUTCODES))
# ---------------------------------------------------------------------------
# Flask app
# ---------------------------------------------------------------------------
app = Flask(__name__)


@app.route("/run", methods=["POST"])
def trigger_run():
    """Start a background scrape, or answer 409 if one is already running.

    The state is flipped to "running" under the lock before the thread starts,
    so two overlapping requests cannot both launch a scrape.
    """
    with status_lock:
        already_running = status.state == "running"
        if not already_running:
            status.state = "running"
    if already_running:
        return jsonify({"error": "Scrape already running"}), 409

    threading.Thread(target=run_scrape, args=(OUTCODES, PC_INDEX), daemon=True).start()
    return jsonify({"message": "Scrape started"}), 200
@app.route("/status")
def get_status():
    """Report scrape progress as JSON.

    ``elapsed_seconds`` is measured from ``started_at`` to ``finished_at``,
    or to the current time while ``finished_at`` is unset. It is 0.0 when no
    run has started. Only the 20 most recent errors are included.
    """
    with status_lock:
        started, finished = status.started_at, status.finished_at
        payload = {
            "state": status.state,
            "channel": status.channel,
            "outcode": status.outcode,
            "outcodes_done": status.outcodes_done,
            "outcodes_total": status.outcodes_total,
            "properties_buy": status.properties_buy,
            "properties_rent": status.properties_rent,
            "errors": status.errors[-20:],  # last 20 errors
        }
    elapsed = 0.0
    if started:
        elapsed = (finished or time.time()) - started
    payload["elapsed_seconds"] = round(elapsed, 1)
    return jsonify(payload)
@app.route("/debug")
def get_debug():
    """Return diagnostics as JSON.

    Includes the last raw search response, the size of the outcode ID cache,
    and the cache's first 20 entries.
    """
    cache = debug_data["outcode_cache"]
    body = {"last_response": debug_data["last_response"]}
    body["outcode_cache_size"] = len(cache)
    body["outcode_cache_sample"] = dict(list(cache.items())[:20])
    return jsonify(body)
@app.route("/data/<filename>")
def serve_data(filename):
    """Serve a parquet file from ``DATA_DIR``; other file names get a 400."""
    if filename.endswith(".parquet"):
        return send_from_directory(DATA_DIR, filename)
    return jsonify({"error": "Only parquet files served"}), 400


if __name__ == "__main__":
    # Run the Flask development server on all interfaces when executed directly.
    app.run(host="0.0.0.0", port=1234, debug=False)