710 lines
25 KiB
Python
710 lines
25 KiB
Python
import logging
|
||
import math
|
||
import os
|
||
import random
|
||
import re
|
||
import threading
|
||
import time
|
||
from collections import defaultdict
|
||
from dataclasses import dataclass, field
|
||
from pathlib import Path
|
||
|
||
import httpx
|
||
import polars as pl
|
||
from flask import Flask, jsonify, send_from_directory
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Logging
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# The log directory has to exist before the FileHandler below is constructed.
LOG_DIR = Path("/app/data")
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Root logger: everything at DEBUG and above goes to the stream handler and
# to rightmove.log inside LOG_DIR.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(LOG_DIR / "rightmove.log"),
    ],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.DEBUG,
)

# Application logger used throughout this module.
log = logging.getLogger("rightmove")
log.setLevel(logging.DEBUG)

# Only surface warnings and above from the HTTP client libraries.
for _quiet_logger in ("httpx", "httpcore"):
    logging.getLogger(_quiet_logger).setLevel(logging.WARNING)
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Constants
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Input postcode dataset; the location can be overridden with the ARCGIS_PATH
# environment variable.
ARCGIS_PATH = os.getenv("ARCGIS_PATH", "/data/arcgis_data.parquet")
# Directory the output parquet files are written to and served from.
DATA_DIR = Path("/app/data")

# Pagination of search results.
PAGE_SIZE = 24
MAX_PAGES_PER_OUTCODE = 42  # 24*42 = 1008, safety cap per outcode

# Pauses passed to time.sleep() between requests.
DELAY_BETWEEN_PAGES = 1.0
DELAY_BETWEEN_OUTCODES = 2.0

# Retry policy: number of attempts and the base of the exponential backoff.
MAX_RETRIES = 3
RETRY_BASE_DELAY = 2.0

GRID_CELL_SIZE = 0.01  # degrees for postcode spatial index
SEED = 42  # seed for the outcode shuffle

# Rightmove endpoints.
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
||
|
||
# Maps Rightmove's propertySubType strings to the canonical property types
# used in the output ("Detached", "Semi-Detached", "Terraced", "Flat",
# "Other"). Sub-types missing from this table are mapped to "Other" by
# map_property_type(), which also logs a warning for each occurrence.
PROPERTY_TYPE_MAP = {
    # Detached
    "Detached": "Detached",
    "Detached Bungalow": "Detached",
    "Link Detached": "Detached",
    "Link Detached House": "Detached",
    "House": "Detached",
    # Semi-detached
    "Semi-Detached": "Semi-Detached",
    "Semi-Detached Bungalow": "Semi-Detached",
    # Terraced
    "Terraced": "Terraced",
    "End of Terrace": "Terraced",
    "Mid Terrace": "Terraced",
    "Town House": "Terraced",
    "Mews": "Terraced",
    # Flats
    "Flat": "Flat",
    "Maisonette": "Flat",
    "Studio": "Flat",
    "Apartment": "Flat",
    "Penthouse": "Flat",
    "Ground Flat": "Flat",
    "Retirement Property": "Flat",
    # Everything else
    "Bungalow": "Other",
    "Cottage": "Other",
    "Park Home": "Other",
    "Land": "Other",
    "Farm / Barn": "Other",
    "Not Specified": "Other",
    "Chalet": "Other",
    "Barn Conversion": "Other",
    "Coach House": "Other",
    "Character Property": "Other",
    "Cluster House": "Other",
    "Plot": "Other",
    "Garages": "Other",
    # transform_property() flags this sub-type via its "house_share" field;
    # listing it here keeps the resulting type ("Other") unchanged while
    # avoiding an "Unknown propertySubType" warning for every such listing.
    "House Share": "Other",
}
|
||
|
||
# One entry per search channel; each dict supplies the query parameters that
# search_outcode() copies into the search request. Channels are scraped in
# list order.
CHANNELS = [
    {
        "channel": "BUY",
        "transactionType": "BUY",
        "sortType": "2",
    },
    {
        "channel": "RENT",
        "transactionType": "LETTING",
        "sortType": "6",
    },
]
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Postcode spatial index
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class PostcodeSpatialIndex:
|
||
"""Grid-based spatial index over arcgis postcodes for nearest-lookup."""
|
||
|
||
def __init__(self, lats: list[float], lngs: list[float], postcodes: list[str]):
|
||
self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(list)
|
||
for lat, lng, pcd in zip(lats, lngs, postcodes):
|
||
gx = int(math.floor(lng / GRID_CELL_SIZE))
|
||
gy = int(math.floor(lat / GRID_CELL_SIZE))
|
||
self.grid[(gx, gy)].append((lat, lng, pcd))
|
||
log.info("Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats))
|
||
|
||
def nearest(self, lat: float, lng: float) -> str | None:
|
||
gx = int(math.floor(lng / GRID_CELL_SIZE))
|
||
gy = int(math.floor(lat / GRID_CELL_SIZE))
|
||
best_dist = float("inf")
|
||
best_pcd = None
|
||
for dx in range(-1, 2):
|
||
for dy in range(-1, 2):
|
||
for plat, plng, pcd in self.grid.get((gx + dx, gy + dy), []):
|
||
d = (plat - lat) ** 2 + (plng - lng) ** 2
|
||
if d < best_dist:
|
||
best_dist = d
|
||
best_pcd = pcd
|
||
return best_pcd
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Scrape status
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@dataclass
class ScrapeStatus:
    """Progress snapshot of the scrape, reported by the /status endpoint."""

    # Lifecycle: idle | running | done | error
    state: str = "idle"
    # Channel and outcode currently being processed.
    channel: str = ""
    outcode: str = ""
    # Progress through the outcode list for the current channel.
    outcodes_done: int = 0
    outcodes_total: int = 0
    # Unique properties collected so far, per channel.
    properties_buy: int = 0
    properties_rent: int = 0
    # Error messages accumulated during the run.
    errors: list[str] = field(default_factory=list)
    # time.time() stamps; 0.0 means "not set".
    started_at: float = 0.0
    finished_at: float = 0.0


# Module-wide status object; the scrape thread and the Flask handlers access
# it while holding status_lock.
status = ScrapeStatus()
status_lock = threading.Lock()

# Diagnostics exposed by the /debug endpoint: the most recent search payload
# and the outcode -> Rightmove ID cache.
debug_data: dict = {"last_response": None, "outcode_cache": {}}
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# HTTP helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# User-Agent header sent with every request made through make_client().
USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/131.0.0.0",
        "Safari/537.36",
    )
)

# Gluetun control API — runs on port 8000 inside the gluetun container.
# Since finder uses network_mode: service:gluetun, localhost IS gluetun.
GLUETUN_API = "http://127.0.0.1:8000"

# Held by rotate_ip() for the whole rotation so only one runs at a time.
_ip_rotate_lock = threading.Lock()
|
||
|
||
|
||
def _gluetun_public_ip(ctl: httpx.Client) -> str | None:
    """Return the public IP reported by gluetun, or None when it has none to report."""
    resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
    if resp.status_code != 200:
        return None
    payload = resp.json()
    if not isinstance(payload, dict):
        return None
    return payload.get("public_ip") or None


def rotate_ip() -> bool:
    """Ask gluetun to reconnect the VPN so traffic leaves from a new public IP.

    The VPN is stopped and started again through the gluetun control API,
    then the reported public IP is polled every 2 seconds, up to 30 times,
    until it differs from the one seen before the restart. The whole
    rotation runs under ``_ip_rotate_lock``.

    Returns:
        True if a new, different public IP was observed; False if a control
        request failed, the poll timed out, or any error occurred.
    """
    with _ip_rotate_lock:
        log.info("Rotating VPN IP via gluetun...")
        try:
            with httpx.Client(timeout=10) as ctl:
                old_ip = _gluetun_public_ip(ctl) or "unknown"
                log.info("Current IP: %s", old_ip)

                # Force a reconnect by stopping the VPN and starting it again.
                resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"})
                if resp.status_code != 200:
                    log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text)
                    return False
                time.sleep(2)

                resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"})
                if resp.status_code != 200:
                    log.error("Failed to start VPN: %d %s", resp.status_code, resp.text)
                    return False

            # Wait for reconnection
            for _ in range(30):
                time.sleep(2)
                try:
                    with httpx.Client(timeout=10) as ctl:
                        new_ip = _gluetun_public_ip(ctl)
                except (httpx.HTTPError, ValueError) as e:
                    # Expected while the VPN is still reconnecting; keep polling.
                    log.debug("Public IP not available yet: %s", e)
                    continue
                if new_ip and new_ip != old_ip:
                    log.info("IP rotated: %s → %s", old_ip, new_ip)
                    return True

            log.warning("IP rotation timed out (may still be same IP)")
            return False

        except Exception as e:
            # Best-effort boundary: callers only need to know rotation failed.
            log.error("IP rotation failed: %s", e)
            return False
|
||
|
||
|
||
def make_client() -> httpx.Client:
    """Create the HTTP client used for Rightmove requests.

    The client sends the module's User-Agent, asks for JSON, follows
    redirects and uses a 30 second timeout. The caller is responsible for
    closing it.
    """
    default_headers = {"User-Agent": USER_AGENT, "Accept": "application/json"}
    return httpx.Client(headers=default_headers, follow_redirects=True, timeout=30)
|
||
|
||
|
||
# HTTP status codes treated as transient and retried with backoff.
_RETRYABLE_STATUS = frozenset({429, 500, 502, 503, 504})


def fetch_with_retry(
    client: httpx.Client, url: str, params: dict | None = None, on_403: bool = True
) -> dict | None:
    """GET ``url`` and return the decoded JSON body, retrying transient failures.

    Up to MAX_RETRIES attempts are made. Retryable status codes
    (429/500/502/503/504) and transport-level errors (connection failures,
    timeouts, protocol errors) are retried with exponential backoff plus
    jitter. Any other status is treated as permanent.

    On a 403, when ``on_403`` is true, the VPN IP is rotated and the request
    is retried once with ``on_403=False`` so a second 403 does not rotate
    again.

    Args:
        client: HTTP client to issue the request with.
        url: Target URL.
        params: Optional query parameters.
        on_403: Whether a 403 should trigger IP rotation.

    Returns:
        The decoded JSON body, or None on permanent failure, an undecodable
        body, or when every attempt failed.
    """
    for attempt in range(MAX_RETRIES):
        try:
            resp = client.get(url, params=params)
        except httpx.TransportError as e:
            # Covers connect/read/write/pool timeouts, connection resets and
            # protocol errors, e.g. connections that died with the old VPN tunnel.
            failure = type(e).__name__
        else:
            code = resp.status_code
            if code == 200:
                try:
                    return resp.json()
                except ValueError:
                    log.error("HTTP 200 from %s but body is not valid JSON", url)
                    return None
            if code == 403 and on_403:
                log.warning("HTTP 403 — IP likely blocked, rotating...")
                if rotate_ip():
                    # Retry once with new IP (but don't recurse on 403 again)
                    return fetch_with_retry(client, url, params, on_403=False)
                log.error("IP rotation failed, giving up on %s", url)
                return None
            if code not in _RETRYABLE_STATUS:
                log.error("HTTP %d from %s (non-retryable)", code, url)
                return None
            failure = f"HTTP {code}"

        if attempt + 1 >= MAX_RETRIES:
            # Nothing follows the final attempt, so do not sleep or announce a retry.
            log.warning("%s from %s on final attempt %d/%d", failure, url, attempt + 1, MAX_RETRIES)
            break
        delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
        log.warning(
            "%s from %s, retry %d/%d in %.1fs", failure, url, attempt + 1, MAX_RETRIES - 1, delay
        )
        time.sleep(delay)

    log.error("All %d attempts failed for %s", MAX_RETRIES, url)
    return None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Rightmove API
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Return Rightmove's internal identifier for ``outcode``, or None.

    Successful lookups are memoised in ``debug_data["outcode_cache"]``;
    failed or empty lookups are not cached. The ID comes from the first
    typeahead match whose type is "OUTCODE" and whose display name equals
    the outcode exactly.
    """
    cache = debug_data["outcode_cache"]
    if outcode in cache:
        return cache[outcode]

    query = {"query": outcode, "limit": "10", "exclude": "STREET"}
    payload = fetch_with_retry(client, TYPEAHEAD_URL, query)
    if not payload:
        return None

    for candidate in payload.get("matches", []):
        is_exact_outcode = (
            candidate.get("type") == "OUTCODE" and candidate.get("displayName") == outcode
        )
        if not is_exact_outcode:
            continue
        rightmove_id = str(candidate["id"])
        cache[outcode] = rightmove_id
        return rightmove_id

    log.debug("Outcode %s not found in typeahead results", outcode)
    return None
|
||
|
||
|
||
def _parse_result_count(raw: object) -> int | None:
    """Parse a resultCount value such as "1,234" or 1234; None if it is not a number."""
    try:
        return int(str(raw).replace(",", ""))
    except ValueError:
        return None


def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Paginate through search results for one outcode+channel.

    Pages of PAGE_SIZE results are requested until a page fails to load, a
    page has no properties, the offset reaches the reported ``resultCount``,
    or MAX_PAGES_PER_OUTCODE pages have been fetched. Each successful payload
    is also stored in ``debug_data["last_response"]``.

    Args:
        client: HTTP client to use.
        outcode_id: Rightmove's internal ID for the outcode.
        outcode: The outcode itself, used for logging and tagging results.
        channel_cfg: One entry of CHANNELS.
        pc_index: Spatial index used to assign a postcode to each property.

    Returns:
        Transformed properties. If pagination stops early, the results
        gathered up to that point are still returned.
    """
    properties = []
    channel = channel_cfg["channel"]

    for page in range(MAX_PAGES_PER_OUTCODE):
        index = page * PAGE_SIZE
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel,
            "transactionType": channel_cfg["transactionType"],
        }

        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning("Failed to fetch page %d for %s/%s", page, outcode, channel)
            break

        debug_data["last_response"] = data

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        for prop in raw_props:
            transformed = transform_property(prop, outcode, pc_index)
            if transformed:
                properties.append(transformed)

        # Check if there are more pages. An unusable count ends pagination
        # instead of raising, so the pages already collected are kept.
        result_count = _parse_result_count(data.get("resultCount", "0"))
        if result_count is None:
            log.warning(
                "Unparseable resultCount %r for %s/%s, stopping pagination",
                data.get("resultCount"), outcode, channel,
            )
            break
        if index + PAGE_SIZE >= result_count:
            break

        if page < MAX_PAGES_PER_OUTCODE - 1:
            time.sleep(DELAY_BETWEEN_PAGES)

    return properties
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Property transformation
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def parse_display_size(display_size: str | None) -> float | None:
|
||
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
|
||
if not display_size:
|
||
return None
|
||
# Try sq. ft. first
|
||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
|
||
if m:
|
||
sqft = float(m.group(1).replace(",", ""))
|
||
return round(sqft * 0.092903, 1)
|
||
# Try sq. m.
|
||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
|
||
if m:
|
||
return round(float(m.group(1).replace(",", "")), 1)
|
||
return None
|
||
|
||
|
||
def map_property_type(sub_type: str | None) -> str:
|
||
"""Map propertySubType to canonical type."""
|
||
if not sub_type:
|
||
return "Other"
|
||
canonical = PROPERTY_TYPE_MAP.get(sub_type)
|
||
if canonical:
|
||
return canonical
|
||
log.warning("Unknown propertySubType: %r — mapping to Other", sub_type)
|
||
return "Other"
|
||
|
||
|
||
def extract_tenure(tenure_obj: dict | None) -> str | None:
|
||
"""Extract tenure string from tenure object."""
|
||
if not tenure_obj:
|
||
return None
|
||
tt = tenure_obj.get("tenureType", "")
|
||
if tt == "FREEHOLD":
|
||
return "Freehold"
|
||
if tt == "LEASEHOLD":
|
||
return "Leasehold"
|
||
return None
|
||
|
||
|
||
def fix_coords(lat: float, lng: float) -> tuple[float, float]:
    """Return (lat, lng), swapping the pair when it looks reversed.

    The expected window is England: lat ~49–56, lng ~-7–2. A pair already in
    the window is returned unchanged. A pair that fits only after swapping
    is returned swapped. Anything else is returned unchanged with a warning.
    """

    def in_window(latitude: float, longitude: float) -> bool:
        return 49 <= latitude <= 56 and -7 <= longitude <= 2

    if in_window(lat, lng):
        return lat, lng
    if not in_window(lng, lat):
        log.warning("Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f", lat, lng)
        return lat, lng
    log.debug("Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f", lat, lng, lng, lat)
    return lng, lat
|
||
|
||
|
||
def normalize_price(amount: int, frequency: str) -> int:
|
||
"""Normalize price to monthly for rentals (weekly × 52/12, yearly ÷ 12)."""
|
||
if frequency == "weekly":
|
||
return round(amount * 52 / 12)
|
||
if frequency == "yearly":
|
||
return round(amount / 12)
|
||
return amount
|
||
|
||
|
||
def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex) -> dict | None:
    """Transform a raw Rightmove property dict into our output schema.

    Nested containers (location, price, displayPrices, keyFeatures,
    listingUpdate) and ``propertyUrl`` are treated as absent when they are
    present but null, so one incomplete listing cannot raise and abort the
    rest of its outcode.

    Args:
        prop: One entry of the search response's "properties" list.
        outcode: Outcode the property was found under.
        pc_index: Spatial index used to look up the nearest postcode.

    Returns:
        A dict with the columns written by write_parquet(), or None when the
        property has no coordinates or no price amount.
    """
    loc = prop.get("location") or {}
    raw_lat = loc.get("latitude")
    raw_lng = loc.get("longitude")
    if raw_lat is None or raw_lng is None:
        return None
    lat, lng = fix_coords(raw_lat, raw_lng)

    price_obj = prop.get("price") or {}
    amount = price_obj.get("amount")
    if amount is None:
        return None
    frequency = price_obj.get("frequency", "")
    price = normalize_price(int(amount), frequency)

    # Only the first display price carries the qualifier we keep.
    display_prices = price_obj.get("displayPrices") or []
    first_display = (display_prices[0] or {}) if display_prices else {}
    price_qualifier = first_display.get("displayPriceQualifier", "")

    sub_type = prop.get("propertySubType", "")
    bedrooms = prop.get("bedrooms", 0) or 0
    bathrooms = prop.get("bathrooms", 0) or 0

    key_features = [
        kf["description"]
        for kf in prop.get("keyFeatures") or []
        if kf and kf.get("description")
    ]

    listing_update = prop.get("listingUpdate") or {}

    return {
        "id": prop.get("id"),
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "total_rooms": bedrooms + bathrooms,
        "longitude": lng,
        "latitude": lat,
        "postcode": pc_index.nearest(lat, lng),
        "address": prop.get("displayAddress", ""),
        "tenure": extract_tenure(prop.get("tenure")),
        "property_type": map_property_type(sub_type),
        "property_sub_type": sub_type or "Unknown",
        "price": price,
        "price_frequency": frequency,
        "price_qualifier": price_qualifier,
        "floorspace_sqm": parse_display_size(prop.get("displaySize")),
        "url": RIGHTMOVE_BASE + (prop.get("propertyUrl") or ""),
        "features": key_features,
        "first_visible_date": prop.get("firstVisibleDate", ""),
        "update_date": listing_update.get("listingUpdateDate", ""),
        "outcode": outcode,
        "house_share": sub_type == "House Share",
    }
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Parquet writing
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def write_parquet(properties: list[dict], path: Path) -> None:
    """Write properties list to parquet using Polars.

    Columns are derived from the schema below, so the column list and the
    dtypes cannot drift apart. The frame is written to a sibling
    ``<name>.tmp`` file and then moved over ``path``, so a reader of ``path``
    never sees a partially written file.

    Args:
        properties: Dicts as produced by transform_property(); each must
            contain every schema column.
        path: Destination parquet file.

    Nothing is written (and any existing file is left alone) when
    ``properties`` is empty.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    schema = {
        "id": pl.Int64,
        "bedrooms": pl.Int32,
        "bathrooms": pl.Int32,
        "total_rooms": pl.Int32,
        "longitude": pl.Float64,
        "latitude": pl.Float64,
        "postcode": pl.Utf8,
        "address": pl.Utf8,
        "tenure": pl.Utf8,
        "property_type": pl.Utf8,
        "property_sub_type": pl.Utf8,
        "price": pl.Int64,
        "price_frequency": pl.Utf8,
        "price_qualifier": pl.Utf8,
        "floorspace_sqm": pl.Float64,
        "url": pl.Utf8,
        "features": pl.List(pl.Utf8),
        "first_visible_date": pl.Utf8,
        "update_date": pl.Utf8,
        "outcode": pl.Utf8,
        "house_share": pl.Boolean,
    }
    columns = {name: [p[name] for p in properties] for name in schema}
    df = pl.DataFrame(columns, schema=schema)

    # The temp name does not end in ".parquet", so /data/<filename> will not serve it.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        df.write_parquet(tmp_path)
        tmp_path.replace(path)
    except Exception:
        tmp_path.unlink(missing_ok=True)
        raise
    log.info("Wrote %d properties to %s", len(df), path)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Scrape orchestration
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def load_outcodes() -> list[str]:
    """Load England-only outcodes from arcgis parquet.

    Rows are filtered to country code "E92000001" and the outcode is taken
    from the start of each ``pcd`` value. Only the two columns actually
    used are read from the file.

    Returns:
        Sorted list of unique outcodes.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry"])
    england = df.filter(pl.col("ctry") == "E92000001")
    log.info("England postcodes: %d", len(england))

    outcodes = (
        england.select(pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode"))
        .drop_nulls()  # postcodes the pattern does not match
        .get_column("outcode")
        .unique()
        .sort()
        .to_list()
    )
    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes
|
||
|
||
|
||
def build_postcode_index() -> PostcodeSpatialIndex:
    """Create the postcode spatial index from the arcgis parquet file.

    Only rows with country code "E92000001" and non-null coordinates are
    indexed.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    frame = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    is_england = pl.col("ctry") == "E92000001"
    located = frame.filter(is_england).drop_nulls(subset=["lat", "long"])
    lats, lngs, pcds = (located.get_column(name).to_list() for name in ("lat", "long", "pcd"))
    return PostcodeSpatialIndex(lats, lngs, pcds)
|
||
|
||
|
||
def _set_channel_count(channel_name: str, count: int) -> None:
    """Store the property count for a channel in ``status``; caller holds status_lock."""
    if channel_name == "BUY":
        status.properties_buy = count
    else:
        status.properties_rent = count


def _scrape_channel(
    client: httpx.Client,
    channel_cfg: dict,
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
) -> None:
    """Scrape every outcode for one channel and write that channel's parquet file."""
    channel_name = channel_cfg["channel"]
    file_suffix = "buy" if channel_name == "BUY" else "rent"
    all_properties: dict[int, dict] = {}  # dedup by id
    total = len(outcodes)

    with status_lock:
        status.channel = channel_name
        status.outcodes_done = 0
        status.outcodes_total = total

    log.info("=== Starting %s channel (%d outcodes) ===", channel_name, total)

    for i, outcode in enumerate(outcodes):
        with status_lock:
            status.outcode = outcode
            status.outcodes_done = i

        log.debug("Outcode %s (%d/%d) — %d properties so far",
                  outcode, i + 1, total, len(all_properties))

        try:
            outcode_id = resolve_outcode_id(client, outcode)
            if not outcode_id:
                log.debug("No Rightmove ID for outcode %s, skipping", outcode)
                # NOTE: a skipped outcode also skips the inter-outcode delay below.
                continue

            props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
            for p in props:
                # The first occurrence of an id wins.
                all_properties.setdefault(p["id"], p)

            with status_lock:
                _set_channel_count(channel_name, len(all_properties))

            log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))

        except Exception as e:
            # One failing outcode must not stop the run; record it and move on.
            msg = f"Error scraping {outcode}/{channel_name}: {e}"
            log.error(msg)
            with status_lock:
                status.errors.append(msg)

        if i < total - 1:
            time.sleep(DELAY_BETWEEN_OUTCODES)

    # Write parquet
    deduped = list(all_properties.values())
    write_parquet(deduped, DATA_DIR / f"rightmove_{file_suffix}.parquet")

    with status_lock:
        _set_channel_count(channel_name, len(deduped))
        status.outcodes_done = total

    log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))


def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Main scrape loop — runs in background thread.

    Resets the shared ``status``, visits the outcodes in a fixed shuffled
    order for each entry of CHANNELS, and writes one parquet file per
    channel. Per-outcode errors are recorded in ``status.errors``; any other
    exception marks the run as "error". The HTTP client is always closed.

    Args:
        outcodes: Outcodes to scrape; the list itself is not modified.
        pc_index: Spatial index used to assign postcodes to properties.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        # Clear the previous run's end time, otherwise /status computes the
        # elapsed time of this run against a stale (earlier) timestamp.
        status.finished_at = 0.0
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0

    # Shuffle for geographic diversity. A private generator gives the same
    # order as seeding the module-level one, without resetting the global
    # state that the retry jitter draws from.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)

    client = make_client()

    try:
        for channel_cfg in CHANNELS:
            _scrape_channel(client, channel_cfg, shuffled, pc_index)

        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            elapsed = status.finished_at - status.started_at
            buy_count = status.properties_buy
            rent_count = status.properties_rent
        log.info("Scrape complete in %.0fs — buy: %d, rent: %d", elapsed, buy_count, rent_count)

    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
    finally:
        client.close()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Startup: load data
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Runs at import time: the outcode list and the postcode index are loaded
# once and shared by every scrape started through the /run endpoint.
log.info("Loading arcgis data...")
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()
log.info("Ready — %d outcodes, postcode index built", len(OUTCODES))

# ---------------------------------------------------------------------------
# Flask app
# ---------------------------------------------------------------------------

# Application object the route handlers below are registered on.
app = Flask(__name__)
|
||
|
||
|
||
@app.route("/run", methods=["POST"])
def trigger_run():
    """Start a scrape in a background thread.

    Responds 409 when a scrape is already running. Otherwise the state is
    flipped to "running" under the lock before the thread is started, so a
    second request cannot start another run in between.
    """
    with status_lock:
        already_running = status.state == "running"
        if not already_running:
            status.state = "running"

    if already_running:
        return jsonify({"error": "Scrape already running"}), 409

    worker = threading.Thread(target=run_scrape, args=(OUTCODES, PC_INDEX), daemon=True)
    worker.start()
    return jsonify({"message": "Scrape started"}), 200
|
||
|
||
|
||
@app.route("/status")
def get_status():
    """Return the current scrape status as JSON.

    ``elapsed_seconds`` is measured up to now while a scrape is running and
    up to ``finished_at`` once it has ended. It is 0 before the first run
    and is never negative.
    """
    with status_lock:
        elapsed = 0.0
        if status.started_at:
            # While running, ignore finished_at: it may still hold the end
            # time of a previous run, which would give a negative duration.
            still_running = status.state == "running" or not status.finished_at
            end = time.time() if still_running else status.finished_at
            elapsed = max(0.0, end - status.started_at)
        return jsonify({
            "state": status.state,
            "channel": status.channel,
            "outcode": status.outcode,
            "outcodes_done": status.outcodes_done,
            "outcodes_total": status.outcodes_total,
            "properties_buy": status.properties_buy,
            "properties_rent": status.properties_rent,
            "errors": status.errors[-20:],  # last 20 errors
            "elapsed_seconds": round(elapsed, 1),
        })
|
||
|
||
|
||
@app.route("/debug")
def get_debug():
    """Return diagnostics: the last search payload and a view of the outcode cache."""
    cache = debug_data["outcode_cache"]
    # list() snapshots the items in one step before slicing off the first 20.
    first_entries = list(cache.items())[:20]
    payload = {
        "last_response": debug_data["last_response"],
        "outcode_cache_size": len(cache),
        "outcode_cache_sample": dict(first_entries),
    }
    return jsonify(payload)
|
||
|
||
|
||
@app.route("/data/<filename>")
def serve_data(filename):
    """Serve a parquet file from DATA_DIR; any other extension gets a 400."""
    if filename.endswith(".parquet"):
        return send_from_directory(DATA_DIR, filename)
    return jsonify({"error": "Only parquet files served"}), 400
|
||
|
||
|
||
if __name__ == "__main__":
    # Direct execution: serve on every interface, port 1234, debug mode off.
    app.run(debug=False, port=1234, host="0.0.0.0")
|