More
This commit is contained in:
parent
128b3191e7
commit
03445188ea
54 changed files with 596953 additions and 3577 deletions
|
|
@ -6,6 +6,6 @@ WORKDIR /app
|
|||
COPY pyproject.toml ./
|
||||
RUN uv pip install --system -r pyproject.toml
|
||||
|
||||
COPY main.py ./
|
||||
COPY *.py ./
|
||||
|
||||
CMD ["python3", "main.py"]
|
||||
|
|
|
|||
56
finder/constants.py
Normal file
56
finder/constants.py
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Path of the ArcGIS postcode parquet; can be overridden with the ARCGIS_PATH
# environment variable.
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
# Directory the scraper writes its rightmove_*.parquet output files into.
DATA_DIR = Path("/app/data")
# Step by which the search "index" offset advances from one page to the next.
PAGE_SIZE = 24
# Pauses, in seconds (passed to time.sleep), between result pages / outcodes.
DELAY_BETWEEN_PAGES = 1.0
DELAY_BETWEEN_OUTCODES = 2.0
# Number of attempts the HTTP helper makes per request before giving up.
MAX_RETRIES = 3
# Base backoff in seconds; the HTTP helper doubles it per attempt and adds jitter.
RETRY_BASE_DELAY = 2.0
GRID_CELL_SIZE = 0.01  # degrees for postcode spatial index
# Seed for the outcode shuffle, so the visiting order is reproducible.
SEED = 42

TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
# Prefix joined onto each listing's relative propertyUrl.
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"

# Maps Rightmove "propertySubType" strings to the canonical property types
# used in the output (Detached, Semi-Detached, Terraced, Flat, Other).
PROPERTY_TYPE_MAP = {
    "Detached": "Detached",
    "Semi-Detached": "Semi-Detached",
    "Terraced": "Terraced",
    "End of Terrace": "Terraced",
    "Mid Terrace": "Terraced",
    "Flat": "Flat",
    "Maisonette": "Flat",
    "Studio": "Flat",
    "Apartment": "Flat",
    "Penthouse": "Flat",
    "Ground Flat": "Flat",
    "Detached Bungalow": "Detached",
    "Semi-Detached Bungalow": "Semi-Detached",
    "Town House": "Terraced",
    "Link Detached": "Detached",
    "Link Detached House": "Detached",
    "Bungalow": "Other",
    "Cottage": "Other",
    "Park Home": "Other",
    "Land": "Other",
    "Farm / Barn": "Other",
    "House": "Detached",
    "Not Specified": "Other",
    "Chalet": "Other",
    "Barn Conversion": "Other",
    "Coach House": "Other",
    "Character Property": "Other",
    "Cluster House": "Other",
    "Retirement Property": "Flat",
    "Plot": "Other",
    "Garages": "Other",
    "Mews": "Terraced",
}

# One entry per scraped channel; the values are sent as search query parameters.
CHANNELS = [
    {"channel": "BUY", "transactionType": "BUY", "sortType": "2"},
    {"channel": "RENT", "transactionType": "LETTING", "sortType": "6"},
]
|
||||
126
finder/http_client.py
Normal file
126
finder/http_client.py
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
import logging
|
||||
import random
|
||||
import threading
|
||||
import time
|
||||
|
||||
import httpx
|
||||
from fake_useragent import UserAgent
|
||||
|
||||
from constants import MAX_RETRIES, RETRY_BASE_DELAY
|
||||
from metrics import http_errors_total, http_requests_total, ip_rotations_total
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
_ua = UserAgent(browsers=["Chrome", "Edge"], os=["Windows", "Mac OS X"], min_version=120.0)
|
||||
|
||||
|
||||
def _endpoint_label(url: str) -> str:
|
||||
if "typeahead" in url:
|
||||
return "typeahead"
|
||||
if "search" in url:
|
||||
return "search"
|
||||
return "other"
|
||||
|
||||
|
||||
def _status_label(code: int) -> str:
|
||||
if code >= 500:
|
||||
return "5xx"
|
||||
return str(code)
|
||||
|
||||
# Gluetun control API — runs on port 8000 inside the gluetun container.
# Since finder uses network_mode: service:gluetun, localhost IS gluetun.
GLUETUN_API = "http://127.0.0.1:8000"
_ip_rotate_lock = threading.Lock()


def _restart_vpn_and_wait() -> bool:
    """Stop and restart the VPN via gluetun, then poll until the public IP changes.

    Returns True as soon as gluetun reports a non-empty public IP that differs
    from the one seen before the restart; False if a control request is
    rejected or the IP has not changed after 30 polls.
    """
    with httpx.Client(timeout=10) as ctl:
        # Remember the current IP so a change can be detected afterwards.
        # If it cannot be read, "unknown" makes any later IP count as new.
        old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
        old_ip = (
            old_ip_resp.json().get("public_ip", "unknown")
            if old_ip_resp.status_code == 200
            else "unknown"
        )
        log.info("Current IP: %s", old_ip)

        # Trigger a reconnect by setting the VPN status to "stopped" and then
        # back to "running".
        resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"})
        if resp.status_code != 200:
            log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text)
            return False
        time.sleep(2)

        resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"})
        if resp.status_code != 200:
            log.error("Failed to start VPN: %d %s", resp.status_code, resp.text)
            return False

    # Wait for reconnection: up to 30 polls, 2 seconds apart.
    for _ in range(30):
        time.sleep(2)
        try:
            with httpx.Client(timeout=10) as ctl:
                new_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
            if new_ip_resp.status_code == 200:
                new_ip = new_ip_resp.json().get("public_ip", "")
                if new_ip and new_ip != old_ip:
                    log.info("IP rotated: %s → %s", old_ip, new_ip)
                    return True
        except Exception as exc:
            # Best effort: errors are expected while the VPN is still
            # reconnecting, so keep polling but leave a trace for debugging.
            log.debug("Public IP not available yet: %s", exc)

    log.warning("IP rotation timed out (may still be same IP)")
    return False


def rotate_ip() -> bool:
    """Ask gluetun to reconnect to a different VPN server, getting a new IP.

    Rotations are serialised by ``_ip_rotate_lock``. Every attempt is counted
    in ``ip_rotations_total`` with ``result="success"`` or ``"failure"``,
    including attempts where gluetun rejects the stop/start request. Never
    raises: unexpected errors are logged and reported as a failure.

    Returns:
        True if the IP changed successfully, False otherwise.
    """
    with _ip_rotate_lock:
        log.info("Rotating VPN IP via gluetun...")
        try:
            rotated = _restart_vpn_and_wait()
        except Exception as e:
            log.error("IP rotation failed: %s", e)
            rotated = False
        ip_rotations_total.labels(result="success" if rotated else "failure").inc()
        return rotated
|
||||
|
||||
|
||||
def make_client() -> httpx.Client:
    """Build the scraper's HTTP client.

    Each call draws a fresh User-Agent from ``_ua`` and asks for JSON; the
    client follows redirects and uses a 30 second timeout.
    """
    default_headers = {"User-Agent": _ua.random, "Accept": "application/json"}
    return httpx.Client(timeout=30, headers=default_headers, follow_redirects=True)
|
||||
|
||||
|
||||
def fetch_with_retry(
    client: httpx.Client, url: str, params: dict | None = None, on_403: bool = True
) -> dict | None:
    """GET JSON with retries on 429/5xx and transport errors.

    Args:
        client: httpx client used for the request.
        url: Absolute URL to fetch.
        params: Optional query parameters.
        on_403: When True, a 403 triggers one VPN IP rotation followed by a
            fresh round of attempts with ``on_403=False``.

    Returns:
        The decoded JSON body of the first 200 response, or None on a
        non-retryable status, a failed IP rotation, or exhausted retries.

    Retries use exponential backoff (``RETRY_BASE_DELAY * 2**attempt`` plus up
    to one second of jitter). Timeouts of every kind, network errors and
    remote protocol errors are retried; no sleep follows the final attempt.
    """
    endpoint = _endpoint_label(url)
    for attempt in range(MAX_RETRIES):
        try:
            resp = client.get(url, params=params)
        except (httpx.TimeoutException, httpx.NetworkError, httpx.RemoteProtocolError) as e:
            # Covers ConnectTimeout / ReadError / RemoteProtocolError as well
            # as the ConnectError and Read/Write/Pool timeouts.
            failure = type(e).__name__
            http_errors_total.labels(type=failure).inc()
        else:
            http_requests_total.labels(status=_status_label(resp.status_code), endpoint=endpoint).inc()
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code == 403 and on_403:
                log.warning("HTTP 403 — IP likely blocked, rotating...")
                if rotate_ip():
                    # Retry once with new IP (but don't recurse on 403 again)
                    return fetch_with_retry(client, url, params, on_403=False)
                log.error("IP rotation failed, giving up on %s", url)
                return None
            if resp.status_code not in (429, 500, 502, 503, 504):
                log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
                return None
            failure = f"HTTP {resp.status_code}"

        if attempt + 1 < MAX_RETRIES:
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning("%s from %s, retry %d/%d in %.1fs", failure, url, attempt + 1, MAX_RETRIES, delay)
            time.sleep(delay)
        else:
            # Nothing left to retry, so do not waste a backoff sleep.
            log.warning("%s from %s, no retries left", failure, url)

    http_errors_total.labels(type="retry_exhausted").inc()
    log.error("All %d retries exhausted for %s", MAX_RETRIES, url)
    return None
|
||||
643
finder/main.py
643
finder/main.py
|
|
@ -1,17 +1,21 @@
|
|||
import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
from flask import Flask, jsonify, send_from_directory
|
||||
from flask import Flask, Response, jsonify, send_from_directory
|
||||
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
|
||||
|
||||
from constants import DATA_DIR
|
||||
from rightmove import outcode_cache
|
||||
from scraper import (
|
||||
_sync_gauges,
|
||||
build_postcode_index,
|
||||
load_outcodes,
|
||||
run_scrape,
|
||||
status,
|
||||
status_lock,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging
|
||||
|
|
@ -33,615 +37,6 @@ log.setLevel(logging.DEBUG)
|
|||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Path of the ArcGIS postcode parquet; can be overridden with the ARCGIS_PATH
# environment variable.
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
# Directory the rightmove_*.parquet output files are written into.
DATA_DIR = Path("/app/data")
# Step by which the search "index" offset advances from one page to the next.
PAGE_SIZE = 24
MAX_PAGES_PER_OUTCODE = 42  # 24*42 = 1008, safety cap per outcode
# Pauses, in seconds (passed to time.sleep), between result pages / outcodes.
DELAY_BETWEEN_PAGES = 1.0
DELAY_BETWEEN_OUTCODES = 2.0
# Number of attempts fetch_with_retry makes per request before giving up.
MAX_RETRIES = 3
# Base backoff in seconds; doubled per attempt with added jitter.
RETRY_BASE_DELAY = 2.0
GRID_CELL_SIZE = 0.01  # degrees for postcode spatial index
# Seed for the outcode shuffle, so the visiting order is reproducible.
SEED = 42

TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
# Prefix joined onto each listing's relative propertyUrl.
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"

# Maps Rightmove "propertySubType" strings to canonical property types;
# map_property_type falls back to "Other" for anything not listed here.
PROPERTY_TYPE_MAP = {
    "Detached": "Detached",
    "Semi-Detached": "Semi-Detached",
    "Terraced": "Terraced",
    "End of Terrace": "Terraced",
    "Mid Terrace": "Terraced",
    "Flat": "Flat",
    "Maisonette": "Flat",
    "Studio": "Flat",
    "Apartment": "Flat",
    "Penthouse": "Flat",
    "Ground Flat": "Flat",
    "Detached Bungalow": "Detached",
    "Semi-Detached Bungalow": "Semi-Detached",
    "Town House": "Terraced",
    "Link Detached": "Detached",
    "Link Detached House": "Detached",
    "Bungalow": "Other",
    "Cottage": "Other",
    "Park Home": "Other",
    "Land": "Other",
    "Farm / Barn": "Other",
    "House": "Detached",
    "Not Specified": "Other",
    "Chalet": "Other",
    "Barn Conversion": "Other",
    "Coach House": "Other",
    "Character Property": "Other",
    "Cluster House": "Other",
    "Retirement Property": "Flat",
    "Plot": "Other",
    "Garages": "Other",
    "Mews": "Terraced",
}

# One entry per scraped channel; the values are sent as search query parameters.
CHANNELS = [
    {"channel": "BUY", "transactionType": "BUY", "sortType": "2"},
    {"channel": "RENT", "transactionType": "LETTING", "sortType": "6"},
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Postcode spatial index
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class PostcodeSpatialIndex:
    """Grid-based spatial index over arcgis postcodes for nearest-lookup.

    Points are bucketed into square cells of ``GRID_CELL_SIZE`` degrees keyed
    by ``(x, y)`` = (longitude cell, latitude cell). A lookup only inspects
    the query's own cell and its eight neighbours.
    """

    def __init__(self, lats: list[float], lngs: list[float], postcodes: list[str]):
        """Bucket every ``(lat, lng, postcode)`` triple into its grid cell."""
        self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(list)
        for lat, lng, pcd in zip(lats, lngs, postcodes):
            self.grid[self._cell(lat, lng)].append((lat, lng, pcd))
        log.info("Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats))

    @staticmethod
    def _cell(lat: float, lng: float) -> tuple[int, int]:
        """Return the ``(x, y)`` grid cell containing the coordinate."""
        return (
            int(math.floor(lng / GRID_CELL_SIZE)),
            int(math.floor(lat / GRID_CELL_SIZE)),
        )

    def nearest(self, lat: float, lng: float) -> str | None:
        """Return the postcode closest to ``(lat, lng)``, or None.

        None is returned when the 3x3 block of cells around the query holds no
        postcode. Longitude differences are scaled by ``cos(lat)`` so that the
        comparison approximates ground distance instead of raw degrees (a
        degree of longitude is shorter than a degree of latitude away from
        the equator).
        """
        gx, gy = self._cell(lat, lng)
        lng_scale = math.cos(math.radians(lat))
        best_dist = float("inf")
        best_pcd = None
        for dx in range(-1, 2):
            for dy in range(-1, 2):
                # .get() avoids creating empty cells in the defaultdict.
                for plat, plng, pcd in self.grid.get((gx + dx, gy + dy), []):
                    d = (plat - lat) ** 2 + ((plng - lng) * lng_scale) ** 2
                    if d < best_dist:
                        best_dist = d
                        best_pcd = pcd
        return best_pcd
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scrape status
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class ScrapeStatus:
    """Mutable snapshot of scrape progress, updated by run_scrape under status_lock."""

    state: str = "idle"  # idle | running | done | error
    channel: str = ""  # channel currently being scraped ("BUY" / "RENT")
    outcode: str = ""  # outcode currently being processed
    outcodes_done: int = 0  # outcodes handled so far in the current channel
    outcodes_total: int = 0  # outcodes to handle in the current channel
    properties_buy: int = 0  # unique BUY properties collected
    properties_rent: int = 0  # unique RENT properties collected
    errors: list[str] = field(default_factory=list)  # error messages recorded during the run
    started_at: float = 0.0  # time.time() when the scrape started
    finished_at: float = 0.0  # time.time() when the scrape finished or failed


# Single shared status instance and the lock guarding updates to it.
status = ScrapeStatus()
status_lock = threading.Lock()
# Debug state: the most recent search response and the outcode -> Rightmove ID cache.
debug_data: dict = {"last_response": None, "outcode_cache": {}}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTTP helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
USER_AGENT = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
# Gluetun control API — runs on port 8000 inside the gluetun container.
# Since finder uses network_mode: service:gluetun, localhost IS gluetun.
GLUETUN_API = "http://127.0.0.1:8000"
_ip_rotate_lock = threading.Lock()


def rotate_ip() -> bool:
    """Ask gluetun to reconnect to a different VPN server, getting a new IP.

    Rotations are serialised by ``_ip_rotate_lock``. The VPN is stopped and
    started again through the control API, then the public IP is polled up to
    30 times, 2 seconds apart. Never raises: errors are logged and reported
    as False.

    Returns True if the IP changed successfully."""
    with _ip_rotate_lock:
        log.info("Rotating VPN IP via gluetun...")
        try:
            # Get current IP ("unknown" if it cannot be read, in which case
            # any IP seen later counts as changed)
            with httpx.Client(timeout=10) as ctl:
                old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
                old_ip = old_ip_resp.json().get("public_ip", "unknown") if old_ip_resp.status_code == 200 else "unknown"
                log.info("Current IP: %s", old_ip)

                # Trigger a reconnect by setting the VPN status to "stopped"
                # and then back to "running"
                resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"})
                if resp.status_code != 200:
                    log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text)
                    return False
                time.sleep(2)

                resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"})
                if resp.status_code != 200:
                    log.error("Failed to start VPN: %d %s", resp.status_code, resp.text)
                    return False

            # Wait for reconnection
            for _ in range(30):
                time.sleep(2)
                try:
                    with httpx.Client(timeout=10) as ctl:
                        new_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
                    if new_ip_resp.status_code == 200:
                        new_ip = new_ip_resp.json().get("public_ip", "")
                        if new_ip and new_ip != old_ip:
                            log.info("IP rotated: %s → %s", old_ip, new_ip)
                            return True
                except Exception:
                    pass  # VPN still reconnecting

            log.warning("IP rotation timed out (may still be same IP)")
            return False

        except Exception as e:
            log.error("IP rotation failed: %s", e)
            return False
|
||||
|
||||
|
||||
def make_client() -> httpx.Client:
    """Build the scraper's HTTP client.

    The client sends the fixed ``USER_AGENT``, asks for JSON, follows
    redirects and uses a 30 second timeout.
    """
    default_headers = {"User-Agent": USER_AGENT, "Accept": "application/json"}
    return httpx.Client(timeout=30, headers=default_headers, follow_redirects=True)
|
||||
|
||||
|
||||
def fetch_with_retry(
    client: httpx.Client, url: str, params: dict | None = None, on_403: bool = True
) -> dict | None:
    """GET JSON with retries on 429/5xx and transport errors.

    Args:
        client: httpx client used for the request.
        url: Absolute URL to fetch.
        params: Optional query parameters.
        on_403: When True, a 403 triggers one VPN IP rotation followed by a
            fresh round of attempts with ``on_403=False``.

    Returns:
        The decoded JSON body of the first 200 response, or None on a
        non-retryable status, a failed IP rotation, or exhausted retries.

    Retries use exponential backoff (``RETRY_BASE_DELAY * 2**attempt`` plus up
    to one second of jitter). Timeouts of every kind, network errors and
    remote protocol errors are retried; no sleep follows the final attempt.
    """
    for attempt in range(MAX_RETRIES):
        try:
            resp = client.get(url, params=params)
        except (httpx.TimeoutException, httpx.NetworkError, httpx.RemoteProtocolError) as e:
            # Covers ConnectTimeout / ReadError / RemoteProtocolError as well
            # as the ConnectError and Read/Write/Pool timeouts.
            failure = type(e).__name__
        else:
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code == 403 and on_403:
                log.warning("HTTP 403 — IP likely blocked, rotating...")
                if rotate_ip():
                    # Retry once with new IP (but don't recurse on 403 again)
                    return fetch_with_retry(client, url, params, on_403=False)
                log.error("IP rotation failed, giving up on %s", url)
                return None
            if resp.status_code not in (429, 500, 502, 503, 504):
                log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
                return None
            failure = f"HTTP {resp.status_code}"

        if attempt + 1 < MAX_RETRIES:
            delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
            log.warning("%s from %s, retry %d/%d in %.1fs", failure, url, attempt + 1, MAX_RETRIES, delay)
            time.sleep(delay)
        else:
            # Nothing left to retry, so do not waste a backoff sleep.
            log.warning("%s from %s, no retries left", failure, url)

    log.error("All %d retries exhausted for %s", MAX_RETRIES, url)
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Rightmove API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Return Rightmove's internal ID for *outcode*, using the typeahead API.

    Successful lookups are memoised in ``debug_data["outcode_cache"]``.
    Returns None when the request fails or no ``OUTCODE`` match has a
    ``displayName`` equal to *outcode*; misses are not cached.
    """
    cache = debug_data["outcode_cache"]
    if outcode in cache:
        return cache[outcode]

    query = {"query": outcode, "limit": "10", "exclude": "STREET"}
    payload = fetch_with_retry(client, TYPEAHEAD_URL, query)
    if not payload:
        return None

    hit = next(
        (
            candidate
            for candidate in payload.get("matches", [])
            if candidate.get("type") == "OUTCODE" and candidate.get("displayName") == outcode
        ),
        None,
    )
    if hit is None:
        log.debug("Outcode %s not found in typeahead results", outcode)
        return None

    rid = str(hit["id"])
    cache[outcode] = rid
    return rid
|
||||
|
||||
|
||||
def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Paginate through search results for one outcode+channel.

    Args:
        client: HTTP client used for the search requests.
        outcode_id: Rightmove's internal identifier for the outcode.
        outcode: The outcode itself, recorded on every property.
        channel_cfg: One entry of ``CHANNELS`` (channel, transactionType, sortType).
        pc_index: Spatial index used to attach the nearest postcode.

    Returns:
        The transformed properties gathered so far. Paging stops on a failed
        fetch, an empty page, the reported result count being reached, an
        unparseable result count, or after ``MAX_PAGES_PER_OUTCODE`` pages.
    """
    properties = []
    index = 0

    for page in range(MAX_PAGES_PER_OUTCODE):
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }

        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning("Failed to fetch page %d for %s/%s", page, outcode, channel_cfg["channel"])
            break

        debug_data["last_response"] = data

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        for prop in raw_props:
            transformed = transform_property(prop, outcode, pc_index)
            if transformed:
                properties.append(transformed)

        # Check if there are more pages. resultCount is parsed via str() so a
        # value like "1,234" or a plain integer both work; anything
        # unparseable ends paging instead of raising, which would discard the
        # pages already collected.
        raw_count = data.get("resultCount", "0")
        try:
            result_count = int(str(raw_count).replace(",", ""))
        except ValueError:
            log.warning("Unparseable resultCount %r for %s/%s", raw_count, outcode, channel_cfg["channel"])
            break
        index += PAGE_SIZE

        if index >= result_count:
            break

        if page < MAX_PAGES_PER_OUTCODE - 1:
            time.sleep(DELAY_BETWEEN_PAGES)

    return properties
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property transformation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def parse_display_size(display_size: str | None) -> float | None:
|
||||
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
|
||||
if not display_size:
|
||||
return None
|
||||
# Try sq. ft. first
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return round(sqft * 0.092903, 1)
|
||||
# Try sq. m.
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
|
||||
if m:
|
||||
return round(float(m.group(1).replace(",", "")), 1)
|
||||
return None
|
||||
|
||||
|
||||
def map_property_type(sub_type: str | None) -> str:
    """Translate a propertySubType into its canonical type.

    Missing or empty input maps to ``"Other"`` silently; a non-empty value
    absent from ``PROPERTY_TYPE_MAP`` also maps to ``"Other"`` but is logged
    as a warning.
    """
    if sub_type:
        mapped = PROPERTY_TYPE_MAP.get(sub_type)
        if mapped:
            return mapped
        log.warning("Unknown propertySubType: %r — mapping to Other", sub_type)
    return "Other"
|
||||
|
||||
|
||||
def extract_tenure(tenure_obj: dict | None) -> str | None:
|
||||
"""Extract tenure string from tenure object."""
|
||||
if not tenure_obj:
|
||||
return None
|
||||
tt = tenure_obj.get("tenureType", "")
|
||||
if tt == "FREEHOLD":
|
||||
return "Freehold"
|
||||
if tt == "LEASEHOLD":
|
||||
return "Leasehold"
|
||||
return None
|
||||
|
||||
|
||||
def fix_coords(lat: float, lng: float) -> tuple[float, float]:
    """Return ``(lat, lng)``, swapping the pair when it looks reversed.

    A pair is accepted when latitude lies in 49..56 and longitude in -7..2.
    If only the swapped pair fits those bounds it is swapped (debug log); if
    neither fits, the input is returned unchanged with a warning.
    """

    def _in_bounds(la: float, lo: float) -> bool:
        return 49 <= la <= 56 and -7 <= lo <= 2

    if _in_bounds(lat, lng):
        return lat, lng
    if _in_bounds(lng, lat):
        log.debug("Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f", lat, lng, lng, lat)
        return lng, lat
    log.warning("Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f", lat, lng)
    return lat, lng
|
||||
|
||||
|
||||
def normalize_price(amount: int, frequency: str) -> int:
    """Convert a price to a monthly figure.

    ``"weekly"`` amounts are multiplied by 52/12 and ``"yearly"`` amounts are
    divided by 12, both rounded; any other frequency returns *amount*
    unchanged.
    """
    if frequency == "weekly":
        monthly = round(amount * 52 / 12)
    elif frequency == "yearly":
        monthly = round(amount / 12)
    else:
        monthly = amount
    return monthly
|
||||
|
||||
|
||||
def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex) -> dict | None:
    """Transform a raw Rightmove property dict into our output schema.

    Args:
        prop: One entry of the search response's ``properties`` list.
        outcode: Outcode the property was found under.
        pc_index: Spatial index used to look up the nearest postcode.

    Returns:
        The output record, or None when the property has no location, no
        latitude/longitude, or no price amount.

    Optional sections (``price``, ``displayPrices``, ``keyFeatures``,
    ``listingUpdate``, ``propertyUrl``) are read with ``or`` fallbacks so that
    an explicit JSON null is treated like a missing key instead of raising.
    """
    loc = prop.get("location")
    if not loc:
        return None
    raw_lat = loc.get("latitude")
    raw_lng = loc.get("longitude")
    if raw_lat is None or raw_lng is None:
        return None

    lat, lng = fix_coords(raw_lat, raw_lng)

    price_obj = prop.get("price") or {}
    amount = price_obj.get("amount")
    if amount is None:
        return None
    frequency = price_obj.get("frequency", "")
    price = normalize_price(int(amount), frequency)

    display_prices = price_obj.get("displayPrices") or []
    price_qualifier = display_prices[0].get("displayPriceQualifier", "") if display_prices else ""

    sub_type = prop.get("propertySubType", "")
    bedrooms = prop.get("bedrooms", 0) or 0
    bathrooms = prop.get("bathrooms", 0) or 0

    # Keep only features that carry a non-empty description.
    key_features = [
        kf.get("description", "")
        for kf in (prop.get("keyFeatures") or [])
        if kf.get("description")
    ]

    listing_update = prop.get("listingUpdate") or {}
    update_date = listing_update.get("listingUpdateDate", "")

    postcode = pc_index.nearest(lat, lng)

    return {
        "id": prop.get("id"),
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "total_rooms": bedrooms + bathrooms,
        "longitude": lng,
        "latitude": lat,
        "postcode": postcode,
        "address": prop.get("displayAddress", ""),
        "tenure": extract_tenure(prop.get("tenure")),
        "property_type": map_property_type(sub_type),
        "property_sub_type": sub_type or "Unknown",
        "price": price,
        "price_frequency": frequency,
        "price_qualifier": price_qualifier,
        "floorspace_sqm": parse_display_size(prop.get("displaySize")),
        "url": RIGHTMOVE_BASE + (prop.get("propertyUrl") or ""),
        "features": key_features,
        "first_visible_date": prop.get("firstVisibleDate", ""),
        "update_date": update_date,
        "outcode": outcode,
        "house_share": sub_type == "House Share",
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parquet writing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def write_parquet(properties: list[dict], path: Path) -> None:
    """Write the property records to *path* as parquet via Polars.

    Nothing is written (only a warning is logged) when *properties* is empty.
    Columns appear in schema order and are cast to the listed dtypes.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    schema = {
        "id": pl.Int64,
        "bedrooms": pl.Int32,
        "bathrooms": pl.Int32,
        "total_rooms": pl.Int32,
        "longitude": pl.Float64,
        "latitude": pl.Float64,
        "postcode": pl.Utf8,
        "address": pl.Utf8,
        "tenure": pl.Utf8,
        "property_type": pl.Utf8,
        "property_sub_type": pl.Utf8,
        "price": pl.Int64,
        "price_frequency": pl.Utf8,
        "price_qualifier": pl.Utf8,
        "floorspace_sqm": pl.Float64,
        "url": pl.Utf8,
        "features": pl.List(pl.Utf8),
        "first_visible_date": pl.Utf8,
        "update_date": pl.Utf8,
        "outcode": pl.Utf8,
        "house_share": pl.Boolean,
    }
    # One column per schema field, pulled from every record in order.
    columns = {name: [record[name] for record in properties] for name in schema}
    frame = pl.DataFrame(columns, schema=schema)

    frame.write_parquet(path)
    log.info("Wrote %d properties to %s", len(frame), path)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scrape orchestration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def load_outcodes() -> list[str]:
    """Return the sorted, unique outcodes of England postcodes in the arcgis parquet.

    Rows are kept when ``ctry`` equals ``E92000001``; the outcode is the
    leading part of ``pcd`` matched by the regex below, and rows where it
    does not match are dropped.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    frame = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    england = frame.filter(pl.col("ctry") == "E92000001")
    log.info("England postcodes: %d", len(england))

    outcode_expr = pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode")
    extracted = england.select(outcode_expr).drop_nulls()
    outcodes = extracted.get_column("outcode").unique().sort().to_list()
    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes
|
||||
|
||||
|
||||
def build_postcode_index() -> PostcodeSpatialIndex:
    """Build the postcode spatial index from England rows of the arcgis parquet.

    Rows with a null ``lat`` or ``long`` are excluded.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    frame = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    located = frame.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])
    lats = located.get_column("lat").to_list()
    lngs = located.get_column("long").to_list()
    postcodes = located.get_column("pcd").to_list()
    return PostcodeSpatialIndex(lats, lngs, postcodes)
|
||||
|
||||
|
||||
def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Main scrape loop — runs in background thread.

    For each channel in ``CHANNELS`` every outcode is resolved and searched,
    results are de-duplicated by property id, and one parquet file per
    channel is written to ``DATA_DIR``. Progress and errors are published
    through the shared ``status`` object under ``status_lock``. An error in
    one outcode is recorded and the loop continues; any other error marks the
    whole scrape as ``"error"``.

    Args:
        outcodes: Outcodes to scrape; the caller's list is not modified.
        pc_index: Spatial index used to attach postcodes to properties.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0

    # Shuffle for geographic diversity. A private Random(SEED) gives the same
    # order as seeding the module RNG, without making the global RNG (used
    # for retry jitter elsewhere) deterministic.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)

    client = make_client()

    try:
        for channel_cfg in CHANNELS:
            channel_name = channel_cfg["channel"]
            file_suffix = "buy" if channel_name == "BUY" else "rent"
            all_properties: dict[int, dict] = {}  # dedup by id

            with status_lock:
                status.channel = channel_name
                status.outcodes_done = 0
                status.outcodes_total = len(shuffled)

            log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))

            for i, outcode in enumerate(shuffled):
                with status_lock:
                    status.outcode = outcode
                    status.outcodes_done = i

                log.debug("Outcode %s (%d/%d) — %d properties so far",
                          outcode, i + 1, len(shuffled), len(all_properties))

                try:
                    outcode_id = resolve_outcode_id(client, outcode)
                    if not outcode_id:
                        log.debug("No Rightmove ID for outcode %s, skipping", outcode)
                        continue

                    props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
                    for p in props:
                        # First occurrence wins when an id shows up again.
                        all_properties.setdefault(p["id"], p)

                    with status_lock:
                        if channel_name == "BUY":
                            status.properties_buy = len(all_properties)
                        else:
                            status.properties_rent = len(all_properties)

                    log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))

                except Exception as e:
                    # One bad outcode must not abort the run: record and move on.
                    msg = f"Error scraping {outcode}/{channel_name}: {e}"
                    log.error(msg)
                    with status_lock:
                        status.errors.append(msg)

                if i < len(shuffled) - 1:
                    time.sleep(DELAY_BETWEEN_OUTCODES)

            # Write parquet
            deduped = list(all_properties.values())
            output_path = DATA_DIR / f"rightmove_{file_suffix}.parquet"
            write_parquet(deduped, output_path)

            with status_lock:
                if channel_name == "BUY":
                    status.properties_buy = len(deduped)
                else:
                    status.properties_rent = len(deduped)
                status.outcodes_done = len(shuffled)

            log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))

        # Snapshot the totals under the lock so the summary log is consistent.
        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            elapsed = status.finished_at - status.started_at
            buy_count = status.properties_buy
            rent_count = status.properties_rent
        log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
                 elapsed, buy_count, rent_count)

    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
    finally:
        client.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Startup: load data
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -693,12 +88,18 @@ def get_status():
|
|||
@app.route("/debug")
def get_debug():
    """Return debugging information about the outcode cache as JSON.

    Reports the number of cached outcode -> Rightmove ID entries and a sample
    of up to 20 of them. Each key appears once, and only the imported
    ``outcode_cache`` is read.
    """
    sample = dict(list(outcode_cache.items())[:20])
    return jsonify({
        "outcode_cache_size": len(outcode_cache),
        "outcode_cache_sample": sample,
    })
|
||||
|
||||
|
||||
@app.route("/metrics")
def metrics():
    """Serve the Prometheus metrics exposition.

    The gauges are refreshed from the current scrape status while holding
    status_lock, then the latest metrics are rendered into the response.
    """
    with status_lock:
        # _sync_gauges reads the shared status object, so it runs under the lock.
        _sync_gauges()
    return Response(generate_latest(), mimetype=CONTENT_TYPE_LATEST)
|
||||
|
||||
|
||||
@app.route("/data/<filename>")
|
||||
def serve_data(filename):
|
||||
if not filename.endswith(".parquet"):
|
||||
|
|
|
|||
59
finder/metrics.py
Normal file
59
finder/metrics.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
from prometheus_client import Counter, Gauge
|
||||
|
||||
# ---------------------------------------------------------------------------
# Gauges — current scrape state, updated after each outcode
# ---------------------------------------------------------------------------

# One time series per state label; the series for the active state is 1.
scrape_state = Gauge(
    "scrape_state",
    "Current scrape state as a labeled gauge (1 = active)",
    ["state"],
)

# Progress within the channel currently being scraped.
scrape_outcodes_done = Gauge(
    "scrape_outcodes_done",
    "Outcodes processed in current channel",
)

scrape_outcodes_total = Gauge(
    "scrape_outcodes_total",
    "Total outcodes in current channel",
)

# Labeled per channel, so each channel reports its own running count.
scrape_properties_total = Gauge(
    "scrape_properties_total",
    "Properties found so far",
    ["channel"],
)

scrape_elapsed_seconds = Gauge(
    "scrape_elapsed_seconds",
    "Seconds since scrape started",
)

# ---------------------------------------------------------------------------
# Counters — monotonically increasing
# ---------------------------------------------------------------------------

# Labeled by response status and by which endpoint was called.
http_requests_total = Counter(
    "http_requests_total",
    "HTTP requests made by the scraper",
    ["status", "endpoint"],
)

# Labeled by the kind of transport failure.
http_errors_total = Counter(
    "http_errors_total",
    "HTTP connection/timeout errors",
    ["type"],
)

# Labeled by the outcome of each rotation attempt.
ip_rotations_total = Counter(
    "ip_rotations_total",
    "VPN IP rotation attempts",
    ["result"],
)

# Unlabeled: a single running count of outcodes that failed to scrape.
scrape_errors_total = Counter(
    "scrape_errors_total",
    "Per-outcode scrape errors",
)
|
||||
|
|
@ -6,4 +6,6 @@ dependencies = [
|
|||
"flask",
|
||||
"httpx",
|
||||
"polars",
|
||||
"fake-useragent>=2.2.0",
|
||||
"prometheus-client",
|
||||
]
|
||||
|
|
|
|||
86
finder/rightmove.py
Normal file
86
finder/rightmove.py
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
import logging
|
||||
import time
|
||||
|
||||
import httpx
|
||||
|
||||
from constants import (
|
||||
PAGE_SIZE,
|
||||
DELAY_BETWEEN_PAGES,
|
||||
SEARCH_URL,
|
||||
TYPEAHEAD_URL,
|
||||
)
|
||||
from http_client import fetch_with_retry
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import transform_property
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
# Outcode ID cache (Rightmove typeahead → internal ID)
|
||||
outcode_cache: dict[str, str] = {}
|
||||
|
||||
|
||||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Resolve an outcode to Rightmove's internal location ID.

    Cached IDs are returned straight away. Otherwise the typeahead endpoint is
    queried and the first match of type "OUTCODE" whose display name equals
    the outcode is cached and returned as a string.

    Returns:
        The ID as a string, or None when the request yields no data or no
        matching entry.
    """
    try:
        return outcode_cache[outcode]
    except KeyError:
        pass

    query = {"query": outcode, "limit": "10", "exclude": "STREET"}
    payload = fetch_with_retry(client, TYPEAHEAD_URL, query)
    if not payload:
        return None

    hit = next(
        (
            candidate
            for candidate in payload.get("matches", [])
            if candidate.get("type") == "OUTCODE" and candidate.get("displayName") == outcode
        ),
        None,
    )
    if hit is None:
        log.debug("Outcode %s not found in typeahead results", outcode)
        return None

    resolved = str(hit["id"])
    outcode_cache[outcode] = resolved
    return resolved
|
||||
|
||||
|
||||
def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Paginate through search results for one outcode+channel.

    Pages are requested PAGE_SIZE results at a time, sleeping
    DELAY_BETWEEN_PAGES between requests, until a page fails, comes back
    empty, or the reported result count has been reached.

    Args:
        client: HTTP client used for every page request.
        outcode_id: Rightmove location ID for the outcode.
        outcode: Outcode string, passed to transform_property and used in logs.
        channel_cfg: Mapping providing "sortType", "channel" and
            "transactionType" request parameters.
        pc_index: Spatial index passed through to transform_property.

    Returns:
        The transformed properties collected so far. A failed page or an
        unparseable result count ends pagination early; it does not raise.
    """
    properties: list[dict] = []
    index = 0

    while True:
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }

        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning("Failed to fetch index %d for %s/%s", index, outcode, channel_cfg["channel"])
            break

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        for prop in raw_props:
            transformed = transform_property(prop, outcode, pc_index)
            if transformed:
                properties.append(transformed)

        # Check if there are more pages. resultCount normally arrives as a
        # string with thousands separators ("1,234"), but coerce through str()
        # so a JSON number or null does not blow up on .replace().
        raw_count = data.get("resultCount")
        try:
            result_count = int(str(raw_count or 0).replace(",", ""))
        except ValueError:
            # Keep what has been collected instead of losing the whole outcode.
            log.warning(
                "Unparseable resultCount %r for %s/%s; stopping pagination",
                raw_count, outcode, channel_cfg["channel"],
            )
            break

        index += PAGE_SIZE
        if index >= result_count:
            break

        time.sleep(DELAY_BETWEEN_PAGES)

    return properties
|
||||
191
finder/scraper.py
Normal file
191
finder/scraper.py
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
import logging
|
||||
import random
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import polars as pl
|
||||
|
||||
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SEED
|
||||
from http_client import make_client
|
||||
from metrics import (
|
||||
scrape_elapsed_seconds,
|
||||
scrape_errors_total,
|
||||
scrape_outcodes_done,
|
||||
scrape_outcodes_total,
|
||||
scrape_properties_total,
|
||||
scrape_state,
|
||||
)
|
||||
from rightmove import resolve_outcode_id, search_outcode
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from storage import write_parquet
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
@dataclass
class ScrapeStatus:
    """Mutable snapshot of the scraper's progress, shared between threads."""

    state: str = "idle"  # idle | running | done | error
    channel: str = ""  # channel currently being scraped
    outcode: str = ""  # outcode currently being scraped
    outcodes_done: int = 0  # outcodes processed in the current channel
    outcodes_total: int = 0  # outcodes to process in the current channel
    properties_buy: int = 0  # unique properties collected for the BUY channel
    properties_rent: int = 0  # unique properties collected for the other channel
    errors: list[str] = field(default_factory=list)  # messages from failed outcodes / fatal errors
    started_at: float = 0.0  # time.time() at scrape start; 0.0 means not started
    finished_at: float = 0.0  # time.time() at scrape end; 0.0 means not finished


# Module-level singleton status; read and write it only while holding status_lock.
status = ScrapeStatus()
status_lock = threading.Lock()
|
||||
|
||||
|
||||
def _sync_gauges() -> None:
    """Mirror the current ScrapeStatus into the Prometheus gauges.

    The caller must already hold status_lock.
    """
    current = status.state
    for name in ("idle", "running", "done", "error"):
        # Exactly one state series is 1; the rest are 0.
        scrape_state.labels(state=name).set(int(current == name))

    scrape_outcodes_done.set(status.outcodes_done)
    scrape_outcodes_total.set(status.outcodes_total)

    per_channel = (("buy", status.properties_buy), ("rent", status.properties_rent))
    for channel, count in per_channel:
        scrape_properties_total.labels(channel=channel).set(count)

    # Elapsed runs to "now" while the scrape is in flight and freezes at the
    # finish time afterwards; it stays 0 before the first start.
    elapsed = 0
    if status.started_at:
        elapsed = (status.finished_at or time.time()) - status.started_at
    scrape_elapsed_seconds.set(elapsed)
|
||||
|
||||
|
||||
def load_outcodes() -> list[str]:
    """Load England-only outcodes from arcgis parquet.

    Reads the postcode and country columns from ARCGIS_PATH, keeps rows whose
    country code is "E92000001", and extracts the leading outcode from each
    postcode with a regex.

    Returns:
        Sorted list of unique outcodes; postcodes that do not match the
        outcode pattern are dropped.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    # Only pcd and ctry are used here, so the coordinate columns are not read.
    df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry"])
    england = df.filter(pl.col("ctry") == "E92000001")
    log.info("England postcodes: %d", len(england))

    outcodes = (
        england.select(pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode"))
        .drop_nulls()
        .get_column("outcode")
        .unique()
        .sort()
        .to_list()
    )
    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes
|
||||
|
||||
|
||||
def build_postcode_index() -> PostcodeSpatialIndex:
    """Create the postcode spatial index from the arcgis parquet file.

    Only rows with country code "E92000001" and non-null coordinates are
    indexed.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    wanted = ["pcd", "ctry", "lat", "long"]
    frame = pl.read_parquet(ARCGIS_PATH, columns=wanted)

    is_england = pl.col("ctry") == "E92000001"
    located = frame.filter(is_england).drop_nulls(subset=["lat", "long"])

    lats, lngs, postcodes = (
        located.get_column(name).to_list() for name in ("lat", "long", "pcd")
    )
    return PostcodeSpatialIndex(lats, lngs, postcodes)
|
||||
|
||||
|
||||
def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Main scrape loop — runs in background thread.

    For each channel in CHANNELS this walks every outcode in a seeded shuffle
    order, resolves its Rightmove ID, collects the transformed search results
    de-duplicated by property id, and writes them to
    DATA_DIR / "rightmove_<buy|rent>.parquet". Progress and errors are
    recorded on the module-level ``status`` under ``status_lock`` and mirrored
    to the Prometheus gauges.

    A failure on a single outcode is logged, counted and recorded, and the
    loop moves on. Any other exception marks the run as "error".

    Args:
        outcodes: Outcodes to scrape; the list itself is not modified.
        pc_index: Spatial index passed through to search_outcode.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        # Clear the previous run's end time; otherwise _sync_gauges would
        # measure elapsed time against a stale finished_at on a re-run.
        status.finished_at = 0.0
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0
        _sync_gauges()

    # Shuffle for geographic diversity. A private Random seeded with SEED gives
    # the same deterministic order as seeding the module-level generator, but
    # leaves the process-wide random state untouched.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)

    # Created inside the try so a client-construction failure is reported as
    # an "error" state instead of leaving the status stuck on "running".
    client = None

    try:
        client = make_client()

        for channel_cfg in CHANNELS:
            channel_name = channel_cfg["channel"]
            file_suffix = "buy" if channel_name == "BUY" else "rent"
            all_properties: dict[int, dict] = {}  # dedup by id

            with status_lock:
                status.channel = channel_name
                status.outcodes_done = 0
                status.outcodes_total = len(shuffled)

            log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))

            for i, outcode in enumerate(shuffled):
                with status_lock:
                    status.outcode = outcode
                    status.outcodes_done = i

                log.debug("Outcode %s (%d/%d) — %d properties so far",
                          outcode, i + 1, len(shuffled), len(all_properties))

                try:
                    outcode_id = resolve_outcode_id(client, outcode)
                    if not outcode_id:
                        log.debug("No Rightmove ID for outcode %s, skipping", outcode)
                        continue

                    props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
                    for p in props:
                        # First occurrence wins when a listing shows up under
                        # more than one outcode.
                        all_properties.setdefault(p["id"], p)

                    with status_lock:
                        if channel_name == "BUY":
                            status.properties_buy = len(all_properties)
                        else:
                            status.properties_rent = len(all_properties)
                        _sync_gauges()

                    log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))

                except Exception as e:
                    # One bad outcode must not abort the whole channel.
                    msg = f"Error scraping {outcode}/{channel_name}: {e}"
                    log.error(msg)
                    scrape_errors_total.inc()
                    with status_lock:
                        status.errors.append(msg)

                if i < len(shuffled) - 1:
                    time.sleep(DELAY_BETWEEN_OUTCODES)

            # Write parquet
            deduped = list(all_properties.values())
            output_path = DATA_DIR / f"rightmove_{file_suffix}.parquet"
            write_parquet(deduped, output_path)

            with status_lock:
                if channel_name == "BUY":
                    status.properties_buy = len(deduped)
                else:
                    status.properties_rent = len(deduped)
                status.outcodes_done = len(shuffled)
                _sync_gauges()

            log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))

        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            _sync_gauges()
            # Snapshot the values under the lock so the summary log is consistent.
            elapsed = status.finished_at - status.started_at
            buy_count = status.properties_buy
            rent_count = status.properties_rent
        log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
                 elapsed, buy_count, rent_count)

    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
            _sync_gauges()
    finally:
        if client is not None:
            client.close()
|
||||
33
finder/spatial.py
Normal file
33
finder/spatial.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
import logging
|
||||
import math
|
||||
from collections import defaultdict
|
||||
|
||||
from constants import GRID_CELL_SIZE
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
class PostcodeSpatialIndex:
    """Grid-based spatial index over arcgis postcodes for nearest-lookup.

    Points are bucketed into square cells of GRID_CELL_SIZE degrees. A lookup
    only inspects the query's cell and its eight neighbours, so a point with
    no postcode in that 3x3 window yields None.
    """

    def __init__(self, lats: list[float], lngs: list[float], postcodes: list[str]):
        """Bucket each (lat, lng, postcode) triple into its grid cell."""
        self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(list)
        for lat, lng, pcd in zip(lats, lngs, postcodes):
            self.grid[self._cell(lat, lng)].append((lat, lng, pcd))
        log.info("Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats))

    @staticmethod
    def _cell(lat: float, lng: float) -> tuple[int, int]:
        """Return the (x, y) grid cell containing the given coordinates."""
        return (
            int(math.floor(lng / GRID_CELL_SIZE)),
            int(math.floor(lat / GRID_CELL_SIZE)),
        )

    def nearest(self, lat: float, lng: float) -> str | None:
        """Return the postcode closest to (lat, lng), or None if none is nearby.

        Distance is a squared equirectangular approximation: the longitude
        difference is scaled by cos(latitude), because a degree of longitude
        is shorter on the ground than a degree of latitude away from the
        equator. Comparing raw degree differences would favour postcodes that
        lie north/south over closer ones that lie east/west.
        """
        gx, gy = self._cell(lat, lng)
        lng_scale = math.cos(math.radians(lat))
        best_dist = float("inf")
        best_pcd = None
        for dx in range(-1, 2):
            for dy in range(-1, 2):
                # .get() avoids creating empty cells in the defaultdict.
                for plat, plng, pcd in self.grid.get((gx + dx, gy + dy), []):
                    d = (plat - lat) ** 2 + ((plng - lng) * lng_scale) ** 2
                    if d < best_dist:
                        best_dist = d
                        best_pcd = pcd
        return best_pcd
|
||||
65
finder/storage.py
Normal file
65
finder/storage.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
def write_parquet(properties: list[dict], path: Path) -> None:
    """Persist the property records to a parquet file via Polars.

    Nothing is written when the list is empty; a warning is logged instead.
    Every record must provide all keys named in the schema below.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    # Column name -> Polars dtype, in output column order.
    schema = {
        "id": pl.Int64,
        "bedrooms": pl.Int32,
        "bathrooms": pl.Int32,
        "total_rooms": pl.Int32,
        "longitude": pl.Float64,
        "latitude": pl.Float64,
        "postcode": pl.Utf8,
        "address": pl.Utf8,
        "tenure": pl.Utf8,
        "property_type": pl.Utf8,
        "property_sub_type": pl.Utf8,
        "price": pl.Int64,
        "price_frequency": pl.Utf8,
        "price_qualifier": pl.Utf8,
        "floorspace_sqm": pl.Float64,
        "url": pl.Utf8,
        "features": pl.List(pl.Utf8),
        "first_visible_date": pl.Utf8,
        "update_date": pl.Utf8,
        "outcode": pl.Utf8,
        "house_share": pl.Boolean,
    }

    # Pivot the row dicts into one list per column.
    columns = {name: [record[name] for record in properties] for name in schema}
    df = pl.DataFrame(columns, schema=schema)

    df.write_parquet(path)
    log.info("Wrote %d properties to %s", len(df), path)
|
||||
124
finder/transform.py
Normal file
124
finder/transform.py
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
import logging
|
||||
import re
|
||||
|
||||
from constants import PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
|
||||
from spatial import PostcodeSpatialIndex
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
def parse_display_size(display_size: str | None) -> float | None:
|
||||
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
|
||||
if not display_size:
|
||||
return None
|
||||
# Try sq. ft. first
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return round(sqft * 0.092903, 1)
|
||||
# Try sq. m.
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
|
||||
if m:
|
||||
return round(float(m.group(1).replace(",", "")), 1)
|
||||
return None
|
||||
|
||||
|
||||
def map_property_type(sub_type: str | None) -> str:
|
||||
"""Map propertySubType to canonical type."""
|
||||
if not sub_type:
|
||||
return "Other"
|
||||
canonical = PROPERTY_TYPE_MAP.get(sub_type)
|
||||
if canonical:
|
||||
return canonical
|
||||
log.warning("Unknown propertySubType: %r — mapping to Other", sub_type)
|
||||
return "Other"
|
||||
|
||||
|
||||
def extract_tenure(tenure_obj: dict | None) -> str | None:
|
||||
"""Extract tenure string from tenure object."""
|
||||
if not tenure_obj:
|
||||
return None
|
||||
tt = tenure_obj.get("tenureType", "")
|
||||
if tt == "FREEHOLD":
|
||||
return "Freehold"
|
||||
if tt == "LEASEHOLD":
|
||||
return "Leasehold"
|
||||
return None
|
||||
|
||||
|
||||
def fix_coords(lat: float, lng: float) -> tuple[float, float]:
    """Return (lat, lng), swapping the pair when it looks reversed.

    The plausibility box is lat 49..56 and lng -7..2. Coordinates that fit
    neither as given nor swapped are returned unchanged, with a warning.
    """

    def plausible(north: float, east: float) -> bool:
        return 49 <= north <= 56 and -7 <= east <= 2

    if plausible(lat, lng):
        return lat, lng

    if not plausible(lng, lat):
        log.warning("Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f", lat, lng)
        return lat, lng

    log.debug("Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f", lat, lng, lng, lat)
    return lng, lat
|
||||
|
||||
|
||||
def normalize_price(amount: int, frequency: str) -> int:
    """Convert a rental price to a monthly figure.

    "weekly" amounts are multiplied by 52/12 and "yearly" amounts divided by
    12, both rounded to the nearest integer. Any other frequency returns the
    amount untouched.
    """
    if frequency not in ("weekly", "yearly"):
        return amount

    monthly = amount * 52 / 12 if frequency == "weekly" else amount / 12
    return round(monthly)
|
||||
|
||||
|
||||
def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex) -> dict | None:
    """Transform a raw Rightmove property dict into our output schema.

    Args:
        prop: One entry from the search response's "properties" list.
        outcode: Outcode the listing was found under; copied to the output.
        pc_index: Spatial index used to look up the nearest postcode.

    Returns:
        The output record, or None when the listing has no id, no usable
        location, or no price amount.
    """
    # Fields that are present but null in the JSON come back as None from
    # .get(), so "or {}" / "or []" is used wherever the value is dereferenced.
    prop_id = prop.get("id")
    if prop_id is None:
        # Records are de-duplicated by id downstream; id-less listings would
        # all collapse onto the same key.
        return None

    loc = prop.get("location")
    if not loc:
        return None
    raw_lat = loc.get("latitude")
    raw_lng = loc.get("longitude")
    if raw_lat is None or raw_lng is None:
        return None

    lat, lng = fix_coords(raw_lat, raw_lng)

    price_obj = prop.get("price") or {}
    amount = price_obj.get("amount")
    if amount is None:
        return None
    frequency = price_obj.get("frequency", "")
    price = normalize_price(int(amount), frequency)

    display_prices = price_obj.get("displayPrices") or []
    price_qualifier = display_prices[0].get("displayPriceQualifier", "") if display_prices else ""

    sub_type = prop.get("propertySubType", "")
    bedrooms = prop.get("bedrooms", 0) or 0
    bathrooms = prop.get("bathrooms", 0) or 0

    # Keep only features that have a non-empty description.
    key_features = [
        kf.get("description", "")
        for kf in (prop.get("keyFeatures") or [])
        if kf.get("description")
    ]

    listing_update = prop.get("listingUpdate") or {}
    update_date = listing_update.get("listingUpdateDate", "")

    postcode = pc_index.nearest(lat, lng)

    return {
        "id": prop_id,
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "total_rooms": bedrooms + bathrooms,
        "longitude": lng,
        "latitude": lat,
        "postcode": postcode,
        "address": prop.get("displayAddress", ""),
        "tenure": extract_tenure(prop.get("tenure")),
        "property_type": map_property_type(sub_type),
        "property_sub_type": sub_type or "Unknown",
        "price": price,
        "price_frequency": frequency,
        "price_qualifier": price_qualifier,
        "floorspace_sqm": parse_display_size(prop.get("displaySize")),
        "url": RIGHTMOVE_BASE + (prop.get("propertyUrl") or ""),
        "features": key_features,
        "first_visible_date": prop.get("firstVisibleDate", ""),
        "update_date": update_date,
        "outcode": outcode,
        "house_share": sub_type == "House Share",
    }
|
||||
Loading…
Add table
Add a link
Reference in a new issue