Changes
This commit is contained in:
parent
3a3f899ea2
commit
128b3191e7
68 changed files with 28060 additions and 1152 deletions
11
finder/Dockerfile
Normal file
11
finder/Dockerfile
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
FROM python:3.12-slim
|
||||
|
||||
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||
|
||||
WORKDIR /app
|
||||
COPY pyproject.toml ./
|
||||
RUN uv pip install --system -r pyproject.toml
|
||||
|
||||
COPY main.py ./
|
||||
|
||||
CMD ["python3", "main.py"]
|
||||
710
finder/main.py
Normal file
710
finder/main.py
Normal file
|
|
@ -0,0 +1,710 @@
|
|||
import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
from flask import Flask, jsonify, send_from_directory
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Directory that receives the log file; created at import time if missing.
LOG_DIR = Path("/app/data")
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Root logger: everything at DEBUG and above goes to both stderr and a file.
_log_handlers = [
    logging.StreamHandler(),
    logging.FileHandler(LOG_DIR / "rightmove.log"),
]
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=_log_handlers,
)

# Module logger used throughout this file.
log = logging.getLogger("rightmove")
log.setLevel(logging.DEBUG)

# The HTTP client libraries are chatty at DEBUG/INFO; only keep their warnings.
for _noisy_logger in ("httpx", "httpcore"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Input parquet with postcode data; overridable through the environment.
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
# Output directory for the generated parquet files.
DATA_DIR = Path("/app/data")

# Paging: the search index advances by PAGE_SIZE per request.
PAGE_SIZE = 24
MAX_PAGES_PER_OUTCODE = 42  # 24*42 = 1008, safety cap per outcode

# Politeness delays (seconds).
DELAY_BETWEEN_PAGES = 1.0
DELAY_BETWEEN_OUTCODES = 2.0

# Retry policy: delay = RETRY_BASE_DELAY * 2**attempt (+ jitter), in seconds.
MAX_RETRIES = 3
RETRY_BASE_DELAY = 2.0

GRID_CELL_SIZE = 0.01  # degrees for postcode spatial index
SEED = 42  # seed for the deterministic outcode shuffle

TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"

# propertySubType -> canonical property type written to the output.
# Sub-types missing from this table are mapped to "Other" with a warning.
PROPERTY_TYPE_MAP = {
    "Detached": "Detached",
    "Semi-Detached": "Semi-Detached",
    "Terraced": "Terraced",
    "End of Terrace": "Terraced",
    "Mid Terrace": "Terraced",
    "Flat": "Flat",
    "Maisonette": "Flat",
    "Studio": "Flat",
    "Apartment": "Flat",
    "Penthouse": "Flat",
    "Ground Flat": "Flat",
    "Detached Bungalow": "Detached",
    "Semi-Detached Bungalow": "Semi-Detached",
    "Town House": "Terraced",
    "Link Detached": "Detached",
    "Link Detached House": "Detached",
    "Bungalow": "Other",
    "Cottage": "Other",
    "Park Home": "Other",
    "Land": "Other",
    "Farm / Barn": "Other",
    "House": "Detached",
    "Not Specified": "Other",
    "Chalet": "Other",
    "Barn Conversion": "Other",
    "Coach House": "Other",
    "Character Property": "Other",
    "Cluster House": "Other",
    "Retirement Property": "Flat",
    "Plot": "Other",
    "Garages": "Other",
    "Mews": "Terraced",
    # "House Share" is a known sub-type (it drives the `house_share` output
    # flag), so list it explicitly instead of logging it as unknown each time.
    "House Share": "Other",
}

# One entry per scraped channel; the values are passed straight through as
# query parameters of the search endpoint.
CHANNELS = [
    {"channel": "BUY", "transactionType": "BUY", "sortType": "2"},
    {"channel": "RENT", "transactionType": "LETTING", "sortType": "6"},
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Postcode spatial index
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class PostcodeSpatialIndex:
    """Grid-based spatial index over arcgis postcodes for nearest-lookup.

    Points are bucketed into square cells of ``GRID_CELL_SIZE`` degrees.
    A lookup only inspects the query's cell and its eight neighbours, so a
    query with no indexed postcode in that 3x3 window yields ``None``.
    """

    def __init__(self, lats: list[float], lngs: list[float], postcodes: list[str]):
        """Bucket every ``(lat, lng, postcode)`` triple into its grid cell.

        The three lists are consumed in lockstep (``zip``), so they are
        expected to have equal length.
        """
        self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(list)
        for lat, lng, pcd in zip(lats, lngs, postcodes):
            self.grid[self._cell(lat, lng)].append((lat, lng, pcd))
        log.info("Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats))

    @staticmethod
    def _cell(lat: float, lng: float) -> tuple[int, int]:
        """Return the ``(x, y)`` grid cell containing the given coordinate."""
        return (
            int(math.floor(lng / GRID_CELL_SIZE)),
            int(math.floor(lat / GRID_CELL_SIZE)),
        )

    def nearest(self, lat: float, lng: float) -> str | None:
        """Return the postcode closest to ``(lat, lng)``, or ``None``.

        Distances use an equirectangular approximation: longitude differences
        are scaled by ``cos(lat)`` because a degree of longitude is shorter
        than a degree of latitude away from the equator. Comparing raw degree
        differences would over-weight east-west separation.
        """
        gx, gy = self._cell(lat, lng)
        lng_scale = math.cos(math.radians(lat))
        best_dist = float("inf")
        best_pcd = None
        for dx in range(-1, 2):
            for dy in range(-1, 2):
                # .get() avoids creating empty buckets in the defaultdict.
                for plat, plng, pcd in self.grid.get((gx + dx, gy + dy), []):
                    d = (plat - lat) ** 2 + ((plng - lng) * lng_scale) ** 2
                    if d < best_dist:
                        best_dist = d
                        best_pcd = pcd
        return best_pcd
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scrape status
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class ScrapeStatus:
    """Progress snapshot of a scrape run, as reported by the status endpoint."""

    # Lifecycle: idle | running | done | error
    state: str = "idle"

    # What is being processed right now.
    channel: str = ""
    outcode: str = ""

    # Outcode progress within the current channel.
    outcodes_done: int = 0
    outcodes_total: int = 0

    # Unique properties collected per channel.
    properties_buy: int = 0
    properties_rent: int = 0

    # Human-readable error messages accumulated during the run.
    errors: list[str] = field(default_factory=list)

    # time.time() timestamps; 0.0 means "not set".
    started_at: float = 0.0
    finished_at: float = 0.0
|
||||
|
||||
|
||||
# Process-wide scrape state; guard every read/write with `status_lock`.
status_lock = threading.Lock()
status = ScrapeStatus()

# Diagnostics exposed by the /debug endpoint: the most recent raw search
# response and the outcode -> Rightmove id lookups resolved so far.
debug_data: dict = {
    "last_response": None,
    "outcode_cache": {},
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTTP helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Browser-like User-Agent sent with every scrape request.
USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    )
)

# Gluetun control API, which listens on port 8000 inside the gluetun container.
# finder runs with network_mode: service:gluetun, so localhost here is gluetun.
GLUETUN_API = "http://127.0.0.1:8000"

# Serialises IP rotations so only one VPN restart happens at a time.
_ip_rotate_lock = threading.Lock()
|
||||
|
||||
|
||||
def rotate_ip() -> bool:
    """Ask gluetun to restart the VPN so we (hopefully) get a new public IP.

    The VPN is stopped and started again through the gluetun control API, then
    the public-IP endpoint is polled until it reports an address different
    from the one seen before the restart.

    Returns:
        True if a different public IP was observed; False if a control call
        failed, the poll window elapsed, or any unexpected error occurred.
    """
    poll_attempts = 30
    poll_interval = 2  # seconds between public-IP polls
    status_url = f"{GLUETUN_API}/v1/vpn/status"
    ip_url = f"{GLUETUN_API}/v1/publicip/ip"

    # Hold the lock for the entire rotation so concurrent callers queue up
    # instead of restarting the VPN several times at once.
    with _ip_rotate_lock:
        log.info("Rotating VPN IP via gluetun...")
        try:
            with httpx.Client(timeout=10) as ctl:
                # Remember the current IP so a change can be detected later.
                old_ip_resp = ctl.get(ip_url)
                old_ip = (
                    old_ip_resp.json().get("public_ip", "unknown")
                    if old_ip_resp.status_code == 200
                    else "unknown"
                )
                log.info("Current IP: %s", old_ip)

                # Stop the VPN, pause briefly, then start it again.
                # NOTE(review): presumably gluetun picks a different server
                # on restart; confirm against the gluetun configuration.
                resp = ctl.put(status_url, json={"status": "stopped"})
                if resp.status_code != 200:
                    log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text)
                    return False
                time.sleep(2)

                resp = ctl.put(status_url, json={"status": "running"})
                if resp.status_code != 200:
                    log.error("Failed to start VPN: %d %s", resp.status_code, resp.text)
                    return False

            # Wait for reconnection.
            for attempt in range(poll_attempts):
                time.sleep(poll_interval)
                try:
                    with httpx.Client(timeout=10) as ctl:
                        new_ip_resp = ctl.get(ip_url)
                    if new_ip_resp.status_code == 200:
                        new_ip = new_ip_resp.json().get("public_ip", "")
                        if new_ip and new_ip != old_ip:
                            log.info("IP rotated: %s → %s", old_ip, new_ip)
                            return True
                except Exception as exc:
                    # Best effort: failures are expected while the VPN is
                    # still reconnecting, but keep a trace instead of
                    # swallowing them silently.
                    log.debug(
                        "IP poll %d/%d failed (VPN still reconnecting?): %s",
                        attempt + 1,
                        poll_attempts,
                        exc,
                    )

            log.warning("IP rotation timed out (may still be same IP)")
            return False

        except Exception as e:
            log.error("IP rotation failed: %s", e)
            return False
|
||||
|
||||
|
||||
def make_client() -> httpx.Client:
    """Create the HTTP client used for all scrape requests.

    The client sends the browser-like ``USER_AGENT``, asks for JSON, follows
    redirects, and applies a 30-second timeout.
    """
    default_headers = {"User-Agent": USER_AGENT, "Accept": "application/json"}
    return httpx.Client(timeout=30, headers=default_headers, follow_redirects=True)
|
||||
|
||||
|
||||
def fetch_with_retry(
    client: httpx.Client, url: str, params: dict | None = None, on_403: bool = True
) -> dict | None:
    """GET JSON with retries on 429/5xx/transport errors.

    Args:
        client: HTTP client to issue the request with.
        url: Endpoint to fetch.
        params: Optional query parameters.
        on_403: When True, an HTTP 403 triggers an IP rotation followed by one
            fresh round of attempts (with ``on_403=False`` so a second 403 is
            not rotated again).

    Returns:
        The decoded JSON body, or None on permanent failure: a non-retryable
        status, a 200 response whose body is not valid JSON, a failed IP
        rotation, or exhausted retries.
    """

    def _backoff_delay(attempt: int) -> float:
        # Exponential backoff with up to one second of jitter.
        return RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)

    for attempt in range(MAX_RETRIES):
        try:
            resp = client.get(url, params=params)
        except httpx.TransportError as e:
            # TransportError is the common base of httpx's timeout, network
            # and protocol errors, so this also covers ConnectTimeout,
            # ReadError and RemoteProtocolError (e.g. pooled connections that
            # died during a VPN restart).
            delay = _backoff_delay(attempt)
            log.warning("%s from %s, retry %d/%d in %.1fs", type(e).__name__, url, attempt + 1, MAX_RETRIES, delay)
            time.sleep(delay)
            continue

        if resp.status_code == 200:
            try:
                return resp.json()
            except ValueError:
                # A 200 with a non-JSON body (e.g. an HTML page) is treated
                # as a permanent failure rather than raised to the caller.
                log.error("HTTP 200 from %s but body is not valid JSON", url)
                return None

        if resp.status_code == 403 and on_403:
            log.warning("HTTP 403 — IP likely blocked, rotating...")
            if rotate_ip():
                # Retry once with new IP (but don't recurse on 403 again)
                return fetch_with_retry(client, url, params, on_403=False)
            log.error("IP rotation failed, giving up on %s", url)
            return None

        if resp.status_code in (429, 500, 502, 503, 504):
            delay = _backoff_delay(attempt)
            log.warning("HTTP %d from %s, retry %d/%d in %.1fs", resp.status_code, url, attempt + 1, MAX_RETRIES, delay)
            time.sleep(delay)
            continue

        log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
        return None

    log.error("All %d retries exhausted for %s", MAX_RETRIES, url)
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Rightmove API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Return Rightmove's internal id for ``outcode`` via the typeahead API.

    Successful lookups are memoised in ``debug_data["outcode_cache"]``.
    Returns None when the request fails or when no match has type "OUTCODE"
    and a display name equal to ``outcode``.
    """
    cache = debug_data["outcode_cache"]
    if outcode in cache:
        return cache[outcode]

    query = {"query": outcode, "limit": "10", "exclude": "STREET"}
    payload = fetch_with_retry(client, TYPEAHEAD_URL, query)
    if not payload:
        return None

    for candidate in payload.get("matches", []):
        # Skip anything that is not the exact outcode we asked for.
        if candidate.get("type") != "OUTCODE" or candidate.get("displayName") != outcode:
            continue
        rightmove_id = str(candidate["id"])
        cache[outcode] = rightmove_id
        return rightmove_id

    log.debug("Outcode %s not found in typeahead results", outcode)
    return None
|
||||
|
||||
|
||||
def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Paginate through search results for one outcode+channel.

    Args:
        client: HTTP client used for the requests.
        outcode_id: Rightmove's internal id for the outcode.
        outcode: The outcode itself (used for logging and stored on each row).
        channel_cfg: One entry of ``CHANNELS`` (channel/transactionType/sortType).
        pc_index: Spatial index used to attach the nearest postcode.

    Returns:
        The transformed properties collected before paging stopped. Paging
        stops on a failed fetch, an empty page, reaching the reported result
        count, or the ``MAX_PAGES_PER_OUTCODE`` cap.
    """
    properties = []
    index = 0

    for page in range(MAX_PAGES_PER_OUTCODE):
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }

        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning("Failed to fetch page %d for %s/%s", page, outcode, channel_cfg["channel"])
            break

        # Keep the latest raw response around for the /debug endpoint.
        debug_data["last_response"] = data

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        for prop in raw_props:
            transformed = transform_property(prop, outcode, pc_index)
            if transformed:
                properties.append(transformed)

        # Check if there are more pages. The count may carry thousands
        # separators ("1,234"); str() also tolerates a numeric JSON value.
        raw_count = data.get("resultCount", "0")
        try:
            result_count = int(str(raw_count).replace(",", ""))
        except ValueError:
            # Unknown total: keep paging until an empty page or the page cap
            # instead of discarding what was already collected.
            log.warning("Unparseable resultCount %r for %s/%s", raw_count, outcode, channel_cfg["channel"])
            result_count = None
        index += PAGE_SIZE

        if result_count is not None and index >= result_count:
            break

        # No need to wait after the final permitted page.
        if page < MAX_PAGES_PER_OUTCODE - 1:
            time.sleep(DELAY_BETWEEN_PAGES)

    return properties
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property transformation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def parse_display_size(display_size: str | None) -> float | None:
|
||||
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
|
||||
if not display_size:
|
||||
return None
|
||||
# Try sq. ft. first
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return round(sqft * 0.092903, 1)
|
||||
# Try sq. m.
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
|
||||
if m:
|
||||
return round(float(m.group(1).replace(",", "")), 1)
|
||||
return None
|
||||
|
||||
|
||||
def map_property_type(sub_type: str | None) -> str:
|
||||
"""Map propertySubType to canonical type."""
|
||||
if not sub_type:
|
||||
return "Other"
|
||||
canonical = PROPERTY_TYPE_MAP.get(sub_type)
|
||||
if canonical:
|
||||
return canonical
|
||||
log.warning("Unknown propertySubType: %r — mapping to Other", sub_type)
|
||||
return "Other"
|
||||
|
||||
|
||||
def extract_tenure(tenure_obj: dict | None) -> str | None:
|
||||
"""Extract tenure string from tenure object."""
|
||||
if not tenure_obj:
|
||||
return None
|
||||
tt = tenure_obj.get("tenureType", "")
|
||||
if tt == "FREEHOLD":
|
||||
return "Freehold"
|
||||
if tt == "LEASEHOLD":
|
||||
return "Leasehold"
|
||||
return None
|
||||
|
||||
|
||||
def fix_coords(lat: float, lng: float) -> tuple[float, float]:
    """Return ``(lat, lng)``, swapping the pair if it looks reversed.

    The plausibility window is lat 49..56 and lng -7..2 (inclusive). A pair
    that fits the window is returned unchanged; a pair that fits only when
    swapped is returned swapped; anything else is returned unchanged with a
    warning.
    """

    def _plausible(latitude: float, longitude: float) -> bool:
        return 49 <= latitude <= 56 and -7 <= longitude <= 2

    if _plausible(lat, lng):
        return lat, lng

    if _plausible(lng, lat):
        log.debug("Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f", lat, lng, lng, lat)
        return lng, lat

    log.warning("Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f", lat, lng)
    return lat, lng
|
||||
|
||||
|
||||
def normalize_price(amount: int, frequency: str) -> int:
|
||||
"""Normalize price to monthly for rentals (weekly × 52/12, yearly ÷ 12)."""
|
||||
if frequency == "weekly":
|
||||
return round(amount * 52 / 12)
|
||||
if frequency == "yearly":
|
||||
return round(amount / 12)
|
||||
return amount
|
||||
|
||||
|
||||
def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex) -> dict | None:
    """Transform a raw Rightmove property dict into our output schema.

    Args:
        prop: One element of the search response's ``properties`` list.
        outcode: The outcode the property was found under.
        pc_index: Spatial index used to look up the nearest postcode.

    Returns:
        A flat dict with the output columns, or None when the listing has no
        location, no coordinates, or no price amount.
    """
    loc = prop.get("location")
    if not loc:
        return None
    raw_lat = loc.get("latitude")
    raw_lng = loc.get("longitude")
    if raw_lat is None or raw_lng is None:
        return None

    lat, lng = fix_coords(raw_lat, raw_lng)

    # `or {}` / `or []` below guard against keys that are present but null:
    # dict.get's default only applies when the key is missing, and a single
    # malformed listing should not abort the whole outcode.
    price_obj = prop.get("price") or {}
    amount = price_obj.get("amount")
    if amount is None:
        return None
    frequency = price_obj.get("frequency", "")
    price = normalize_price(int(amount), frequency)

    display_prices = price_obj.get("displayPrices") or []
    price_qualifier = display_prices[0].get("displayPriceQualifier", "") if display_prices else ""

    sub_type = prop.get("propertySubType", "")
    bedrooms = prop.get("bedrooms", 0) or 0
    bathrooms = prop.get("bathrooms", 0) or 0

    # Keep only features that actually carry a description.
    key_features = [
        kf["description"]
        for kf in (prop.get("keyFeatures") or [])
        if kf.get("description")
    ]

    listing_update = prop.get("listingUpdate") or {}
    update_date = listing_update.get("listingUpdateDate", "")

    postcode = pc_index.nearest(lat, lng)

    return {
        "id": prop.get("id"),
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "total_rooms": bedrooms + bathrooms,
        "longitude": lng,
        "latitude": lat,
        "postcode": postcode,
        "address": prop.get("displayAddress", ""),
        "tenure": extract_tenure(prop.get("tenure")),
        "property_type": map_property_type(sub_type),
        "property_sub_type": sub_type or "Unknown",
        "price": price,
        "price_frequency": frequency,
        "price_qualifier": price_qualifier,
        "floorspace_sqm": parse_display_size(prop.get("displaySize")),
        "url": RIGHTMOVE_BASE + (prop.get("propertyUrl") or ""),
        "features": key_features,
        "first_visible_date": prop.get("firstVisibleDate", ""),
        "update_date": update_date,
        "outcode": outcode,
        "house_share": sub_type == "House Share",
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parquet writing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def write_parquet(properties: list[dict], path: Path) -> None:
    """Write properties list to parquet using Polars.

    Nothing is written when ``properties`` is empty. The file is first written
    next to ``path`` under a ``.tmp`` suffix and then renamed over it, so a
    reader of ``path`` never observes a partially written file.

    Args:
        properties: Rows as produced by ``transform_property``; every row must
            contain all schema columns.
        path: Destination parquet file.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    # Single source of truth for column names, order and dtypes.
    schema = {
        "id": pl.Int64,
        "bedrooms": pl.Int32,
        "bathrooms": pl.Int32,
        "total_rooms": pl.Int32,
        "longitude": pl.Float64,
        "latitude": pl.Float64,
        "postcode": pl.Utf8,
        "address": pl.Utf8,
        "tenure": pl.Utf8,
        "property_type": pl.Utf8,
        "property_sub_type": pl.Utf8,
        "price": pl.Int64,
        "price_frequency": pl.Utf8,
        "price_qualifier": pl.Utf8,
        "floorspace_sqm": pl.Float64,
        "url": pl.Utf8,
        "features": pl.List(pl.Utf8),
        "first_visible_date": pl.Utf8,
        "update_date": pl.Utf8,
        "outcode": pl.Utf8,
        "house_share": pl.Boolean,
    }

    df = pl.DataFrame(
        {column: [p[column] for p in properties] for column in schema},
        schema=schema,
    )

    # Write-then-rename: Path.replace swaps the finished file into place.
    tmp_path = path.with_name(path.name + ".tmp")
    df.write_parquet(tmp_path)
    tmp_path.replace(path)
    log.info("Wrote %d properties to %s", len(df), path)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scrape orchestration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def load_outcodes() -> list[str]:
    """Return the sorted, unique outcodes of England postcodes.

    Reads the arcgis parquet at ``ARCGIS_PATH``, keeps rows whose ``ctry`` is
    "E92000001", and extracts the outcode from the start of each ``pcd``
    value. Rows whose ``pcd`` does not match the outcode pattern are dropped.
    """
    log.info("Loading outcodes from %s", ARCGIS_PATH)
    frame = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    england = frame.filter(pl.col("ctry") == "E92000001")
    log.info("England postcodes: %d", len(england))

    outcode_expr = pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode")
    outcode_series = england.select(outcode_expr).drop_nulls().get_column("outcode")
    outcodes = outcode_series.unique().sort().to_list()

    log.info("Unique England outcodes: %d", len(outcodes))
    return outcodes
|
||||
|
||||
|
||||
def build_postcode_index() -> PostcodeSpatialIndex:
    """Build the postcode spatial index from the arcgis parquet.

    Only rows with ``ctry`` equal to "E92000001" and non-null ``lat``/``long``
    are indexed.
    """
    log.info("Building postcode spatial index from %s", ARCGIS_PATH)
    frame = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
    england = frame.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])

    lats, lngs, pcds = (england.get_column(name).to_list() for name in ("lat", "long", "pcd"))
    return PostcodeSpatialIndex(lats, lngs, pcds)
|
||||
|
||||
|
||||
def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
    """Main scrape loop, intended to run in a background thread.

    For each channel in ``CHANNELS`` every outcode is resolved and searched,
    properties are de-duplicated by id, and the result is written to
    ``DATA_DIR / "rightmove_<buy|rent>.parquet"``. Progress and errors are
    published through the shared ``status`` object under ``status_lock``.

    Args:
        outcodes: Outcodes to scrape; the list itself is not modified.
        pc_index: Spatial index used to attach the nearest postcode.
    """
    with status_lock:
        status.state = "running"
        status.started_at = time.time()
        # Clear the previous run's end time; otherwise the status endpoint
        # computes elapsed time against a stale (earlier) finish timestamp.
        status.finished_at = 0.0
        status.errors = []
        status.properties_buy = 0
        status.properties_rent = 0

    # Shuffle for geographic diversity. A dedicated Random instance yields the
    # same order as seeding the global generator, without resetting the global
    # state that the retry jitter draws from.
    shuffled = list(outcodes)
    random.Random(SEED).shuffle(shuffled)
    total = len(shuffled)

    client = make_client()

    try:
        for channel_cfg in CHANNELS:
            channel_name = channel_cfg["channel"]
            is_buy = channel_name == "BUY"
            file_suffix = "buy" if is_buy else "rent"
            count_attr = "properties_buy" if is_buy else "properties_rent"
            all_properties: dict[int, dict] = {}  # dedup by id

            with status_lock:
                status.channel = channel_name
                status.outcodes_done = 0
                status.outcodes_total = total

            log.info("=== Starting %s channel (%d outcodes) ===", channel_name, total)

            for i, outcode in enumerate(shuffled):
                with status_lock:
                    status.outcode = outcode
                    status.outcodes_done = i

                log.debug("Outcode %s (%d/%d) — %d properties so far",
                          outcode, i + 1, total, len(all_properties))

                try:
                    outcode_id = resolve_outcode_id(client, outcode)
                    if not outcode_id:
                        log.debug("No Rightmove ID for outcode %s, skipping", outcode)
                        continue

                    props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
                    for p in props:
                        # First occurrence wins when an id shows up again.
                        all_properties.setdefault(p["id"], p)

                    with status_lock:
                        setattr(status, count_attr, len(all_properties))

                    log.info("Outcode %s: got %d properties (total: %d)", outcode, len(props), len(all_properties))

                except Exception as e:
                    # One failing outcode must not stop the whole run.
                    msg = f"Error scraping {outcode}/{channel_name}: {e}"
                    log.error(msg)
                    with status_lock:
                        status.errors.append(msg)

                if i < total - 1:
                    time.sleep(DELAY_BETWEEN_OUTCODES)

            # Write parquet
            deduped = list(all_properties.values())
            output_path = DATA_DIR / f"rightmove_{file_suffix}.parquet"
            write_parquet(deduped, output_path)

            with status_lock:
                setattr(status, count_attr, len(deduped))
                status.outcodes_done = total

            log.info("=== %s channel complete: %d unique properties ===", channel_name, len(deduped))

        with status_lock:
            status.state = "done"
            status.finished_at = time.time()
            elapsed = status.finished_at - status.started_at
            buy_count = status.properties_buy
            rent_count = status.properties_rent
        log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
                 elapsed, buy_count, rent_count)

    except Exception as e:
        log.exception("Fatal scrape error")
        with status_lock:
            status.state = "error"
            status.errors.append(f"Fatal: {e}")
            status.finished_at = time.time()
    finally:
        client.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Startup: load data
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Load the arcgis-derived data once at import time; the scrape thread reuses it.
log.info("Loading arcgis data...")
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()
log.info("Ready — %d outcodes, postcode index built", len(OUTCODES))

# ---------------------------------------------------------------------------
# Flask app
# ---------------------------------------------------------------------------

# WSGI application; the route handlers below are registered on it.
app = Flask(__name__)
|
||||
|
||||
|
||||
@app.route("/run", methods=["POST"])
def trigger_run():
    """Start a scrape in a background thread unless one is already running.

    Responds 409 when a scrape is in progress, otherwise 200 once the daemon
    worker thread has been started.
    """
    with status_lock:
        already_running = status.state == "running"
        if not already_running:
            # Claim the slot while still holding the lock so that two
            # concurrent requests cannot both start a scrape.
            status.state = "running"

    if already_running:
        return jsonify({"error": "Scrape already running"}), 409

    worker = threading.Thread(target=run_scrape, args=(OUTCODES, PC_INDEX), daemon=True)
    worker.start()
    return jsonify({"message": "Scrape started"}), 200
|
||||
|
||||
|
||||
@app.route("/status")
def get_status():
    """Return the current scrape progress as JSON.

    ``elapsed_seconds`` is measured from ``started_at`` to ``finished_at`` when
    the latter is set, otherwise to now; it is 0.0 before any run has started.
    """
    with status_lock:
        if status.started_at:
            end_time = status.finished_at or time.time()
            elapsed = end_time - status.started_at
        else:
            elapsed = 0.0

        payload = {
            "state": status.state,
            "channel": status.channel,
            "outcode": status.outcode,
            "outcodes_done": status.outcodes_done,
            "outcodes_total": status.outcodes_total,
            "properties_buy": status.properties_buy,
            "properties_rent": status.properties_rent,
            "errors": status.errors[-20:],  # last 20 errors
            "elapsed_seconds": round(elapsed, 1),
        }
        return jsonify(payload)
|
||||
|
||||
|
||||
@app.route("/debug")
def get_debug():
    """Return diagnostics: last raw search response and outcode-cache stats.

    The cache sample contains at most the first 20 cached outcode -> id pairs.
    """
    outcode_cache = debug_data["outcode_cache"]
    cache_sample = dict(list(outcode_cache.items())[:20])
    return jsonify({
        "last_response": debug_data["last_response"],
        "outcode_cache_size": len(outcode_cache),
        "outcode_cache_sample": cache_sample,
    })
|
||||
|
||||
|
||||
@app.route("/data/<filename>")
def serve_data(filename):
    """Serve a parquet file from ``DATA_DIR``; reject any other extension with 400."""
    if filename.endswith(".parquet"):
        return send_from_directory(DATA_DIR, filename)
    return jsonify({"error": "Only parquet files served"}), 400
|
||||
|
||||
|
||||
# Script entry point: serve the API on all interfaces, port 1234, with the
# Flask debugger disabled.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=1234, debug=False)
|
||||
6
finder/onthemarket/explain.md
Normal file
6
finder/onthemarket/explain.md
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
Request the following URL, passing the lowercased outcode as `location-id` and the page number as `page`. For example, page 2 of E13 is:
|
||||
|
||||
https://www.onthemarket.com/async/search/properties-v2/?search-type=for-sale&location-id=e13&page=2&view=map-list
|
||||
|
||||
and the response is in [[response.json]]
|
||||
4256
finder/onthemarket/response.json
Normal file
4256
finder/onthemarket/response.json
Normal file
File diff suppressed because it is too large
Load diff
9
finder/pyproject.toml
Normal file
9
finder/pyproject.toml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
[project]
|
||||
name = "finder"
|
||||
version = "0.1.0"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"flask",
|
||||
"httpx",
|
||||
"polars",
|
||||
]
|
||||
10918
finder/rightmove/buy.json
Normal file
10918
finder/rightmove/buy.json
Normal file
File diff suppressed because it is too large
Load diff
52
finder/rightmove/explain.md
Normal file
52
finder/rightmove/explain.md
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
The API works as follows. To search an outcode such as E11, first hit https://los.rightmove.co.uk/typeahead?query=E11&limit=10&exclude=STREET, which returns something like:
|
||||
|
||||
{
|
||||
"matches": [
|
||||
{
|
||||
"id": "746",
|
||||
"type": "OUTCODE",
|
||||
"displayName": "E11",
|
||||
"highlighting": "<span class='highlightLetter'>E11</span>",
|
||||
"highlights": [
|
||||
{
|
||||
"text": "E11",
|
||||
"highlighted": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "749",
|
||||
"type": "OUTCODE",
|
||||
"displayName": "E14",
|
||||
"highlighting": "displayName",
|
||||
"highlights": []
|
||||
},
|
||||
{
|
||||
"id": "752",
|
||||
"type": "OUTCODE",
|
||||
"displayName": "E17",
|
||||
"highlighting": "displayName",
|
||||
"highlights": []
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
We need to find the id of the object which has "type": "OUTCODE", and displayName matching the outcode we searched for, in this case E11, which is 746. Then we can hit the search endpoint with that id, and it will return the properties for that outcode:
|
||||
|
||||
https://www.rightmove.co.uk/api/property-search/listing/search?useLocationIdentifier=true&locationIdentifier=OUTCODE%5E746&buy=For+sale&_includeSSTC=on&index=0&sortType=2&channel=BUY&transactionType=BUY&displayLocationIdentifier=E12.html
|
||||
|
||||
You can see the example response to this at [[buy.json]]
|
||||
|
||||
You must set locationIdentifier=OUTCODE%5E{id}, where id is 746 in this case, giving locationIdentifier=OUTCODE%5E746. Paging works by increasing index by the number of results per page, which is 24, so the next page is index=24, then index=48, and so on.
|
||||
|
||||
|
||||
The rental endpoint works similarly:
|
||||
|
||||
https://www.rightmove.co.uk/api/property-search/listing/search?locationIdentifier=OUTCODE%5E745&index=0&sortType=6&channel=RENT&transactionType=LETTING&displayLocationIdentifier=E16.html
|
||||
|
||||
https://www.rightmove.co.uk/api/property-search/listing/search?locationIdentifier=OUTCODE%5E752&index=48&sortType=6&channel=RENT&transactionType=LETTING&displayLocationIdentifier=E17.html
|
||||
|
||||
|
||||
See a response example for the rental endpoint at [[rental.json]]
|
||||
|
||||
8247
finder/rightmove/rental.json
Normal file
8247
finder/rightmove/rental.json
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue