changes
This commit is contained in:
parent
524580eb25
commit
ffe080adef
82 changed files with 2652 additions and 2956 deletions
|
|
@ -7,5 +7,6 @@ COPY pyproject.toml ./
|
|||
RUN uv pip install --system -r pyproject.toml
|
||||
|
||||
COPY *.py ./
|
||||
COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet
|
||||
|
||||
CMD ["python3", "main.py"]
|
||||
|
|
|
|||
|
|
@ -11,6 +11,11 @@ RETRY_BASE_DELAY = 2.0
|
|||
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
|
||||
SEED = 42
|
||||
|
||||
# Schedule: hour of day (UTC) to auto-run scrape. Set to -1 to disable.
|
||||
SCHEDULE_HOUR = int(os.environ.get("SCHEDULE_HOUR", "3"))
|
||||
# Whether to run a scrape immediately on startup
|
||||
RUN_ON_STARTUP = os.environ.get("RUN_ON_STARTUP", "true").lower() in ("1", "true", "yes")
|
||||
|
||||
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
||||
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
|
||||
RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
|
||||
|
|
|
|||
|
|
@ -28329,3 +28329,27 @@
|
|||
* Running on http://127.0.0.1:1234
|
||||
* Running on http://10.66.109.86:1234
|
||||
2026-02-15 22:37:52,025 [INFO] [33mPress CTRL+C to quit[0m
|
||||
2026-02-15 23:00:08,987 [INFO] Loading arcgis data...
|
||||
2026-02-15 23:00:08,988 [INFO] Loading outcodes from /data/arcgis_data.parquet
|
||||
2026-02-15 23:00:09,078 [INFO] England postcodes: 2260065
|
||||
2026-02-15 23:00:09,118 [INFO] Unique England outcodes: 2323
|
||||
2026-02-15 23:00:09,118 [INFO] Building postcode spatial index from /data/arcgis_data.parquet
|
||||
2026-02-15 23:00:10,418 [INFO] Postcode spatial index: 113226 cells, 2260065 postcodes
|
||||
2026-02-15 23:00:10,434 [INFO] Ready — 2323 outcodes, postcode index built
|
||||
2026-02-15 23:00:10,446 [INFO] [31m[1mWARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.[0m
|
||||
* Running on all addresses (0.0.0.0)
|
||||
* Running on http://127.0.0.1:1234
|
||||
* Running on http://10.66.109.86:1234
|
||||
2026-02-15 23:00:10,446 [INFO] [33mPress CTRL+C to quit[0m
|
||||
2026-02-16 19:56:56,857 [INFO] Loading arcgis data...
|
||||
2026-02-16 19:56:56,857 [INFO] Loading outcodes from /data/arcgis_data.parquet
|
||||
2026-02-16 19:56:57,061 [INFO] England postcodes: 2260065
|
||||
2026-02-16 19:56:57,161 [INFO] Unique England outcodes: 2323
|
||||
2026-02-16 19:56:57,162 [INFO] Building postcode spatial index from /data/arcgis_data.parquet
|
||||
2026-02-16 19:57:00,146 [INFO] Postcode spatial index: 113226 cells, 2260065 postcodes
|
||||
2026-02-16 19:57:00,227 [INFO] Ready — 2323 outcodes, postcode index built
|
||||
2026-02-16 19:57:00,247 [INFO] [31m[1mWARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.[0m
|
||||
* Running on all addresses (0.0.0.0)
|
||||
* Running on http://127.0.0.1:1234
|
||||
* Running on http://10.66.109.86:1234
|
||||
2026-02-16 19:57:00,248 [INFO] [33mPress CTRL+C to quit[0m
|
||||
|
|
|
|||
|
|
@ -1,12 +1,13 @@
|
|||
import logging
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from flask import Flask, Response, jsonify, send_from_directory
|
||||
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
|
||||
|
||||
from constants import DATA_DIR
|
||||
from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR
|
||||
from rightmove import outcode_cache
|
||||
from scraper import (
|
||||
_sync_gauges,
|
||||
|
|
@ -46,6 +47,52 @@ OUTCODES = load_outcodes()
|
|||
PC_INDEX = build_postcode_index()
|
||||
log.info("Ready — %d outcodes, postcode index built", len(OUTCODES))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scheduler
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _start_scrape() -> bool:
    """Try to start a scrape in a background daemon thread.

    The "already running" check and the transition to ``"running"`` are done
    together under ``status_lock`` so that two concurrent callers cannot both
    start a scrape.

    Returns:
        True if a new scrape thread was started, False if a scrape was
        already running.

    Raises:
        Exception: whatever ``threading.Thread.start`` raises (typically
            ``RuntimeError``) if the worker thread cannot be started. The
            previous state is restored before the exception propagates.
    """
    with status_lock:
        if status.state == "running":
            return False
        previous_state = status.state
        status.state = "running"

    worker = threading.Thread(
        target=run_scrape, args=(OUTCODES, PC_INDEX), daemon=True
    )
    try:
        worker.start()
    except Exception:
        # Without this rollback a failed start would leave the state stuck at
        # "running" with no worker alive, so every later attempt would be
        # rejected as "already running".
        with status_lock:
            status.state = previous_state
        raise
    return True
|
||||
|
||||
|
||||
def _seconds_until(hour: int) -> float:
|
||||
"""Seconds from now until the next occurrence of `hour`:00 UTC."""
|
||||
now = datetime.now(timezone.utc)
|
||||
target = now.replace(hour=hour, minute=0, second=0, microsecond=0)
|
||||
if target <= now:
|
||||
target += timedelta(days=1)
|
||||
return (target - now).total_seconds()
|
||||
|
||||
|
||||
def _scheduler_loop() -> None:
    """Trigger a scrape once a day at SCHEDULE_HOUR UTC; never returns.

    Intended to be the target of a background thread. Each iteration sleeps
    until the next SCHEDULE_HOUR:00 UTC and then tries to start a scrape. A
    scrape that is already running is skipped with a warning.
    """
    log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR)
    while True:
        wait = _seconds_until(SCHEDULE_HOUR)
        log.info("Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600)
        time.sleep(wait)
        log.info("Scheduled scrape triggered")
        try:
            started = _start_scrape()
        except Exception:
            # An uncaught exception here would end this thread silently and
            # stop all future scheduled scrapes; log it and wait for the
            # next slot instead.
            log.exception("Scheduled scrape failed to start")
            continue
        if not started:
            log.warning("Scheduled scrape skipped — already running")
|
||||
|
||||
|
||||
if RUN_ON_STARTUP:
|
||||
log.info("RUN_ON_STARTUP=true — starting initial scrape")
|
||||
_start_scrape()
|
||||
|
||||
if SCHEDULE_HOUR >= 0:
|
||||
scheduler = threading.Thread(target=_scheduler_loop, daemon=True)
|
||||
scheduler.start()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Flask app
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -55,14 +102,9 @@ app = Flask(__name__)
|
|||
|
||||
@app.route("/run", methods=["POST"])
def trigger_run():
    """Start a scrape on demand.

    Delegates to ``_start_scrape`` so the manual trigger and the scheduler
    share one "already running" check instead of duplicating the lock and
    thread handling here.

    Returns:
        A ``(response, status)`` pair: 200 with a confirmation message when
        a scrape was started, 409 with an error message when one is already
        running.
    """
    if _start_scrape():
        return jsonify({"message": "Scrape started"}), 200
    return jsonify({"error": "Scrape already running"}), 409
|
||||
|
||||
|
||||
@app.route("/status")
|
||||
|
|
@ -72,7 +114,7 @@ def get_status():
|
|||
if status.started_at:
|
||||
end = status.finished_at if status.finished_at else time.time()
|
||||
elapsed = end - status.started_at
|
||||
return jsonify({
|
||||
resp = {
|
||||
"state": status.state,
|
||||
"channel": status.channel,
|
||||
"outcode": status.outcode,
|
||||
|
|
@ -82,7 +124,10 @@ def get_status():
|
|||
"properties_rent": status.properties_rent,
|
||||
"errors": status.errors[-20:], # last 20 errors
|
||||
"elapsed_seconds": round(elapsed, 1),
|
||||
})
|
||||
}
|
||||
if SCHEDULE_HOUR >= 0:
|
||||
resp["next_scrape_in_seconds"] = round(_seconds_until(SCHEDULE_HOUR))
|
||||
return jsonify(resp)
|
||||
|
||||
|
||||
@app.route("/debug")
|
||||
|
|
|
|||
|
|
@ -159,8 +159,8 @@ def run_scrape(outcodes: list[str], pc_index: PostcodeSpatialIndex) -> None:
|
|||
|
||||
# Write parquet
|
||||
deduped = list(all_properties.values())
|
||||
output_path = DATA_DIR / f"rightmove_{file_suffix}.parquet"
|
||||
write_parquet(deduped, output_path)
|
||||
output_path = DATA_DIR / f"online_listings_{file_suffix}.parquet"
|
||||
write_parquet(deduped, output_path, channel=file_suffix)
|
||||
|
||||
with status_lock:
|
||||
if channel_name == "BUY":
|
||||
|
|
|
|||
|
|
@ -1,63 +1,94 @@
|
|||
import logging
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
from transform import normalize_price
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
def write_parquet(properties: list[dict], path: Path) -> None:
|
||||
"""Write properties list to parquet using Polars."""
|
||||
def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
||||
"""Write properties list to parquet with server-ready column names.
|
||||
|
||||
channel: "buy" or "rent"
|
||||
"""
|
||||
if not properties:
|
||||
log.warning("No properties to write to %s", path)
|
||||
return
|
||||
|
||||
# Parse first_visible_date to datetime
|
||||
listing_dates = []
|
||||
for p in properties:
|
||||
fvd = p.get("first_visible_date", "")
|
||||
if fvd:
|
||||
try:
|
||||
dt = datetime.fromisoformat(fvd.replace("Z", "+00:00"))
|
||||
listing_dates.append(dt.replace(tzinfo=None))
|
||||
except (ValueError, TypeError):
|
||||
listing_dates.append(None)
|
||||
else:
|
||||
listing_dates.append(None)
|
||||
|
||||
# Derive asking price / asking rent based on channel
|
||||
if channel == "buy":
|
||||
asking_prices = [p["price"] for p in properties]
|
||||
asking_rents = [None] * len(properties)
|
||||
listing_statuses = ["For sale"] * len(properties)
|
||||
else:
|
||||
asking_prices = [None] * len(properties)
|
||||
asking_rents = [
|
||||
normalize_price(p["price"], p["price_frequency"]) for p in properties
|
||||
]
|
||||
listing_statuses = ["For rent"] * len(properties)
|
||||
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"id": [p["id"] for p in properties],
|
||||
"bedrooms": [p["bedrooms"] for p in properties],
|
||||
"bathrooms": [p["bathrooms"] for p in properties],
|
||||
"total_rooms": [p["total_rooms"] for p in properties],
|
||||
"longitude": [p["longitude"] for p in properties],
|
||||
"latitude": [p["latitude"] for p in properties],
|
||||
"postcode": [p["postcode"] for p in properties],
|
||||
"address": [p["address"] for p in properties],
|
||||
"tenure": [p["tenure"] for p in properties],
|
||||
"property_type": [p["property_type"] for p in properties],
|
||||
"property_sub_type": [p["property_sub_type"] for p in properties],
|
||||
"price": [p["price"] for p in properties],
|
||||
"price_frequency": [p["price_frequency"] for p in properties],
|
||||
"price_qualifier": [p["price_qualifier"] for p in properties],
|
||||
"floorspace_sqm": [p["floorspace_sqm"] for p in properties],
|
||||
"url": [p["url"] for p in properties],
|
||||
"features": [p["features"] for p in properties],
|
||||
"first_visible_date": [p["first_visible_date"] for p in properties],
|
||||
"update_date": [p["update_date"] for p in properties],
|
||||
"outcode": [p["outcode"] for p in properties],
|
||||
"house_share": [p["house_share"] for p in properties],
|
||||
"Bedrooms": [p["Bedrooms"] for p in properties],
|
||||
"Bathrooms": [p["Bathrooms"] for p in properties],
|
||||
"Number of bedrooms & living rooms": [
|
||||
p["Number of bedrooms & living rooms"] for p in properties
|
||||
],
|
||||
"lon": [p["lon"] for p in properties],
|
||||
"lat": [p["lat"] for p in properties],
|
||||
"Postcode": [p["Postcode"] for p in properties],
|
||||
"Address per Property Register": [
|
||||
p["Address per Property Register"] for p in properties
|
||||
],
|
||||
"Leashold/Freehold": [p["Leashold/Freehold"] for p in properties],
|
||||
"Property type": [p["Property type"] for p in properties],
|
||||
"Property sub-type": [p["Property sub-type"] for p in properties],
|
||||
"Price qualifier": [p["Price qualifier"] for p in properties],
|
||||
"Total floor area (sqm)": [
|
||||
p["Total floor area (sqm)"] for p in properties
|
||||
],
|
||||
"Listing URL": [p["Listing URL"] for p in properties],
|
||||
"Listing features": [p["Listing features"] for p in properties],
|
||||
"Listing date": listing_dates,
|
||||
"Listing status": listing_statuses,
|
||||
"Asking price": asking_prices,
|
||||
"Asking rent (monthly)": asking_rents,
|
||||
},
|
||||
schema={
|
||||
"id": pl.Int64,
|
||||
"bedrooms": pl.Int32,
|
||||
"bathrooms": pl.Int32,
|
||||
"total_rooms": pl.Int32,
|
||||
"longitude": pl.Float64,
|
||||
"latitude": pl.Float64,
|
||||
"postcode": pl.Utf8,
|
||||
"address": pl.Utf8,
|
||||
"tenure": pl.Utf8,
|
||||
"property_type": pl.Utf8,
|
||||
"property_sub_type": pl.Utf8,
|
||||
"price": pl.Int64,
|
||||
"price_frequency": pl.Utf8,
|
||||
"price_qualifier": pl.Utf8,
|
||||
"floorspace_sqm": pl.Float64,
|
||||
"url": pl.Utf8,
|
||||
"features": pl.List(pl.Utf8),
|
||||
"first_visible_date": pl.Utf8,
|
||||
"update_date": pl.Utf8,
|
||||
"outcode": pl.Utf8,
|
||||
"house_share": pl.Boolean,
|
||||
"Bedrooms": pl.Int32,
|
||||
"Bathrooms": pl.Int32,
|
||||
"Number of bedrooms & living rooms": pl.Int32,
|
||||
"lon": pl.Float64,
|
||||
"lat": pl.Float64,
|
||||
"Postcode": pl.Utf8,
|
||||
"Address per Property Register": pl.Utf8,
|
||||
"Leashold/Freehold": pl.Utf8,
|
||||
"Property type": pl.Utf8,
|
||||
"Property sub-type": pl.Utf8,
|
||||
"Price qualifier": pl.Utf8,
|
||||
"Total floor area (sqm)": pl.Float64,
|
||||
"Listing URL": pl.Utf8,
|
||||
"Listing features": pl.List(pl.Utf8),
|
||||
"Listing date": pl.Datetime("us"),
|
||||
"Listing status": pl.Utf8,
|
||||
"Asking price": pl.Int64,
|
||||
"Asking rent (monthly)": pl.Int64,
|
||||
},
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -98,27 +98,27 @@ def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex)
|
|||
update_date = listing_update.get("listingUpdateDate", "")
|
||||
|
||||
postcode = pc_index.nearest(lat, lng)
|
||||
if not postcode:
|
||||
log.debug("No England postcode for property at %.4f, %.4f — skipping", lat, lng)
|
||||
return None
|
||||
|
||||
return {
|
||||
"id": prop.get("id"),
|
||||
"bedrooms": bedrooms,
|
||||
"bathrooms": bathrooms,
|
||||
"total_rooms": bedrooms + bathrooms,
|
||||
"longitude": lng,
|
||||
"latitude": lat,
|
||||
"postcode": postcode,
|
||||
"address": prop.get("displayAddress", ""),
|
||||
"tenure": extract_tenure(prop.get("tenure")),
|
||||
"property_type": map_property_type(sub_type),
|
||||
"property_sub_type": sub_type or "Unknown",
|
||||
"Bedrooms": bedrooms,
|
||||
"Bathrooms": bathrooms,
|
||||
"Number of bedrooms & living rooms": bedrooms + bathrooms,
|
||||
"lon": lng,
|
||||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Address per Property Register": prop.get("displayAddress", ""),
|
||||
"Leashold/Freehold": extract_tenure(prop.get("tenure")),
|
||||
"Property type": map_property_type(sub_type),
|
||||
"Property sub-type": sub_type or "Unknown",
|
||||
"price": price,
|
||||
"price_frequency": frequency,
|
||||
"price_qualifier": price_qualifier,
|
||||
"floorspace_sqm": parse_display_size(prop.get("displaySize")),
|
||||
"url": RIGHTMOVE_BASE + prop.get("propertyUrl", ""),
|
||||
"features": key_features,
|
||||
"Price qualifier": price_qualifier,
|
||||
"Total floor area (sqm)": parse_display_size(prop.get("displaySize")),
|
||||
"Listing URL": RIGHTMOVE_BASE + prop.get("propertyUrl", ""),
|
||||
"Listing features": key_features,
|
||||
"first_visible_date": prop.get("firstVisibleDate", ""),
|
||||
"update_date": update_date,
|
||||
"outcode": outcode,
|
||||
"house_share": sub_type == "House Share",
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue