all good

2026-05-17 10:16:30 +01:00 · 2026-05-17 10:16:30 +01:00 · 017902b8e6
commit 017902b8e6
parent 47d89f6fad
82 changed files with 331466 additions and 54841 deletions
--- a/finder/constants.py
+++ b/finder/constants.py
@@ -1,8 +1,13 @@
 import os
 from pathlib import Path

-ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
-DATA_DIR = Path("/app/data")
+FINDER_DIR = Path(__file__).resolve().parent
+REPO_DIR = FINDER_DIR.parent
+
+DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data")))
+ARCGIS_PATH = Path(
+    os.environ.get("ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet"))
+)
 PAGE_SIZE = 24
 DELAY_BETWEEN_PAGES = 0.3
 DELAY_BETWEEN_OUTCODES = 0.5
@@ -10,42 +15,6 @@ MAX_RETRIES = 3
 RETRY_BASE_DELAY = 2.0
 GRID_CELL_SIZE = 0.01  # degrees for postcode spatial index
 MAX_BEDROOMS = 20  # sanity cap — values above this are almost certainly parsing errors
-# Rent sanity bounds (monthly). Rents outside this range are nulled out — they are
-# almost always total-stay pricing (e.g. "Golf Open 2026" short lets), annual rents
-# mislabelled as monthly, or data errors.
-MIN_RENT_MONTHLY = 50  # below £50/month is implausible for any UK property
-MAX_RENT_MONTHLY = 25_000  # above £25k/month covers ultra-prime London; higher is suspect
-SEED = 42
-CHECKPOINT_INTERVAL = int(os.environ.get("CHECKPOINT_INTERVAL", "900"))  # seconds
-
-# Schedule: hour of day (UTC) to auto-run scrape. Set to -1 to disable.
-SCHEDULE_HOUR = int(os.environ.get("SCHEDULE_HOUR", "3"))
-# Whether to run a scrape immediately on startup
-RUN_ON_STARTUP = os.environ.get("RUN_ON_STARTUP", "").lower() in ("1", "true", "yes")
-# Enable/disable individual sources
-SCRAPE_RIGHTMOVE = os.environ.get("SCRAPE_RIGHTMOVE", "true").lower() in (
-    "1",
-    "true",
-    "yes",
-)
-SCRAPE_HOMECOUK = os.environ.get("SCRAPE_HOMECOUK", "true").lower() in (
-    "1",
-    "true",
-    "yes",
-)
-SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in (
-    "1",
-    "true",
-    "yes",
-)
-SCRAPE_ZOOPLA = os.environ.get("SCRAPE_ZOOPLA", "true").lower() in (
-    "1",
-    "true",
-    "yes",
-)
-
-# URL to trigger server data reload after scrape (e.g. http://server:8001/api/reload)
-RELOAD_URL = os.environ.get("RELOAD_URL", "")

 TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
 SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
@@ -55,14 +24,36 @@ RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
 HOMECOUK_BASE = "https://home.co.uk"
 HOMECOUK_API_BASE = f"{HOMECOUK_BASE}/api"
 HOMECOUK_PER_PAGE = 30  # max supported by the API
-HOMECOUK_CONCURRENCY = int(os.environ.get("HOMECOUK_CONCURRENCY", "4"))
-
-# OpenRent
-OPENRENT_BASE = "https://www.openrent.co.uk"

 # Zoopla
 ZOOPLA_BASE = "https://www.zoopla.co.uk"

+# Greater London-ish postcode areas. This intentionally uses broad area
+# prefixes so a manual scrape can include central/inner London plus common
+# outer-London and near-London outcodes without maintaining a long borough list.
+LONDON_OUTCODE_PREFIXES = {
+    "E",
+    "EC",
+    "N",
+    "NW",
+    "SE",
+    "SW",
+    "W",
+    "WC",
+    "BR",
+    "CR",
+    "DA",
+    "EN",
+    "HA",
+    "IG",
+    "KT",
+    "RM",
+    "SM",
+    "TW",
+    "UB",
+    "WD",
+}
+
 PROPERTY_TYPE_MAP = {
    "Detached": "Detached",
    "Semi-Detached": "Semi-Detached",
@@ -150,5 +141,4 @@ PROPERTY_TYPE_MAP = {

 CHANNELS = [
    {"channel": "BUY", "transactionType": "BUY", "sortType": "2"},
-    {"channel": "RENT", "transactionType": "LETTING", "sortType": "6"},
 ]