This commit is contained in:
Andras Schmelczer 2026-05-17 10:16:30 +01:00
parent 47d89f6fad
commit 017902b8e6
82 changed files with 331466 additions and 54841 deletions

View file

@ -1,8 +1,13 @@
import os
from pathlib import Path
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
DATA_DIR = Path("/app/data")
FINDER_DIR = Path(__file__).resolve().parent
REPO_DIR = FINDER_DIR.parent
DATA_DIR = Path(os.environ.get("DATA_DIR", str(FINDER_DIR / "data")))
ARCGIS_PATH = Path(
os.environ.get("ARCGIS_PATH", str(REPO_DIR / "property-data" / "arcgis_data.parquet"))
)
PAGE_SIZE = 24
DELAY_BETWEEN_PAGES = 0.3
DELAY_BETWEEN_OUTCODES = 0.5
@ -10,42 +15,6 @@ MAX_RETRIES = 3
RETRY_BASE_DELAY = 2.0
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
MAX_BEDROOMS = 20 # sanity cap — values above this are almost certainly parsing errors
# Rent sanity bounds (monthly). Rents outside this range are nulled out — they are
# almost always total-stay pricing (e.g. "Golf Open 2026" short lets), annual rents
# mislabelled as monthly, or data errors.
MIN_RENT_MONTHLY = 50 # below £50/month is implausible for any UK property
MAX_RENT_MONTHLY = 25_000 # above £25k/month covers ultra-prime London; higher is suspect
SEED = 42
CHECKPOINT_INTERVAL = int(os.environ.get("CHECKPOINT_INTERVAL", "900")) # seconds
# Schedule: hour of day (UTC) to auto-run scrape. Set to -1 to disable.
SCHEDULE_HOUR = int(os.environ.get("SCHEDULE_HOUR", "3"))
# Whether to run a scrape immediately on startup
RUN_ON_STARTUP = os.environ.get("RUN_ON_STARTUP", "").lower() in ("1", "true", "yes")
# Enable/disable individual sources
SCRAPE_RIGHTMOVE = os.environ.get("SCRAPE_RIGHTMOVE", "true").lower() in (
"1",
"true",
"yes",
)
SCRAPE_HOMECOUK = os.environ.get("SCRAPE_HOMECOUK", "true").lower() in (
"1",
"true",
"yes",
)
SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in (
"1",
"true",
"yes",
)
SCRAPE_ZOOPLA = os.environ.get("SCRAPE_ZOOPLA", "true").lower() in (
"1",
"true",
"yes",
)
# URL to trigger server data reload after scrape (e.g. http://server:8001/api/reload)
RELOAD_URL = os.environ.get("RELOAD_URL", "")
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
@ -55,14 +24,36 @@ RIGHTMOVE_BASE = "https://www.rightmove.co.uk"
HOMECOUK_BASE = "https://home.co.uk"
HOMECOUK_API_BASE = f"{HOMECOUK_BASE}/api"
HOMECOUK_PER_PAGE = 30 # max supported by the API
HOMECOUK_CONCURRENCY = int(os.environ.get("HOMECOUK_CONCURRENCY", "4"))
# OpenRent
OPENRENT_BASE = "https://www.openrent.co.uk"
# Zoopla
ZOOPLA_BASE = "https://www.zoopla.co.uk"
# Greater London-ish postcode areas. This intentionally uses broad area
# prefixes so a manual scrape can include central/inner London plus common
# outer-London and near-London outcodes without maintaining a long borough list.
LONDON_OUTCODE_PREFIXES = {
"E",
"EC",
"N",
"NW",
"SE",
"SW",
"W",
"WC",
"BR",
"CR",
"DA",
"EN",
"HA",
"IG",
"KT",
"RM",
"SM",
"TW",
"UB",
"WD",
}
PROPERTY_TYPE_MAP = {
"Detached": "Detached",
"Semi-Detached": "Semi-Detached",
@ -150,5 +141,4 @@ PROPERTY_TYPE_MAP = {
CHANNELS = [
{"channel": "BUY", "transactionType": "BUY", "sortType": "2"},
{"channel": "RENT", "transactionType": "LETTING", "sortType": "6"},
]