# Finder configuration: filesystem paths, scraper tuning values, provider
# endpoints, Gluetun/FlareSolverr settings, and lookup tables.
import os
|
|
from pathlib import Path
|
|
|
|
# Directory containing this module, and the directory one level above it.
FINDER_DIR = Path(__file__).resolve().parent
REPO_DIR = FINDER_DIR.parent

# Each location below is taken from the environment variable of the same name
# when it is set; otherwise it falls back to a path derived from this file.
DATA_DIR = Path(os.getenv("DATA_DIR", str(FINDER_DIR.joinpath("data"))))
ARCGIS_PATH = Path(
    os.getenv(
        "ARCGIS_PATH",
        str(REPO_DIR.joinpath("property-data", "arcgis_data.parquet")),
    )
)
|
|
# Paging and pacing values for the scraper. Units for the delays are defined
# by the consuming code (presumably seconds — confirm against the scraper).
PAGE_SIZE: int = 24
DELAY_BETWEEN_PAGES: float = 0.3
DELAY_BETWEEN_OUTCODES: float = 0.5

# Retry settings: attempt count and the base delay used between attempts.
MAX_RETRIES: int = 3
RETRY_BASE_DELAY: float = 2.0

# Cell size, in degrees, of the postcode spatial index.
GRID_CELL_SIZE: float = 0.01

# Sanity cap: bedroom counts above this are almost certainly parsing errors.
MAX_BEDROOMS: int = 20
|
|
|
|
# Rightmove endpoints.
TYPEAHEAD_URL: str = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL: str = "https://www.rightmove.co.uk/api/property-search/listing/search"
RIGHTMOVE_BASE: str = "https://www.rightmove.co.uk"

# Listing detail page, fetched with a plain HTTPS GET (no Cloudflare). The
# page's window.__PAGE_MODEL embeds propertyData.address.{outcode,incode};
# joined together they give the property's TRUE full postcode, whereas the
# search API exposes only the outcode. {id} is the numeric listing id taken
# from the search response.
RIGHTMOVE_DETAIL_URL: str = "https://www.rightmove.co.uk/properties/{id}"

# Because the search API returns only an outcode-level display address, the
# full postcode is recovered from each listing's detail page (see
# finder/rightmove.py::parse_detail_postcode). One extra GET per listing is a
# big increase in request volume over the ~1000-result-per-outcode search, so
# detail fetching can be switched off and is capped per outcode (mirroring the
# ZOOPLA_* settings). Enabled by default.
RIGHTMOVE_FETCH_DETAILS: bool = True  # fetch detail pages for true per-listing postcodes
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE: int = 4_000  # max detail-page fetches per outcode
|
|
|
|
# OnTheMarket
ONTHEMARKET_BASE: str = "https://www.onthemarket.com"

# Zoopla
ZOOPLA_BASE: str = "https://www.zoopla.co.uk"

# Zoopla search cards carry only an outcode-level address, so the full
# postcode and precise coordinates are scraped from each listing's detail
# page. The settings below bound that extra work (see finder/zoopla.py and
# finder/scraper.py).
ZOOPLA_FETCH_DETAILS: bool = True  # fetch detail pages for precise per-listing postcodes
ZOOPLA_MAX_DETAILS_PER_OUTCODE: int = 4_000  # max detail-page fetches per outcode

# Per detail-page navigation timeout, in milliseconds.
# NOTE(review): 1_500_000 ms is 25 minutes for a single navigation — confirm
# this is intended and not a mistyped smaller value.
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS: int = 1_500_000

# Share of one outcode's wall-clock budget (ZOOPLA_OUTCODE_TIMEOUT_SECONDS)
# that may be spent fetching details. The rest is kept for search pagination
# so detail fetches can never trip the timeout and discard collected listings.
ZOOPLA_DETAIL_BUDGET_FRACTION: float = 0.6
|
|
|
|
# Gluetun VPN. Network endpoints are env-overridable because they are
# deployment-specific: when finder runs in a SEPARATE container they use the
# `gluetun` hostname (defaults below); when finder SHARES gluetun's network
# namespace (docker-compose.yml, network_mode container:media_gluetun) they
# become localhost and GLUETUN_PROXY is empty (the shared netns already
# tunnels all traffic, so no HTTP proxy is needed).
# GLUETUN_PROXY="" (empty) => None => direct connection (no proxy); used in
# the shared-netns deployment.
GLUETUN_PROXY = os.environ.get("GLUETUN_PROXY", "http://gluetun:8888") or None
GLUETUN_CONTROL_URL = os.environ.get("GLUETUN_CONTROL_URL", "http://gluetun:8000")

# Gluetun API key. Read from the environment like the other
# deployment-specific Gluetun settings so the credential can be supplied (and
# rotated) without editing source.
# SECURITY: the literal fallback is kept only so existing deployments keep
# working; a secret committed to source control should be treated as exposed —
# rotate it and set GLUETUN_API_KEY in the environment instead.
GLUETUN_API_KEY = os.environ.get(
    "GLUETUN_API_KEY", "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
)

# Egress-IP rotations to try per Cloudflare challenge. Keep at 0 for Zoopla:
# rotating among Gluetun's datacenter IPs doesn't clear Cloudflare and would
# rotate away from the IP a cleared Cloudflare session was bound to, voiding
# it. Raise only with residential IPs where rotation helps.
GLUETUN_MAX_ROTATIONS = 0  # max egress-IP rotations per Cloudflare challenge
|
|
|
|
# Which fetcher Zoopla scraping uses:
#   "flaresolverr" (default) — solves Cloudflare through the FlareSolverr
#       sidecar (docker-compose.yml); needs no display/VNC and is verified to
#       return the RSC flight stream with postcode + coordinates.
#   "camoufox" — drives a local anti-fingerprint browser (needs an interactive
#       solve on datacenter IPs).
ZOOPLA_FETCHER = os.getenv("ZOOPLA_FETCHER", "flaresolverr")
FLARESOLVERR_URL = os.getenv("FLARESOLVERR_URL", "http://gluetun:8191/v1")

# Per-request solve budget in milliseconds; the first solve is slow.
FLARESOLVERR_MAX_TIMEOUT_MS = 120_000
|
|
|
|
# Postcode-area prefixes treated as "Greater London-ish". Broad area prefixes
# are used on purpose so a manual scrape covers central/inner London plus the
# common outer-London and near-London outcodes without maintaining a long
# borough list.
LONDON_OUTCODE_PREFIXES = {
    "E", "EC", "N", "NW", "SE", "SW", "W", "WC",
    "BR", "CR", "DA", "EN", "HA", "IG",
    "KT", "RM", "SM", "TW", "UB", "WD",
}
|
|
|
|
# The five canonical categories every provider label is normalised to.
_DETACHED = "Detached"
_SEMI_DETACHED = "Semi-Detached"
_TERRACED = "Terraced"
_FLATS = "Flats/Maisonettes"
_OTHER = "Other"

# Maps a provider-supplied property-type label to its canonical category.
# Keys are matched exactly (case- and punctuation-sensitive), which is why
# several spelling variants of the same label are listed.
# NOTE(review): "Farm House" maps to Other while "farm_house" maps to
# Detached — confirm whether that difference is intentional.
PROPERTY_TYPE_MAP = {
    "Detached": _DETACHED,
    "Semi-Detached": _SEMI_DETACHED,
    "Terraced": _TERRACED,
    "End of Terrace": _TERRACED,
    "Mid Terrace": _TERRACED,
    "Flat": _FLATS,
    "Maisonette": _FLATS,
    "Studio": _FLATS,
    "Apartment": _FLATS,
    "Penthouse": _FLATS,
    "Ground Flat": _FLATS,
    "Duplex": _FLATS,
    "Detached Bungalow": _DETACHED,
    "Semi-Detached Bungalow": _SEMI_DETACHED,
    "Town House": _TERRACED,
    "Link Detached": _DETACHED,
    "Link Detached House": _DETACHED,
    "Bungalow": _OTHER,
    "Cottage": _OTHER,
    "Park Home": _OTHER,
    "Mobile Home": _OTHER,
    "Caravan": _OTHER,
    "Lodge": _OTHER,
    "Land": _OTHER,
    "Farm / Barn": _OTHER,
    "Farm House": _OTHER,
    "House": _DETACHED,
    "House of Multiple Occupation": _OTHER,
    "House Share": _OTHER,
    "Not Specified": _OTHER,
    "Chalet": _OTHER,
    "Barn Conversion": _OTHER,
    "Coach House": _OTHER,
    "Character Property": _OTHER,
    "Cluster House": _OTHER,
    "Retirement Property": _OTHER,
    "Parking": _OTHER,
    "Plot": _OTHER,
    "Garages": _OTHER,
    "Mews": _TERRACED,
    "Property": _OTHER,
    "Flat Share": _OTHER,
    "Block of Apartments": _OTHER,
    "Private Halls": _OTHER,
    "Terraced Bungalow": _TERRACED,
    "Equestrian Facility": _OTHER,
    "Ground Maisonette": _FLATS,
    "Country House": _DETACHED,
    "Village House": _DETACHED,
    "Farm Land": _OTHER,
    "House Boat": _OTHER,
    "Barn": _OTHER,
    "Serviced Apartments": _OTHER,
    # Space-separated variants from legacy provider normalization.
    "Semi Detached": _SEMI_DETACHED,
    "Semi Detached Bungalow": _SEMI_DETACHED,
    "End Of Terrace": _TERRACED,
    "End Terrace": _TERRACED,
    "Block Of Apartments": _OTHER,
    # Lowercase variants from listing APIs.
    "house": _DETACHED,
    "bungalow": _OTHER,
    "townhouse": _TERRACED,
    "land": _OTHER,
    "other": _OTHER,
    "not-specified": _OTHER,
    "retirement-property": _OTHER,
    "equestrian-facility": _OTHER,
    "flat": _FLATS,
    "detached": _DETACHED,
    "semi-detached": _SEMI_DETACHED,
    "terraced": _TERRACED,
    "maisonette": _FLATS,
    "apartment": _FLATS,
    "studio": _FLATS,
    "penthouse": _FLATS,
    "cottage": _OTHER,
    "chalet": _OTHER,
    "farm_house": _DETACHED,
    "country house": _DETACHED,
    "village house": _DETACHED,
}
|
|
|
|
# Search channel parameter sets; only the BUY channel is configured.
# NOTE(review): the key names look like search-request parameters — confirm
# against the code that consumes CHANNELS.
CHANNELS = [
    {
        "channel": "BUY",
        "transactionType": "BUY",
        "sortType": "2",
    },
]
|