# perfect-postcode/finder/constants.py
# (file-viewer header: 191 lines, 7.1 KiB, Python)

import os
from pathlib import Path
# Filesystem anchors: FINDER_DIR is the directory that contains this module
# (resolved to an absolute path at import time) and REPO_DIR is its parent.
FINDER_DIR = Path(__file__).resolve().parent
REPO_DIR = FINDER_DIR.parent

# Data locations. Each one can be redirected through the environment variable
# of the same name; when the variable is unset the in-repo default is used.
DATA_DIR = Path(os.getenv("DATA_DIR", str(FINDER_DIR / "data")))
ARCGIS_PATH = Path(
    os.getenv(
        "ARCGIS_PATH",
        str(REPO_DIR / "property-data" / "arcgis_data.parquet"),
    )
)
# Search pagination and request pacing.
# NOTE(review): the delay values are presumably seconds — confirm against the
# code that sleeps on them.
PAGE_SIZE: int = 24
DELAY_BETWEEN_PAGES: float = 0.3
DELAY_BETWEEN_OUTCODES: float = 0.5

# Retry policy. RETRY_BASE_DELAY is presumably the base of a backoff schedule;
# the schedule itself is not defined in this module.
MAX_RETRIES: int = 3
RETRY_BASE_DELAY: float = 2.0

# Cell size, in degrees, of the postcode spatial index.
GRID_CELL_SIZE: float = 0.01

# Sanity cap: a bedroom count above this is almost certainly a parsing error.
MAX_BEDROOMS: int = 20
# --- Rightmove ---
RIGHTMOVE_BASE: str = "https://www.rightmove.co.uk"
TYPEAHEAD_URL: str = "https://los.rightmove.co.uk/typeahead"
SEARCH_URL: str = "https://www.rightmove.co.uk/api/property-search/listing/search"

# Detail page template; {id} is the numeric listing id from the search
# response. The page is a plain HTTPS GET (no Cloudflare) and its
# window.__PAGE_MODEL embeds propertyData.address.{outcode,incode}, which
# together form the property's TRUE full postcode — the search API exposes
# only the outcode.
RIGHTMOVE_DETAIL_URL: str = "https://www.rightmove.co.uk/properties/{id}"

# Because the search API returns only an outcode-level display address, the
# full postcode is recovered from each listing's detail page (see
# finder/rightmove.py::parse_detail_postcode). That costs one extra GET per
# listing on top of the ~1000-result-per-outcode search, so detail fetching is
# both switchable and capped per outcode (mirroring the ZOOPLA_* settings).
# Enabled by default.
RIGHTMOVE_FETCH_DETAILS: bool = True  # fetch detail pages for per-listing postcodes
RIGHTMOVE_MAX_DETAILS_PER_OUTCODE: int = 4000  # cap on detail fetches per outcode
# --- OnTheMarket ---
ONTHEMARKET_BASE: str = "https://www.onthemarket.com"

# --- Zoopla ---
ZOOPLA_BASE: str = "https://www.zoopla.co.uk"

# Zoopla search cards carry only an outcode-level address, so the full
# postcode and precise coordinates are scraped from each listing's detail
# page. The settings below bound that extra work (see finder/zoopla.py and
# finder/scraper.py).
ZOOPLA_FETCH_DETAILS: bool = True  # fetch detail pages for per-listing postcodes
ZOOPLA_MAX_DETAILS_PER_OUTCODE: int = 4000  # cap on detail fetches per outcode

# Timeout for a single detail-page navigation, in milliseconds.
# NOTE(review): 1_500_000 ms is 25 minutes per page — confirm this is
# intentional and not a typo for a much smaller value.
ZOOPLA_DETAIL_GOTO_TIMEOUT_MS: int = 1_500_000

# Share of one outcode's wall-clock budget (ZOOPLA_OUTCODE_TIMEOUT_SECONDS)
# that may be spent fetching details. The rest is reserved for search
# pagination so detail fetches can never trip the timeout and discard the
# listings already collected.
ZOOPLA_DETAIL_BUDGET_FRACTION: float = 0.6
# Gluetun VPN. Network endpoints are env-overridable because they are
# deployment-specific: when finder runs in a SEPARATE container they use the
# `gluetun` hostname (defaults below); when finder SHARES gluetun's network
# namespace (docker-compose.yml, network_mode container:media_gluetun) they
# become localhost and GLUETUN_PROXY is empty (the shared netns already tunnels
# all traffic, so no HTTP proxy is needed).
# GLUETUN_PROXY="" (empty) => direct connection (no proxy); used in shared-netns.
# The trailing `or None` is what turns that empty string into None.
GLUETUN_PROXY = os.environ.get("GLUETUN_PROXY", "http://gluetun:8888") or None
GLUETUN_CONTROL_URL = os.environ.get("GLUETUN_CONTROL_URL", "http://gluetun:8000")

# API key, presumably for the Gluetun control server at GLUETUN_CONTROL_URL.
# It is read from the GLUETUN_API_KEY environment variable, like the endpoints
# above, so a deployment can supply its own credential without editing source.
# SECURITY: the fallback literal is a credential committed to the repository.
# It is kept only so existing deployments that do not set the variable keep
# working. Rotate the key, provide the new one via GLUETUN_API_KEY, and then
# remove this fallback.
GLUETUN_API_KEY = os.environ.get(
    "GLUETUN_API_KEY", "My8AbvnKhfyFdRhpTVfoTfa5DkAMmg8K"
)

# Egress-IP rotations to try per Cloudflare challenge. Keep at 0 for Zoopla:
# rotating among Gluetun's datacenter IPs doesn't clear Cloudflare and would
# rotate away from the IP a cleared Cloudflare session was bound to, voiding it.
# Raise only with residential IPs where rotation helps.
GLUETUN_MAX_ROTATIONS = 0  # max egress-IP rotations per Cloudflare challenge
# Which fetcher Zoopla scraping uses (env: ZOOPLA_FETCHER):
#   "flaresolverr" (default) — solves Cloudflare through the FlareSolverr
#       sidecar (docker-compose.yml); needs no display/VNC and is verified to
#       return the RSC flight stream with postcode + coordinates.
#   "camoufox" — drives a local anti-fingerprint browser; needs an
#       interactive solve on datacenter IPs.
ZOOPLA_FETCHER = os.getenv("ZOOPLA_FETCHER", "flaresolverr")

# FlareSolverr endpoint (env: FLARESOLVERR_URL).
FLARESOLVERR_URL = os.getenv("FLARESOLVERR_URL", "http://gluetun:8191/v1")

# Per-request solve budget in milliseconds; generous because the first solve
# is slow.
FLARESOLVERR_MAX_TIMEOUT_MS: int = 120_000
# Postcode areas treated as "Greater London-ish". Broad area prefixes are used
# on purpose: a manual scrape then covers central/inner London plus the common
# outer-London and near-London outcodes without maintaining a long borough
# list.
# NOTE(review): some entries are string prefixes of others (e.g. "E" of "EC"
# and "EN") — confirm callers compare against the parsed postcode area rather
# than using a bare str.startswith.
LONDON_OUTCODE_PREFIXES = {
    "E", "EC", "N", "NW", "SE", "SW", "W", "WC",
    "BR", "CR", "DA", "EN", "HA", "IG",
    "KT", "RM", "SM", "TW", "UB", "WD",
}
# Maps a provider-supplied property-type label to one of five buckets:
# "Detached", "Semi-Detached", "Terraced", "Flats/Maisonettes" or "Other".
# Keys are matched exactly as written (case- and punctuation-sensitive), which
# is why several spellings of the same label appear.
# NOTE(review): "Farm House" maps to "Other" while "farm_house" maps to
# "Detached" — confirm whether that difference is intentional.
PROPERTY_TYPE_MAP = {
    # Title-case labels.
    "Detached": "Detached",
    "Semi-Detached": "Semi-Detached",
    "Terraced": "Terraced",
    "End of Terrace": "Terraced",
    "Mid Terrace": "Terraced",
    "Flat": "Flats/Maisonettes",
    "Maisonette": "Flats/Maisonettes",
    "Studio": "Flats/Maisonettes",
    "Apartment": "Flats/Maisonettes",
    "Penthouse": "Flats/Maisonettes",
    "Ground Flat": "Flats/Maisonettes",
    "Duplex": "Flats/Maisonettes",
    "Detached Bungalow": "Detached",
    "Semi-Detached Bungalow": "Semi-Detached",
    "Town House": "Terraced",
    "Link Detached": "Detached",
    "Link Detached House": "Detached",
    "Bungalow": "Other",
    "Cottage": "Other",
    "Park Home": "Other",
    "Mobile Home": "Other",
    "Caravan": "Other",
    "Lodge": "Other",
    "Land": "Other",
    "Farm / Barn": "Other",
    "Farm House": "Other",
    "House": "Detached",
    "House of Multiple Occupation": "Other",
    "House Share": "Other",
    "Not Specified": "Other",
    "Chalet": "Other",
    "Barn Conversion": "Other",
    "Coach House": "Other",
    "Character Property": "Other",
    "Cluster House": "Other",
    "Retirement Property": "Other",
    "Parking": "Other",
    "Plot": "Other",
    "Garages": "Other",
    "Mews": "Terraced",
    "Property": "Other",
    "Flat Share": "Other",
    "Block of Apartments": "Other",
    "Private Halls": "Other",
    "Terraced Bungalow": "Terraced",
    "Equestrian Facility": "Other",
    "Ground Maisonette": "Flats/Maisonettes",
    "Country House": "Detached",
    "Village House": "Detached",
    "Farm Land": "Other",
    "House Boat": "Other",
    "Barn": "Other",
    "Serviced Apartments": "Other",
    # Space-separated variants from legacy provider normalization.
    "Semi Detached": "Semi-Detached",
    "Semi Detached Bungalow": "Semi-Detached",
    "End Of Terrace": "Terraced",
    "End Terrace": "Terraced",
    "Block Of Apartments": "Other",
    # Lowercase variants from listing APIs.
    "house": "Detached",
    "bungalow": "Other",
    "townhouse": "Terraced",
    "land": "Other",
    "other": "Other",
    "not-specified": "Other",
    "retirement-property": "Other",
    "equestrian-facility": "Other",
    "flat": "Flats/Maisonettes",
    "detached": "Detached",
    "semi-detached": "Semi-Detached",
    "terraced": "Terraced",
    "maisonette": "Flats/Maisonettes",
    "apartment": "Flats/Maisonettes",
    "studio": "Flats/Maisonettes",
    "penthouse": "Flats/Maisonettes",
    "cottage": "Other",
    "chalet": "Other",
    "farm_house": "Detached",
    "country house": "Detached",
    "village house": "Detached",
}
# Search channel definitions; only the BUY channel is configured.
# NOTE(review): the meaning of sortType "2" is not visible in this module —
# confirm against the search API before changing it.
CHANNELS = [
    {
        "channel": "BUY",
        "transactionType": "BUY",
        "sortType": "2",
    },
]