Fmt
This commit is contained in:
parent
479ef92236
commit
c38d654ac7
44 changed files with 2526 additions and 701 deletions
|
|
@ -16,9 +16,21 @@ SCHEDULE_HOUR = int(os.environ.get("SCHEDULE_HOUR", "3"))
|
|||
# Whether to run a scrape immediately on startup
|
||||
RUN_ON_STARTUP = os.environ.get("RUN_ON_STARTUP", "").lower() in ("1", "true", "yes")
|
||||
# Enable/disable individual sources
|
||||
SCRAPE_RIGHTMOVE = os.environ.get("SCRAPE_RIGHTMOVE", "true").lower() in ("1", "true", "yes")
|
||||
SCRAPE_HOMECOUK = os.environ.get("SCRAPE_HOMECOUK", "true").lower() in ("1", "true", "yes")
|
||||
SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in ("1", "true", "yes")
|
||||
SCRAPE_RIGHTMOVE = os.environ.get("SCRAPE_RIGHTMOVE", "true").lower() in (
|
||||
"1",
|
||||
"true",
|
||||
"yes",
|
||||
)
|
||||
SCRAPE_HOMECOUK = os.environ.get("SCRAPE_HOMECOUK", "true").lower() in (
|
||||
"1",
|
||||
"true",
|
||||
"yes",
|
||||
)
|
||||
SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in (
|
||||
"1",
|
||||
"true",
|
||||
"yes",
|
||||
)
|
||||
|
||||
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
||||
SEARCH_URL = "https://www.rightmove.co.uk/api/property-search/listing/search"
|
||||
|
|
|
|||
|
|
@ -86,7 +86,8 @@ def solve_cloudflare() -> tuple[dict[str, str], str] | None:
|
|||
|
||||
log.info(
|
||||
"Cloudflare solved — got %d cookies, UA: %s",
|
||||
len(cookies), user_agent[:60],
|
||||
len(cookies),
|
||||
user_agent[:60],
|
||||
)
|
||||
flaresolverr_attempts_total.labels(result="success").inc()
|
||||
return cookies, user_agent
|
||||
|
|
@ -129,11 +130,13 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session:
|
|||
Uses Chrome TLS impersonation so cf_clearance cookies (which are bound
|
||||
to Chrome's JA3 fingerprint from FlareSolverr) remain valid."""
|
||||
session = Session(impersonate="chrome")
|
||||
session.headers.update({
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "application/json, text/plain, */*",
|
||||
"x-requested-with": "XMLHttpRequest",
|
||||
})
|
||||
session.headers.update(
|
||||
{
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "application/json, text/plain, */*",
|
||||
"x-requested-with": "XMLHttpRequest",
|
||||
}
|
||||
)
|
||||
# Laravel CSRF: the XSRF-TOKEN cookie value must also be sent as the
|
||||
# X-XSRF-TOKEN request header (URL-decoded). Without this header, the
|
||||
# server rejects every request with 419/403.
|
||||
|
|
@ -165,7 +168,11 @@ def fetch_page(
|
|||
return resp.json()
|
||||
except json.JSONDecodeError:
|
||||
homecouk_errors_total.labels(type="json_decode").inc()
|
||||
log.error("Non-JSON response from %s (got %s)", url, resp.headers.get("content-type", "?"))
|
||||
log.error(
|
||||
"Non-JSON response from %s (got %s)",
|
||||
url,
|
||||
resp.headers.get("content-type", "?"),
|
||||
)
|
||||
return None
|
||||
if resp.status_code == 403:
|
||||
raise CookiesExpiredError("HTTP 403 — cookies likely expired")
|
||||
|
|
@ -173,7 +180,11 @@ def fetch_page(
|
|||
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
|
||||
log.warning(
|
||||
"HTTP %d from %s, retry %d/%d in %.1fs",
|
||||
resp.status_code, url, attempt + 1, max_retries, delay,
|
||||
resp.status_code,
|
||||
url,
|
||||
attempt + 1,
|
||||
max_retries,
|
||||
delay,
|
||||
)
|
||||
time.sleep(delay)
|
||||
continue
|
||||
|
|
@ -186,7 +197,11 @@ def fetch_page(
|
|||
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
|
||||
log.warning(
|
||||
"%s from %s, retry %d/%d in %.1fs",
|
||||
type(e).__name__, url, attempt + 1, max_retries, delay,
|
||||
type(e).__name__,
|
||||
url,
|
||||
attempt + 1,
|
||||
max_retries,
|
||||
delay,
|
||||
)
|
||||
time.sleep(delay)
|
||||
homecouk_errors_total.labels(type="retry_exhausted").inc()
|
||||
|
|
@ -218,7 +233,12 @@ def map_property_type(raw_type: str | None) -> str:
|
|||
# Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc.
|
||||
# Try common patterns
|
||||
lower = raw_type.lower()
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
|
||||
if (
|
||||
"flat" in lower
|
||||
or "apartment" in lower
|
||||
or "maisonette" in lower
|
||||
or "studio" in lower
|
||||
):
|
||||
return "Flats/Maisonettes"
|
||||
if "detached" in lower and "semi" not in lower:
|
||||
return "Detached"
|
||||
|
|
@ -231,7 +251,9 @@ def map_property_type(raw_type: str | None) -> str:
|
|||
|
||||
|
||||
def transform_property(
|
||||
prop: dict, channel: str, pc_index: PostcodeSpatialIndex,
|
||||
prop: dict,
|
||||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
) -> dict | None:
|
||||
"""Transform a raw home.co.uk property dict into our output schema."""
|
||||
lat = prop.get("latitude")
|
||||
|
|
|
|||
|
|
@ -11,7 +11,9 @@ from metrics import http_errors_total, http_requests_total, ip_rotations_total
|
|||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
_ua = UserAgent(browsers=["Chrome", "Edge"], os=["Windows", "Mac OS X"], min_version=120.0)
|
||||
_ua = UserAgent(
|
||||
browsers=["Chrome", "Edge"], os=["Windows", "Mac OS X"], min_version=120.0
|
||||
)
|
||||
|
||||
|
||||
def _endpoint_label(url: str) -> str:
|
||||
|
|
@ -27,6 +29,7 @@ def _status_label(code: int) -> str:
|
|||
return "5xx"
|
||||
return str(code)
|
||||
|
||||
|
||||
# Gluetun control API — runs on port 8000 inside the gluetun container.
|
||||
# Since finder uses network_mode: service:gluetun, localhost IS gluetun.
|
||||
GLUETUN_API = "http://127.0.0.1:8000"
|
||||
|
|
@ -42,17 +45,25 @@ def rotate_ip() -> bool:
|
|||
# Get current IP
|
||||
with httpx.Client(timeout=10) as ctl:
|
||||
old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
|
||||
old_ip = old_ip_resp.json().get("public_ip", "unknown") if old_ip_resp.status_code == 200 else "unknown"
|
||||
old_ip = (
|
||||
old_ip_resp.json().get("public_ip", "unknown")
|
||||
if old_ip_resp.status_code == 200
|
||||
else "unknown"
|
||||
)
|
||||
log.info("Current IP: %s", old_ip)
|
||||
|
||||
# Trigger server change — stop the VPN, then start it again so gluetun reconnects (presumably to a different server; confirm against gluetun docs)
|
||||
resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"})
|
||||
resp = ctl.put(
|
||||
f"{GLUETUN_API}/v1/vpn/status", json={"status": "stopped"}
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
log.error("Failed to stop VPN: %d %s", resp.status_code, resp.text)
|
||||
return False
|
||||
time.sleep(2)
|
||||
|
||||
resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"})
|
||||
resp = ctl.put(
|
||||
f"{GLUETUN_API}/v1/vpn/status", json={"status": "running"}
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
log.error("Failed to start VPN: %d %s", resp.status_code, resp.text)
|
||||
return False
|
||||
|
|
@ -99,7 +110,9 @@ def fetch_with_retry(
|
|||
for attempt in range(MAX_RETRIES):
|
||||
try:
|
||||
resp = client.get(url, params=params)
|
||||
http_requests_total.labels(status=_status_label(resp.status_code), endpoint=endpoint).inc()
|
||||
http_requests_total.labels(
|
||||
status=_status_label(resp.status_code), endpoint=endpoint
|
||||
).inc()
|
||||
if resp.status_code == 200:
|
||||
return resp.json()
|
||||
if resp.status_code == 403 and on_403:
|
||||
|
|
@ -111,15 +124,34 @@ def fetch_with_retry(
|
|||
return None
|
||||
if resp.status_code in (429, 500, 502, 503, 504):
|
||||
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
|
||||
log.warning("HTTP %d from %s, retry %d/%d in %.1fs", resp.status_code, url, attempt + 1, MAX_RETRIES, delay)
|
||||
log.warning(
|
||||
"HTTP %d from %s, retry %d/%d in %.1fs",
|
||||
resp.status_code,
|
||||
url,
|
||||
attempt + 1,
|
||||
MAX_RETRIES,
|
||||
delay,
|
||||
)
|
||||
time.sleep(delay)
|
||||
continue
|
||||
log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
|
||||
return None
|
||||
except (httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout, httpx.PoolTimeout) as e:
|
||||
except (
|
||||
httpx.ConnectError,
|
||||
httpx.ReadTimeout,
|
||||
httpx.WriteTimeout,
|
||||
httpx.PoolTimeout,
|
||||
) as e:
|
||||
http_errors_total.labels(type=type(e).__name__).inc()
|
||||
delay = RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)
|
||||
log.warning("%s from %s, retry %d/%d in %.1fs", type(e).__name__, url, attempt + 1, MAX_RETRIES, delay)
|
||||
log.warning(
|
||||
"%s from %s, retry %d/%d in %.1fs",
|
||||
type(e).__name__,
|
||||
url,
|
||||
attempt + 1,
|
||||
MAX_RETRIES,
|
||||
delay,
|
||||
)
|
||||
time.sleep(delay)
|
||||
http_errors_total.labels(type="retry_exhausted").inc()
|
||||
log.error("All %d retries exhausted for %s", MAX_RETRIES, url)
|
||||
|
|
|
|||
|
|
@ -7,7 +7,14 @@ from pathlib import Path
|
|||
from flask import Flask, Response, jsonify, send_from_directory
|
||||
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
|
||||
|
||||
from constants import DATA_DIR, RUN_ON_STARTUP, SCHEDULE_HOUR, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE
|
||||
from constants import (
|
||||
DATA_DIR,
|
||||
RUN_ON_STARTUP,
|
||||
SCHEDULE_HOUR,
|
||||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
SCRAPE_RIGHTMOVE,
|
||||
)
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from openrent import load_cookies as load_openrent_cookies
|
||||
from rightmove import outcode_cache
|
||||
|
|
@ -49,8 +56,13 @@ log.info("Loading arcgis data...")
|
|||
OUTCODES = load_outcodes()
|
||||
PC_INDEX = build_postcode_index()
|
||||
PC_COORDS = build_postcode_coords() if SCRAPE_OPENRENT else None
|
||||
log.info("Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)",
|
||||
len(OUTCODES), SCRAPE_RIGHTMOVE, SCRAPE_HOMECOUK, SCRAPE_OPENRENT)
|
||||
log.info(
|
||||
"Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)",
|
||||
len(OUTCODES),
|
||||
SCRAPE_RIGHTMOVE,
|
||||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scheduler
|
||||
|
|
@ -63,7 +75,9 @@ def _start_scrape() -> bool:
|
|||
if status.state == "running":
|
||||
return False
|
||||
status.state = "running"
|
||||
thread = threading.Thread(target=run_scrape, args=(OUTCODES, PC_INDEX, PC_COORDS), daemon=True)
|
||||
thread = threading.Thread(
|
||||
target=run_scrape, args=(OUTCODES, PC_INDEX, PC_COORDS), daemon=True
|
||||
)
|
||||
thread.start()
|
||||
return True
|
||||
|
||||
|
|
@ -82,7 +96,9 @@ def _scheduler_loop() -> None:
|
|||
log.info("Scheduler active — will run daily at %02d:00 UTC", SCHEDULE_HOUR)
|
||||
while True:
|
||||
wait = _seconds_until(SCHEDULE_HOUR)
|
||||
log.info("Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600)
|
||||
log.info(
|
||||
"Next scheduled scrape in %.0f seconds (%.1f hours)", wait, wait / 3600
|
||||
)
|
||||
time.sleep(wait)
|
||||
log.info("Scheduled scrape triggered")
|
||||
if not _start_scrape():
|
||||
|
|
@ -144,15 +160,17 @@ def get_status():
|
|||
def get_debug():
|
||||
hk_cookies = load_homecouk_cookies() if SCRAPE_HOMECOUK else None
|
||||
or_cookies = load_openrent_cookies() if SCRAPE_OPENRENT else None
|
||||
return jsonify({
|
||||
"outcode_cache_size": len(outcode_cache),
|
||||
"outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
|
||||
"scrape_rightmove": SCRAPE_RIGHTMOVE,
|
||||
"scrape_homecouk": SCRAPE_HOMECOUK,
|
||||
"scrape_openrent": SCRAPE_OPENRENT,
|
||||
"homecouk_cookies_available": hk_cookies is not None,
|
||||
"openrent_cookies_available": or_cookies is not None,
|
||||
})
|
||||
return jsonify(
|
||||
{
|
||||
"outcode_cache_size": len(outcode_cache),
|
||||
"outcode_cache_sample": dict(list(outcode_cache.items())[:20]),
|
||||
"scrape_rightmove": SCRAPE_RIGHTMOVE,
|
||||
"scrape_homecouk": SCRAPE_HOMECOUK,
|
||||
"scrape_openrent": SCRAPE_OPENRENT,
|
||||
"homecouk_cookies_available": hk_cookies is not None,
|
||||
"openrent_cookies_available": or_cookies is not None,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@app.route("/metrics")
|
||||
|
|
|
|||
|
|
@ -79,7 +79,8 @@ def solve_waf() -> tuple[dict[str, str], str] | None:
|
|||
if "AwsWafIntegration" in content:
|
||||
log.info("Got WAF challenge page, waiting for resolution...")
|
||||
page.wait_for_selector(
|
||||
"a.pli, .pli, .search-property-card", timeout=30000,
|
||||
"a.pli, .pli, .search-property-card",
|
||||
timeout=30000,
|
||||
)
|
||||
|
||||
raw_cookies = context.cookies()
|
||||
|
|
@ -94,7 +95,8 @@ def solve_waf() -> tuple[dict[str, str], str] | None:
|
|||
|
||||
log.info(
|
||||
"AWS WAF solved — got %d cookies, UA: %s",
|
||||
len(cookies), user_agent[:60],
|
||||
len(cookies),
|
||||
user_agent[:60],
|
||||
)
|
||||
flaresolverr_attempts_total.labels(result="success").inc()
|
||||
return cookies, user_agent
|
||||
|
|
@ -130,11 +132,13 @@ def make_client(cookies: dict[str, str], user_agent: str) -> Session:
|
|||
"""Create a curl_cffi Session configured for OpenRent.
|
||||
Uses Chrome TLS impersonation so AWS WAF cookies remain valid."""
|
||||
session = Session(impersonate="chrome")
|
||||
session.headers.update({
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-GB,en;q=0.9",
|
||||
})
|
||||
session.headers.update(
|
||||
{
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-GB,en;q=0.9",
|
||||
}
|
||||
)
|
||||
for name, value in cookies.items():
|
||||
session.cookies.set(name, value, domain="openrent.co.uk")
|
||||
return session
|
||||
|
|
@ -152,7 +156,9 @@ def _status_label(code: int) -> str:
|
|||
|
||||
|
||||
def fetch_page(
|
||||
client: Session, url: str, max_retries: int = 3,
|
||||
client: Session,
|
||||
url: str,
|
||||
max_retries: int = 3,
|
||||
) -> str | None:
|
||||
"""GET HTML with retries on 429/5xx. Returns None on permanent failure.
|
||||
WAF challenge (202 or 403 with challenge JS) raises WafChallengeError."""
|
||||
|
|
@ -165,17 +171,25 @@ def fetch_page(
|
|||
html = resp.text
|
||||
# Detect WAF challenge page masquerading as 200
|
||||
if "AwsWafIntegration" in html and "challenge.js" in html:
|
||||
raise WafChallengeError("Got AWS WAF challenge page — cookies expired")
|
||||
raise WafChallengeError(
|
||||
"Got AWS WAF challenge page — cookies expired"
|
||||
)
|
||||
return html
|
||||
|
||||
if resp.status_code in (202, 403):
|
||||
raise WafChallengeError(f"HTTP {resp.status_code} — cookies likely expired")
|
||||
raise WafChallengeError(
|
||||
f"HTTP {resp.status_code} — cookies likely expired"
|
||||
)
|
||||
|
||||
if resp.status_code in (429, 500, 502, 503, 504):
|
||||
delay = RETRY_BASE_DELAY * (2 ** attempt)
|
||||
delay = RETRY_BASE_DELAY * (2**attempt)
|
||||
log.warning(
|
||||
"HTTP %d from %s, retry %d/%d in %.1fs",
|
||||
resp.status_code, url, attempt + 1, max_retries, delay,
|
||||
resp.status_code,
|
||||
url,
|
||||
attempt + 1,
|
||||
max_retries,
|
||||
delay,
|
||||
)
|
||||
time.sleep(delay)
|
||||
continue
|
||||
|
|
@ -187,10 +201,14 @@ def fetch_page(
|
|||
raise
|
||||
except RequestsError as e:
|
||||
openrent_errors_total.labels(type=type(e).__name__).inc()
|
||||
delay = RETRY_BASE_DELAY * (2 ** attempt)
|
||||
delay = RETRY_BASE_DELAY * (2**attempt)
|
||||
log.warning(
|
||||
"%s from %s, retry %d/%d in %.1fs",
|
||||
type(e).__name__, url, attempt + 1, max_retries, delay,
|
||||
type(e).__name__,
|
||||
url,
|
||||
attempt + 1,
|
||||
max_retries,
|
||||
delay,
|
||||
)
|
||||
time.sleep(delay)
|
||||
|
||||
|
|
@ -247,7 +265,9 @@ def _extract_bedrooms_from_title(title: str) -> int | None:
|
|||
return None
|
||||
|
||||
|
||||
def _extract_beds_baths_from_features(feature_items: list) -> tuple[int | None, int | None]:
|
||||
def _extract_beds_baths_from_features(
|
||||
feature_items: list,
|
||||
) -> tuple[int | None, int | None]:
|
||||
"""Extract bedrooms and bathrooms from feature list items.
|
||||
|
||||
OpenRent search cards have <ul> with items like:
|
||||
|
|
@ -442,11 +462,7 @@ def parse_search_results(html: str) -> list[dict]:
|
|||
# --- Coordinates from data attributes (may not be present on cards) ---
|
||||
for el in [card] + card.select("[data-lat], [data-latitude]"):
|
||||
lat = el.get("data-lat") or el.get("data-latitude")
|
||||
lng = (
|
||||
el.get("data-lng")
|
||||
or el.get("data-longitude")
|
||||
or el.get("data-lon")
|
||||
)
|
||||
lng = el.get("data-lng") or el.get("data-longitude") or el.get("data-lon")
|
||||
if lat and lng:
|
||||
try:
|
||||
prop["lat"] = float(lat)
|
||||
|
|
@ -543,9 +559,7 @@ def parse_property_detail(html: str) -> dict:
|
|||
break
|
||||
|
||||
# --- Description for floor area ---
|
||||
desc_el = soup.select_one(
|
||||
".description, [class*='description'], #description"
|
||||
)
|
||||
desc_el = soup.select_one(".description, [class*='description'], #description")
|
||||
if desc_el:
|
||||
details["description"] = desc_el.get_text(strip=True)
|
||||
|
||||
|
|
@ -567,7 +581,12 @@ def map_property_type(raw_type: str | None) -> str:
|
|||
lower = raw_type.lower()
|
||||
if "room" in lower or "shared" in lower:
|
||||
return "Other"
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
|
||||
if (
|
||||
"flat" in lower
|
||||
or "apartment" in lower
|
||||
or "maisonette" in lower
|
||||
or "studio" in lower
|
||||
):
|
||||
return "Flats/Maisonettes"
|
||||
if "detached" in lower and "semi" not in lower:
|
||||
return "Detached"
|
||||
|
|
@ -647,7 +666,8 @@ def transform_property(
|
|||
elif search_data.get("outcode"):
|
||||
# No spatial index — try outcode lookup as fallback
|
||||
outcode_pcs = _resolve_outcode_postcodes(
|
||||
search_data["outcode"], pc_coords,
|
||||
search_data["outcode"],
|
||||
pc_coords,
|
||||
)
|
||||
if outcode_pcs:
|
||||
postcode = outcode_pcs[0]
|
||||
|
|
@ -708,7 +728,8 @@ def transform_property(
|
|||
|
||||
prop_id = search_data.get("id", "")
|
||||
listing_url = search_data.get(
|
||||
"url", f"{OPENRENT_BASE}/{prop_id}" if prop_id else "",
|
||||
"url",
|
||||
f"{OPENRENT_BASE}/{prop_id}" if prop_id else "",
|
||||
)
|
||||
description = detail.get("description") or search_data.get("description", "")
|
||||
|
||||
|
|
@ -775,7 +796,10 @@ def search_outcode(
|
|||
time.sleep(DELAY_BETWEEN_PAGES * 0.5)
|
||||
|
||||
transformed = transform_property(
|
||||
search_data, detail_data, pc_index, pc_coords,
|
||||
search_data,
|
||||
detail_data,
|
||||
pc_index,
|
||||
pc_coords,
|
||||
)
|
||||
if transformed:
|
||||
properties.append(transformed)
|
||||
|
|
|
|||
|
|
@ -24,7 +24,9 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
|||
if outcode in outcode_cache:
|
||||
return outcode_cache[outcode]
|
||||
|
||||
data = fetch_with_retry(client, TYPEAHEAD_URL, {"query": outcode, "limit": "10", "exclude": "STREET"})
|
||||
data = fetch_with_retry(
|
||||
client, TYPEAHEAD_URL, {"query": outcode, "limit": "10", "exclude": "STREET"}
|
||||
)
|
||||
if not data:
|
||||
return None
|
||||
|
||||
|
|
@ -61,7 +63,12 @@ def search_outcode(
|
|||
|
||||
data = fetch_with_retry(client, SEARCH_URL, params)
|
||||
if not data:
|
||||
log.warning("Failed to fetch index %d for %s/%s", index, outcode, channel_cfg["channel"])
|
||||
log.warning(
|
||||
"Failed to fetch index %d for %s/%s",
|
||||
index,
|
||||
outcode,
|
||||
channel_cfg["channel"],
|
||||
)
|
||||
break
|
||||
|
||||
raw_props = data.get("properties", [])
|
||||
|
|
|
|||
|
|
@ -6,7 +6,16 @@ from dataclasses import dataclass, field
|
|||
|
||||
import polars as pl
|
||||
|
||||
from constants import ARCGIS_PATH, CHANNELS, DATA_DIR, DELAY_BETWEEN_OUTCODES, SCRAPE_HOMECOUK, SCRAPE_OPENRENT, SCRAPE_RIGHTMOVE, SEED
|
||||
from constants import (
|
||||
ARCGIS_PATH,
|
||||
CHANNELS,
|
||||
DATA_DIR,
|
||||
DELAY_BETWEEN_OUTCODES,
|
||||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
SCRAPE_RIGHTMOVE,
|
||||
SEED,
|
||||
)
|
||||
from homecouk import CookiesExpiredError
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from homecouk import make_client as make_homecouk_client
|
||||
|
|
@ -64,13 +73,23 @@ def _sync_gauges() -> None:
|
|||
scrape_outcodes_done.set(status.outcodes_done)
|
||||
scrape_outcodes_total.set(status.outcodes_total)
|
||||
# Total properties (all sources combined)
|
||||
scrape_properties_total.labels(channel="buy", source="total").set(status.properties_buy)
|
||||
scrape_properties_total.labels(channel="rent", source="total").set(status.properties_rent)
|
||||
scrape_properties_total.labels(channel="buy", source="total").set(
|
||||
status.properties_buy
|
||||
)
|
||||
scrape_properties_total.labels(channel="rent", source="total").set(
|
||||
status.properties_rent
|
||||
)
|
||||
# Per-source breakdown for current channel
|
||||
ch = "buy" if status.channel == "BUY" else "rent"
|
||||
scrape_properties_total.labels(channel=ch, source="rightmove").set(status.rm_properties)
|
||||
scrape_properties_total.labels(channel=ch, source="homecouk").set(status.hk_properties)
|
||||
scrape_properties_total.labels(channel=ch, source="openrent").set(status.or_properties)
|
||||
scrape_properties_total.labels(channel=ch, source="rightmove").set(
|
||||
status.rm_properties
|
||||
)
|
||||
scrape_properties_total.labels(channel=ch, source="homecouk").set(
|
||||
status.hk_properties
|
||||
)
|
||||
scrape_properties_total.labels(channel=ch, source="openrent").set(
|
||||
status.or_properties
|
||||
)
|
||||
if status.started_at:
|
||||
end = status.finished_at if status.finished_at else time.time()
|
||||
scrape_elapsed_seconds.set(end - status.started_at)
|
||||
|
|
@ -86,7 +105,9 @@ def load_outcodes() -> list[str]:
|
|||
log.info("England postcodes: %d", len(england))
|
||||
|
||||
outcodes = (
|
||||
england.select(pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode"))
|
||||
england.select(
|
||||
pl.col("pcd").str.extract(r"^([A-Z]{1,2}\d[A-Z0-9]?)", 1).alias("outcode")
|
||||
)
|
||||
.drop_nulls()
|
||||
.get_column("outcode")
|
||||
.unique()
|
||||
|
|
@ -101,7 +122,9 @@ def build_postcode_index() -> PostcodeSpatialIndex:
|
|||
"""Build spatial index from arcgis England postcodes."""
|
||||
log.info("Building postcode spatial index from %s", ARCGIS_PATH)
|
||||
df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
|
||||
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])
|
||||
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(
|
||||
subset=["lat", "long"]
|
||||
)
|
||||
return PostcodeSpatialIndex(
|
||||
england.get_column("lat").to_list(),
|
||||
england.get_column("long").to_list(),
|
||||
|
|
@ -114,7 +137,9 @@ def build_postcode_coords() -> dict[str, tuple[float, float]]:
|
|||
Used by OpenRent scraper to resolve coordinates from postcodes."""
|
||||
log.info("Building postcode coords lookup from %s", ARCGIS_PATH)
|
||||
df = pl.read_parquet(ARCGIS_PATH, columns=["pcd", "ctry", "lat", "long"])
|
||||
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(subset=["lat", "long"])
|
||||
england = df.filter(pl.col("ctry") == "E92000001").drop_nulls(
|
||||
subset=["lat", "long"]
|
||||
)
|
||||
coords: dict[str, tuple[float, float]] = {}
|
||||
for pcd, lat, lng in zip(
|
||||
england.get_column("pcd").to_list(),
|
||||
|
|
@ -179,7 +204,9 @@ def run_scrape(
|
|||
log.info("home.co.uk scraping ENABLED")
|
||||
homecouk_enabled.set(1)
|
||||
else:
|
||||
log.info("home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)")
|
||||
log.info(
|
||||
"home.co.uk scraping DISABLED (need FlareSolverr or HOMECOUK_CF_CLEARANCE + HOMECOUK_SESSION)"
|
||||
)
|
||||
homecouk_enabled.set(0)
|
||||
|
||||
# OpenRent: must be enabled via SCRAPE_OPENRENT + cookies available
|
||||
|
|
@ -195,7 +222,9 @@ def run_scrape(
|
|||
log.info("OpenRent scraping ENABLED")
|
||||
openrent_enabled.set(1)
|
||||
else:
|
||||
log.info("OpenRent scraping DISABLED (need FlareSolverr or OPENRENT_WAF_TOKEN)")
|
||||
log.info(
|
||||
"OpenRent scraping DISABLED (need FlareSolverr or OPENRENT_WAF_TOKEN)"
|
||||
)
|
||||
openrent_enabled.set(0)
|
||||
|
||||
# Build postcode coords if OpenRent is active and caller didn't provide them
|
||||
|
|
@ -207,7 +236,9 @@ def run_scrape(
|
|||
channel_name = channel_cfg["channel"]
|
||||
file_suffix = "buy" if channel_name == "BUY" else "rent"
|
||||
all_properties: dict[str, dict] = {} # dedup by id
|
||||
seen_dedup_keys: set[tuple] = set() # cross-source dedup by (postcode, beds, price)
|
||||
seen_dedup_keys: set[tuple] = (
|
||||
set()
|
||||
) # cross-source dedup by (postcode, beds, price)
|
||||
rm_count = 0 # Rightmove properties this channel
|
||||
hk_count = 0 # home.co.uk properties this channel
|
||||
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
|
||||
|
|
@ -222,15 +253,22 @@ def run_scrape(
|
|||
status.hk_properties = 0
|
||||
status.or_properties = 0
|
||||
|
||||
log.info("=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled))
|
||||
log.info(
|
||||
"=== Starting %s channel (%d outcodes) ===", channel_name, len(shuffled)
|
||||
)
|
||||
|
||||
for i, outcode in enumerate(shuffled):
|
||||
with status_lock:
|
||||
status.outcode = outcode
|
||||
status.outcodes_done = i
|
||||
|
||||
log.debug("Outcode %s (%d/%d) — %d properties so far",
|
||||
outcode, i + 1, len(shuffled), len(all_properties))
|
||||
log.debug(
|
||||
"Outcode %s (%d/%d) — %d properties so far",
|
||||
outcode,
|
||||
i + 1,
|
||||
len(shuffled),
|
||||
len(all_properties),
|
||||
)
|
||||
|
||||
made_requests = False
|
||||
|
||||
|
|
@ -240,9 +278,13 @@ def run_scrape(
|
|||
try:
|
||||
outcode_id = resolve_outcode_id(client, outcode)
|
||||
if not outcode_id:
|
||||
log.debug("No Rightmove ID for outcode %s, skipping", outcode)
|
||||
log.debug(
|
||||
"No Rightmove ID for outcode %s, skipping", outcode
|
||||
)
|
||||
else:
|
||||
props = search_outcode(client, outcode_id, outcode, channel_cfg, pc_index)
|
||||
props = search_outcode(
|
||||
client, outcode_id, outcode, channel_cfg, pc_index
|
||||
)
|
||||
for p in props:
|
||||
pid = p["id"]
|
||||
if pid not in all_properties:
|
||||
|
|
@ -261,7 +303,10 @@ def run_scrape(
|
|||
made_requests = True
|
||||
try:
|
||||
hk_props = homecouk_search_outcode(
|
||||
hk_client, outcode, channel_name, pc_index,
|
||||
hk_client,
|
||||
outcode,
|
||||
channel_name,
|
||||
pc_index,
|
||||
)
|
||||
for p in hk_props:
|
||||
pid = p["id"]
|
||||
|
|
@ -276,9 +321,13 @@ def run_scrape(
|
|||
seen_dedup_keys.add(key)
|
||||
hk_count += 1
|
||||
if hk_props:
|
||||
log.info("home.co.uk %s: +%d properties", outcode, len(hk_props))
|
||||
log.info(
|
||||
"home.co.uk %s: +%d properties", outcode, len(hk_props)
|
||||
)
|
||||
except CookiesExpiredError:
|
||||
log.warning("home.co.uk cookies expired — attempting refresh via FlareSolverr")
|
||||
log.warning(
|
||||
"home.co.uk cookies expired — attempting refresh via FlareSolverr"
|
||||
)
|
||||
hk_client.close()
|
||||
hk_result = load_homecouk_cookies()
|
||||
if hk_result:
|
||||
|
|
@ -286,13 +335,17 @@ def run_scrape(
|
|||
log.info("home.co.uk cookies refreshed, continuing")
|
||||
cookie_refreshes_total.labels(result="success").inc()
|
||||
else:
|
||||
log.warning("Cookie refresh failed, disabling home.co.uk for rest of scrape")
|
||||
log.warning(
|
||||
"Cookie refresh failed, disabling home.co.uk for rest of scrape"
|
||||
)
|
||||
hk_client = None
|
||||
hk_failed = True
|
||||
homecouk_enabled.set(0)
|
||||
cookie_refreshes_total.labels(result="failure").inc()
|
||||
with status_lock:
|
||||
status.errors.append("home.co.uk cookies expired and refresh failed")
|
||||
status.errors.append(
|
||||
"home.co.uk cookies expired and refresh failed"
|
||||
)
|
||||
except Exception as e:
|
||||
msg = f"Error scraping home.co.uk {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
|
|
@ -305,7 +358,10 @@ def run_scrape(
|
|||
made_requests = True
|
||||
try:
|
||||
or_props = openrent_search_outcode(
|
||||
or_client, outcode, pc_index, pc_coords,
|
||||
or_client,
|
||||
outcode,
|
||||
pc_index,
|
||||
pc_coords,
|
||||
)
|
||||
for p in or_props:
|
||||
pid = p["id"]
|
||||
|
|
@ -318,9 +374,13 @@ def run_scrape(
|
|||
seen_dedup_keys.add(key)
|
||||
or_count += 1
|
||||
if or_props:
|
||||
log.info("OpenRent %s: +%d properties", outcode, len(or_props))
|
||||
log.info(
|
||||
"OpenRent %s: +%d properties", outcode, len(or_props)
|
||||
)
|
||||
except WafChallengeError:
|
||||
log.warning("OpenRent WAF cookies expired — attempting refresh via FlareSolverr")
|
||||
log.warning(
|
||||
"OpenRent WAF cookies expired — attempting refresh via FlareSolverr"
|
||||
)
|
||||
or_client.close()
|
||||
or_result = load_openrent_cookies()
|
||||
if or_result:
|
||||
|
|
@ -328,13 +388,17 @@ def run_scrape(
|
|||
log.info("OpenRent cookies refreshed, continuing")
|
||||
cookie_refreshes_total.labels(result="success").inc()
|
||||
else:
|
||||
log.warning("Cookie refresh failed, disabling OpenRent for rest of scrape")
|
||||
log.warning(
|
||||
"Cookie refresh failed, disabling OpenRent for rest of scrape"
|
||||
)
|
||||
or_client = None
|
||||
or_failed = True
|
||||
openrent_enabled.set(0)
|
||||
cookie_refreshes_total.labels(result="failure").inc()
|
||||
with status_lock:
|
||||
status.errors.append("OpenRent WAF cookies expired and refresh failed")
|
||||
status.errors.append(
|
||||
"OpenRent WAF cookies expired and refresh failed"
|
||||
)
|
||||
except Exception as e:
|
||||
msg = f"Error scraping OpenRent {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
|
|
@ -352,8 +416,14 @@ def run_scrape(
|
|||
status.or_properties = or_count
|
||||
_sync_gauges()
|
||||
|
||||
log.info("Outcode %s: total %d (rm: %d, hk: %d, or: %d)",
|
||||
outcode, len(all_properties), rm_count, hk_count, or_count)
|
||||
log.info(
|
||||
"Outcode %s: total %d (rm: %d, hk: %d, or: %d)",
|
||||
outcode,
|
||||
len(all_properties),
|
||||
rm_count,
|
||||
hk_count,
|
||||
or_count,
|
||||
)
|
||||
|
||||
if made_requests and i < len(shuffled) - 1:
|
||||
time.sleep(DELAY_BETWEEN_OUTCODES)
|
||||
|
|
@ -373,7 +443,11 @@ def run_scrape(
|
|||
|
||||
log.info(
|
||||
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===",
|
||||
channel_name, len(deduped), rm_count, hk_count, or_count,
|
||||
channel_name,
|
||||
len(deduped),
|
||||
rm_count,
|
||||
hk_count,
|
||||
or_count,
|
||||
hk_dedup_count + or_dedup_count,
|
||||
)
|
||||
|
||||
|
|
@ -382,8 +456,12 @@ def run_scrape(
|
|||
status.finished_at = time.time()
|
||||
_sync_gauges()
|
||||
elapsed = status.finished_at - status.started_at
|
||||
log.info("Scrape complete in %.0fs — buy: %d, rent: %d",
|
||||
elapsed, status.properties_buy, status.properties_rent)
|
||||
log.info(
|
||||
"Scrape complete in %.0fs — buy: %d, rent: %d",
|
||||
elapsed,
|
||||
status.properties_buy,
|
||||
status.properties_rent,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
log.exception("Fatal scrape error")
|
||||
|
|
|
|||
|
|
@ -11,12 +11,16 @@ class PostcodeSpatialIndex:
|
|||
"""Grid-based spatial index over arcgis postcodes for nearest-lookup."""
|
||||
|
||||
def __init__(self, lats: list[float], lngs: list[float], postcodes: list[str]):
|
||||
self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(list)
|
||||
self.grid: dict[tuple[int, int], list[tuple[float, float, str]]] = defaultdict(
|
||||
list
|
||||
)
|
||||
for lat, lng, pcd in zip(lats, lngs, postcodes):
|
||||
gx = int(math.floor(lng / GRID_CELL_SIZE))
|
||||
gy = int(math.floor(lat / GRID_CELL_SIZE))
|
||||
self.grid[(gx, gy)].append((lat, lng, pcd))
|
||||
log.info("Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats))
|
||||
log.info(
|
||||
"Postcode spatial index: %d cells, %d postcodes", len(self.grid), len(lats)
|
||||
)
|
||||
|
||||
def nearest(self, lat: float, lng: float) -> str | None:
|
||||
gx = int(math.floor(lng / GRID_CELL_SIZE))
|
||||
|
|
|
|||
|
|
@ -60,9 +60,7 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
"Property type": [p["Property type"] for p in properties],
|
||||
"Property sub-type": [p["Property sub-type"] for p in properties],
|
||||
"Price qualifier": [p["Price qualifier"] for p in properties],
|
||||
"Total floor area (sqm)": [
|
||||
p["Total floor area (sqm)"] for p in properties
|
||||
],
|
||||
"Total floor area (sqm)": [p["Total floor area (sqm)"] for p in properties],
|
||||
"Listing URL": [p["Listing URL"] for p in properties],
|
||||
"Listing features": [p["Listing features"] for p in properties],
|
||||
"Listing date": listing_dates,
|
||||
|
|
|
|||
|
|
@ -51,9 +51,19 @@ def fix_coords(lat: float, lng: float) -> tuple[float, float]:
|
|||
if 49 <= lat <= 56 and -7 <= lng <= 2:
|
||||
return lat, lng
|
||||
if 49 <= lng <= 56 and -7 <= lat <= 2:
|
||||
log.debug("Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f", lat, lng, lng, lat)
|
||||
log.debug(
|
||||
"Swapping reversed coords: lat=%.4f lng=%.4f → lat=%.4f lng=%.4f",
|
||||
lat,
|
||||
lng,
|
||||
lng,
|
||||
lat,
|
||||
)
|
||||
return lng, lat
|
||||
log.warning("Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f", lat, lng)
|
||||
log.warning(
|
||||
"Coords outside England bounds even after swap attempt: lat=%.4f lng=%.4f",
|
||||
lat,
|
||||
lng,
|
||||
)
|
||||
return lat, lng
|
||||
|
||||
|
||||
|
|
@ -66,7 +76,9 @@ def normalize_price(amount: int, frequency: str) -> int:
|
|||
return amount
|
||||
|
||||
|
||||
def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex) -> dict | None:
|
||||
def transform_property(
|
||||
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
|
||||
) -> dict | None:
|
||||
"""Transform a raw Rightmove property dict into our output schema."""
|
||||
loc = prop.get("location")
|
||||
if not loc:
|
||||
|
|
@ -86,13 +98,19 @@ def transform_property(prop: dict, outcode: str, pc_index: PostcodeSpatialIndex)
|
|||
price = normalize_price(int(amount), frequency)
|
||||
|
||||
display_prices = price_obj.get("displayPrices", [])
|
||||
price_qualifier = display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
|
||||
price_qualifier = (
|
||||
display_prices[0].get("displayPriceQualifier", "") if display_prices else ""
|
||||
)
|
||||
|
||||
sub_type = prop.get("propertySubType", "")
|
||||
bedrooms = prop.get("bedrooms", 0) or 0
|
||||
bathrooms = prop.get("bathrooms", 0) or 0
|
||||
|
||||
key_features = [kf.get("description", "") for kf in prop.get("keyFeatures", []) if kf.get("description")]
|
||||
key_features = [
|
||||
kf.get("description", "")
|
||||
for kf in prop.get("keyFeatures", [])
|
||||
if kf.get("description")
|
||||
]
|
||||
|
||||
postcode = pc_index.nearest(lat, lng)
|
||||
if not postcode:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue