More fixes
This commit is contained in:
parent
15fa09430b
commit
6b12e21d50
54 changed files with 1665 additions and 630 deletions
|
|
@ -5,9 +5,14 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
|||
WORKDIR /app
|
||||
COPY pyproject.toml ./
|
||||
RUN uv pip install --system -r pyproject.toml
|
||||
RUN playwright install --with-deps chromium
|
||||
RUN playwright install-deps firefox
|
||||
RUN camoufox fetch \
|
||||
&& python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)"
|
||||
|
||||
COPY *.py ./
|
||||
COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet
|
||||
|
||||
HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
|
||||
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:1234/health')"
|
||||
|
||||
CMD ["python3", "main.py"]
|
||||
|
|
|
|||
|
|
@ -31,6 +31,11 @@ SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in (
|
|||
"true",
|
||||
"yes",
|
||||
)
|
||||
SCRAPE_ZOOPLA = os.environ.get("SCRAPE_ZOOPLA", "true").lower() in (
|
||||
"1",
|
||||
"true",
|
||||
"yes",
|
||||
)
|
||||
|
||||
# URL to trigger server data reload after scrape (e.g. http://server:8001/api/reload)
|
||||
RELOAD_URL = os.environ.get("RELOAD_URL", "")
|
||||
|
|
@ -47,6 +52,9 @@ HOMECOUK_PER_PAGE = 30 # max supported by the API
|
|||
# OpenRent
|
||||
OPENRENT_BASE = "https://www.openrent.co.uk"
|
||||
|
||||
# Zoopla
|
||||
ZOOPLA_BASE = "https://www.zoopla.co.uk"
|
||||
|
||||
PROPERTY_TYPE_MAP = {
|
||||
"Detached": "Detached",
|
||||
"Semi-Detached": "Semi-Detached",
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ from constants import (
|
|||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
SCRAPE_RIGHTMOVE,
|
||||
SCRAPE_ZOOPLA,
|
||||
)
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from openrent import load_cookies as load_openrent_cookies
|
||||
|
|
@ -48,6 +49,16 @@ log.setLevel(logging.DEBUG)
|
|||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
||||
|
||||
|
||||
# Suppress noisy /metrics and /health request logs from werkzeug
|
||||
class _NoiseFilter(logging.Filter):
|
||||
def filter(self, record):
|
||||
msg = record.getMessage()
|
||||
return "GET /metrics" not in msg and "GET /health" not in msg
|
||||
|
||||
|
||||
logging.getLogger("werkzeug").addFilter(_NoiseFilter())
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Startup: load data
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -55,13 +66,14 @@ logging.getLogger("httpcore").setLevel(logging.WARNING)
|
|||
log.info("Loading arcgis data...")
|
||||
OUTCODES = load_outcodes()
|
||||
PC_INDEX = build_postcode_index()
|
||||
PC_COORDS = build_postcode_coords() if SCRAPE_OPENRENT else None
|
||||
PC_COORDS = build_postcode_coords() if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) else None
|
||||
log.info(
|
||||
"Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)",
|
||||
"Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s, zoopla=%s)",
|
||||
len(OUTCODES),
|
||||
SCRAPE_RIGHTMOVE,
|
||||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
SCRAPE_ZOOPLA,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -121,6 +133,11 @@ if SCHEDULE_HOUR >= 0:
|
|||
app = Flask(__name__)
|
||||
|
||||
|
||||
@app.route("/health")
def health():
    """Health endpoint: always replies with the body ``ok`` and status 200."""
    body, status_code = "ok", 200
    return body, status_code
|
||||
|
||||
|
||||
@app.route("/run", methods=["POST"])
|
||||
def trigger_run():
|
||||
if _start_scrape():
|
||||
|
|
@ -147,6 +164,7 @@ def get_status():
|
|||
"rightmove": status.rm_properties,
|
||||
"homecouk": status.hk_properties,
|
||||
"openrent": status.or_properties,
|
||||
"zoopla": status.zp_properties,
|
||||
},
|
||||
"errors": status.errors[-20:], # last 20 errors
|
||||
"elapsed_seconds": round(elapsed, 1),
|
||||
|
|
@ -167,8 +185,10 @@ def get_debug():
|
|||
"scrape_rightmove": SCRAPE_RIGHTMOVE,
|
||||
"scrape_homecouk": SCRAPE_HOMECOUK,
|
||||
"scrape_openrent": SCRAPE_OPENRENT,
|
||||
"scrape_zoopla": SCRAPE_ZOOPLA,
|
||||
"homecouk_cookies_available": hk_cookies is not None,
|
||||
"openrent_cookies_available": or_cookies is not None,
|
||||
"zoopla_note": "browser-based (Camoufox), no cookies needed",
|
||||
}
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -109,6 +109,28 @@ openrent_properties_scraped = Counter(
|
|||
["channel"],
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Counters — Zoopla
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
zoopla_pages_scraped = Counter(
|
||||
"zoopla_pages_scraped",
|
||||
"Search result pages scraped from Zoopla",
|
||||
["channel"],
|
||||
)
|
||||
|
||||
zoopla_errors_total = Counter(
|
||||
"zoopla_errors_total",
|
||||
"Zoopla scraping errors",
|
||||
["type"],
|
||||
)
|
||||
|
||||
zoopla_properties_scraped = Counter(
|
||||
"zoopla_properties_scraped",
|
||||
"Properties scraped from Zoopla (before dedup)",
|
||||
["channel"],
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Counters — FlareSolverr / cookie management
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -138,3 +160,8 @@ openrent_enabled = Gauge(
|
|||
"openrent_enabled",
|
||||
"Whether OpenRent scraping is currently active (1=yes, 0=no)",
|
||||
)
|
||||
|
||||
zoopla_enabled = Gauge(
|
||||
"zoopla_enabled",
|
||||
"Whether Zoopla scraping is currently active (1=yes, 0=no)",
|
||||
)
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ from constants import (
|
|||
SCRAPE_HOMECOUK,
|
||||
SCRAPE_OPENRENT,
|
||||
SCRAPE_RIGHTMOVE,
|
||||
SCRAPE_ZOOPLA,
|
||||
SEED,
|
||||
)
|
||||
from homecouk import CookiesExpiredError
|
||||
|
|
@ -35,12 +36,16 @@ from metrics import (
|
|||
scrape_outcodes_total,
|
||||
scrape_properties_total,
|
||||
scrape_state,
|
||||
zoopla_enabled,
|
||||
)
|
||||
from openrent import WafChallengeError
|
||||
from openrent import load_cookies as load_openrent_cookies
|
||||
from openrent import make_client as make_openrent_client
|
||||
from openrent import search_outcode as openrent_search_outcode
|
||||
from rightmove import resolve_outcode_id, search_outcode
|
||||
from zoopla import TurnstileError
|
||||
from zoopla import launch_browser as launch_zoopla_browser
|
||||
from zoopla import search_outcode as zoopla_search_outcode
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from storage import write_parquet
|
||||
|
||||
|
|
@ -60,6 +65,7 @@ class ScrapeStatus:
|
|||
rm_properties: int = 0
|
||||
hk_properties: int = 0
|
||||
or_properties: int = 0
|
||||
zp_properties: int = 0
|
||||
errors: list[str] = field(default_factory=list)
|
||||
started_at: float = 0.0
|
||||
finished_at: float = 0.0
|
||||
|
|
@ -93,6 +99,9 @@ def _sync_gauges() -> None:
|
|||
scrape_properties_total.labels(channel=ch, source="openrent").set(
|
||||
status.or_properties
|
||||
)
|
||||
scrape_properties_total.labels(channel=ch, source="zoopla").set(
|
||||
status.zp_properties
|
||||
)
|
||||
if status.started_at:
|
||||
end = status.finished_at if status.finished_at else time.time()
|
||||
scrape_elapsed_seconds.set(end - status.started_at)
|
||||
|
|
@ -191,7 +200,7 @@ def run_scrape(
|
|||
random.seed(SEED)
|
||||
random.shuffle(shuffled)
|
||||
|
||||
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT:
|
||||
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT and not SCRAPE_ZOOPLA:
|
||||
log.warning("All scrapers disabled — nothing to do")
|
||||
with status_lock:
|
||||
status.state = "done"
|
||||
|
|
@ -239,8 +248,27 @@ def run_scrape(
|
|||
)
|
||||
openrent_enabled.set(0)
|
||||
|
||||
# Build postcode coords if OpenRent is active and caller didn't provide them
|
||||
if or_client and pc_coords is None:
|
||||
# Zoopla: uses Camoufox browser (no cookies/client pattern)
|
||||
zp_browser = None
|
||||
zp_page = None
|
||||
zp_failed = False
|
||||
if not SCRAPE_ZOOPLA:
|
||||
log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)")
|
||||
zoopla_enabled.set(0)
|
||||
else:
|
||||
try:
|
||||
zp_browser, zp_page = launch_zoopla_browser()
|
||||
log.info("Zoopla scraping ENABLED (Camoufox browser launched)")
|
||||
zoopla_enabled.set(1)
|
||||
except TurnstileError:
|
||||
log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla")
|
||||
zoopla_enabled.set(0)
|
||||
except Exception as e:
|
||||
log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e)
|
||||
zoopla_enabled.set(0)
|
||||
|
||||
# Build postcode coords if OpenRent/Zoopla is active and caller didn't provide them
|
||||
if (or_client or zp_page) and pc_coords is None:
|
||||
pc_coords = build_postcode_coords()
|
||||
|
||||
try:
|
||||
|
|
@ -256,6 +284,8 @@ def run_scrape(
|
|||
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
|
||||
or_count = 0 # OpenRent properties this channel
|
||||
or_dedup_count = 0 # OpenRent skipped as cross-source duplicates
|
||||
zp_count = 0 # Zoopla properties this channel
|
||||
zp_dedup_count = 0 # Zoopla skipped as cross-source duplicates
|
||||
|
||||
with status_lock:
|
||||
status.channel = channel_name
|
||||
|
|
@ -264,6 +294,7 @@ def run_scrape(
|
|||
status.rm_properties = 0
|
||||
status.hk_properties = 0
|
||||
status.or_properties = 0
|
||||
status.zp_properties = 0
|
||||
|
||||
channel_start = time.time()
|
||||
prev_prop_milestone = 0 # last 10k milestone we logged
|
||||
|
|
@ -412,6 +443,63 @@ def run_scrape(
|
|||
with status_lock:
|
||||
status.errors.append(msg)
|
||||
|
||||
# --- Zoopla ---
|
||||
if zp_page and not zp_failed:
|
||||
made_requests = True
|
||||
try:
|
||||
zp_props = zoopla_search_outcode(
|
||||
zp_page,
|
||||
outcode,
|
||||
channel_name,
|
||||
pc_index,
|
||||
pc_coords,
|
||||
)
|
||||
for p in zp_props:
|
||||
pid = p["id"]
|
||||
key = _dedup_key(p)
|
||||
if pid in all_properties or key in seen_dedup_keys:
|
||||
zp_dedup_count += 1
|
||||
cross_source_dedup_total.labels(
|
||||
channel="buy" if channel_name == "BUY" else "rent",
|
||||
).inc()
|
||||
continue
|
||||
all_properties[pid] = p
|
||||
seen_dedup_keys.add(key)
|
||||
zp_count += 1
|
||||
if zp_props:
|
||||
log.info(
|
||||
"Zoopla %s: +%d properties", outcode, len(zp_props)
|
||||
)
|
||||
except TurnstileError:
|
||||
log.warning(
|
||||
"Zoopla Cloudflare challenge failed — attempting browser relaunch"
|
||||
)
|
||||
try:
|
||||
zp_browser.close()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
zp_browser, zp_page = launch_zoopla_browser()
|
||||
log.info("Zoopla browser relaunched, continuing")
|
||||
except Exception:
|
||||
log.warning(
|
||||
"Browser relaunch failed, disabling Zoopla for rest of scrape"
|
||||
)
|
||||
zp_page = None
|
||||
zp_browser = None
|
||||
zp_failed = True
|
||||
zoopla_enabled.set(0)
|
||||
with status_lock:
|
||||
status.errors.append(
|
||||
"Zoopla Cloudflare challenge failed and browser relaunch failed"
|
||||
)
|
||||
except Exception as e:
|
||||
msg = f"Error scraping Zoopla {outcode}/{channel_name}: {e}"
|
||||
log.error(msg)
|
||||
scrape_errors_total.labels(source="zoopla").inc()
|
||||
with status_lock:
|
||||
status.errors.append(msg)
|
||||
|
||||
with status_lock:
|
||||
if channel_name == "BUY":
|
||||
status.properties_buy = len(all_properties)
|
||||
|
|
@ -420,6 +508,7 @@ def run_scrape(
|
|||
status.rm_properties = rm_count
|
||||
status.hk_properties = hk_count
|
||||
status.or_properties = or_count
|
||||
status.zp_properties = zp_count
|
||||
_sync_gauges()
|
||||
|
||||
# Log progress every 100 outcodes
|
||||
|
|
@ -444,12 +533,13 @@ def run_scrape(
|
|||
if current_milestone > prev_prop_milestone:
|
||||
prev_prop_milestone = current_milestone
|
||||
log.info(
|
||||
"%s %dk properties (rm: %d, hk: %d, or: %d) at outcode %d/%d [%s]",
|
||||
"%s %dk properties (rm: %d, hk: %d, or: %d, zp: %d) at outcode %d/%d [%s]",
|
||||
channel_name,
|
||||
current_milestone * 10,
|
||||
rm_count,
|
||||
hk_count,
|
||||
or_count,
|
||||
zp_count,
|
||||
done,
|
||||
len(shuffled),
|
||||
_fmt_elapsed(elapsed),
|
||||
|
|
@ -472,13 +562,14 @@ def run_scrape(
|
|||
_sync_gauges()
|
||||
|
||||
log.info(
|
||||
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===",
|
||||
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, zp: %d, cross-dedup: %d) ===",
|
||||
channel_name,
|
||||
len(deduped),
|
||||
rm_count,
|
||||
hk_count,
|
||||
or_count,
|
||||
hk_dedup_count + or_dedup_count,
|
||||
zp_count,
|
||||
hk_dedup_count + or_dedup_count + zp_dedup_count,
|
||||
)
|
||||
|
||||
with status_lock:
|
||||
|
|
@ -525,3 +616,8 @@ def run_scrape(
|
|||
hk_client.close()
|
||||
if or_client:
|
||||
or_client.close()
|
||||
if zp_browser:
|
||||
try:
|
||||
zp_browser.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -25,7 +25,11 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
if fvd:
|
||||
try:
|
||||
dt = datetime.fromisoformat(fvd.replace("Z", "+00:00"))
|
||||
listing_dates.append(dt.replace(tzinfo=None))
|
||||
# Convert to UTC naive datetime for consistent storage
|
||||
if dt.tzinfo is not None:
|
||||
from datetime import timezone
|
||||
dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
|
||||
listing_dates.append(dt)
|
||||
except (ValueError, TypeError):
|
||||
listing_dates.append(None)
|
||||
else:
|
||||
|
|
|
|||
520
finder/zoopla.py
Normal file
520
finder/zoopla.py
Normal file
|
|
@ -0,0 +1,520 @@
|
|||
"""Zoopla (zoopla.co.uk) scraper — buy and rental properties.
|
||||
|
||||
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
|
||||
blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
|
||||
patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
|
||||
|
||||
Zoopla uses Next.js App Router with React Server Components (RSC). Search
|
||||
result data is server-rendered in an RSC stream, not available via
|
||||
__NEXT_DATA__ or a JSON API. URL-based location slugs return 0 results —
|
||||
the working flow requires typing into the autocomplete input, selecting a
|
||||
suggestion, and clicking Search.
|
||||
|
||||
Architecture:
|
||||
Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps
|
||||
a single Camoufox browser alive for the entire scrape. For each outcode, it:
|
||||
1. Clears and types the outcode into the search input
|
||||
2. Selects the first autocomplete suggestion
|
||||
3. Clicks Search
|
||||
4. Extracts listing data from the rendered DOM
|
||||
5. Handles pagination via ?pn=N parameter
|
||||
|
||||
The browser session replaces the cookie/client pattern used by other scrapers.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
|
||||
from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
||||
from spatial import PostcodeSpatialIndex
|
||||
|
||||
log = logging.getLogger("zoopla")
|
||||
|
||||
|
||||
class TurnstileError(Exception):
    """Signals that a Cloudflare Turnstile challenge could not be passed."""
|
||||
|
||||
|
||||
# Maximum search result pages to scrape per outcode (25 listings/page)
|
||||
MAX_PAGES_PER_OUTCODE = 10
|
||||
|
||||
# JavaScript to extract listings from the rendered DOM.
|
||||
# Finds all detail links, walks up to the card container, and parses
|
||||
# price, beds, baths, floor area, address, and tenure from the card text.
|
||||
_EXTRACT_LISTINGS_JS = r"""() => {
|
||||
const links = Array.from(document.querySelectorAll(
|
||||
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
|
||||
));
|
||||
|
||||
const seen = new Set();
|
||||
const results = [];
|
||||
|
||||
for (const link of links) {
|
||||
const href = link.href;
|
||||
const match = href.match(/\/details\/(\d+)\//);
|
||||
if (!match) continue;
|
||||
|
||||
const id = match[1];
|
||||
if (seen.has(id)) continue;
|
||||
seen.add(id);
|
||||
|
||||
// Walk up to the listing card container
|
||||
let card = link;
|
||||
for (let j = 0; j < 10; j++) {
|
||||
card = card.parentElement;
|
||||
if (!card) break;
|
||||
const text = card.innerText || '';
|
||||
if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!card) continue;
|
||||
|
||||
const text = card.innerText || '';
|
||||
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
||||
|
||||
const priceMatch = text.match(/\u00a3([\d,]+)/);
|
||||
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
||||
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
||||
const recMatch = text.match(/(\d+)\s*reception/i);
|
||||
const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i);
|
||||
|
||||
let address = '';
|
||||
for (const line of lines) {
|
||||
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
|
||||
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
|
||||
address = line;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let tenure = '';
|
||||
if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
|
||||
results.push({
|
||||
id: id,
|
||||
url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
address: address,
|
||||
tenure: tenure,
|
||||
});
|
||||
}
|
||||
|
||||
return results;
|
||||
}"""
|
||||
|
||||
# JavaScript to dismiss the Usercentrics cookie consent overlay (shadow DOM).
# Evaluated in the page via page.evaluate(). Behaviour:
#   * Looks up the element with id "usercentrics-cmp-ui".
#   * If it has a shadow root, clicks the first button whose text contains
#     "Accept" and returns true.
#   * Otherwise (no shadow root, or no such button) removes the element from
#     the DOM and returns true.
#   * Returns false when the element is not present at all.
_DISMISS_COOKIES_JS = """() => {
    const aside = document.querySelector('#usercentrics-cmp-ui');
    if (aside && aside.shadowRoot) {
        const btns = aside.shadowRoot.querySelectorAll('button');
        for (const btn of btns) {
            if (btn.innerText.includes('Accept')) { btn.click(); return true; }
        }
    }
    if (aside) { aside.remove(); return true; }
    return false;
}"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Browser lifecycle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def launch_browser():
    """Launch Camoufox, open the Zoopla homepage and get past the entry hurdles.

    Steps: verify the Camoufox binary is already installed, start a headless
    browser, load the homepage, wait for the Cloudflare "Just a moment" title
    to disappear (clicking the Turnstile iframe when one is present), then
    dismiss the cookie-consent overlay.

    Returns:
        A ``(browser, page)`` tuple. The caller must close the browser.

    Raises:
        TurnstileError: the page title still contains "Just a moment" after
            20 checks spaced 3 seconds apart.
        Exception: anything raised by Camoufox/Playwright while launching or
            navigating. The browser is closed before the error propagates, so
            a failed launch does not leave a browser process behind.
    """
    from camoufox.pkgman import camoufox_path

    # Verify camoufox is pre-installed — never download at runtime
    camoufox_path(download_if_missing=False)

    from camoufox.sync_api import Camoufox

    log.info("Launching Camoufox browser for Zoopla...")
    # NOTE(review): the context manager is entered by hand and its __exit__ is
    # never invoked (callers only call browser.close()). Confirm whether that
    # leaves the underlying Playwright driver running.
    browser = Camoufox(headless=True).__enter__()

    try:
        page = browser.new_page()

        log.info("Navigating to Zoopla homepage...")
        page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)

        # Wait for Cloudflare Turnstile to resolve: 20 polls x 3 s = 60 s.
        # Try clicking the Turnstile checkbox if present (helps in some cases).
        for _ in range(20):
            if "Just a moment" not in page.title():
                break
            # Attempt to click the Turnstile checkbox in the challenge iframe
            for frame in page.frames:
                if "challenges.cloudflare.com" in frame.url:
                    try:
                        iframe_el = page.query_selector('iframe[src*="challenges.cloudflare"]')
                        if iframe_el:
                            box = iframe_el.bounding_box()
                            if box:
                                page.mouse.click(box["x"] + 30, box["y"] + box["height"] / 2)
                    except Exception:
                        # Best-effort click; the poll loop keeps waiting anyway.
                        pass
                    break
            time.sleep(3)
        else:
            raise TurnstileError("Cloudflare Turnstile did not resolve after 60s")

        log.info("Cloudflare passed — title: %s", page.title())
        time.sleep(2)

        # Dismiss cookie consent
        page.evaluate(_DISMISS_COOKIES_JS)
        time.sleep(1)
    except BaseException:
        # Any failure after launch (including the Turnstile timeout) must not
        # leak the browser; a close() failure must not mask the real error.
        try:
            browser.close()
        except Exception:
            log.debug("Ignoring error while closing Zoopla browser", exc_info=True)
        raise

    return browser, page
|
||||
|
||||
|
||||
def _ensure_not_challenged(page) -> None:
|
||||
"""Check if current page is a Cloudflare challenge and wait/raise."""
|
||||
if "Just a moment" not in page.title():
|
||||
return
|
||||
|
||||
log.warning("Cloudflare challenge detected mid-session, waiting...")
|
||||
for i in range(20):
|
||||
time.sleep(3)
|
||||
if "Just a moment" not in page.title():
|
||||
log.info("Cloudflare challenge resolved")
|
||||
return
|
||||
|
||||
raise TurnstileError("Cloudflare re-challenge did not resolve")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Search navigation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _navigate_search(page, outcode: str, channel: str) -> bool:
    """Drive the homepage search form for one outcode.

    Loads the homepage, dismisses the cookie overlay, switches to the Rent tab
    when ``channel == "RENT"``, types the outcode into the search box, picks
    the first autocomplete option and submits the search.

    Returns:
        True once the search has been submitted and the post-search challenge
        check has passed; False when the search input or an autocomplete
        suggestion could not be found.

    Raises:
        TurnstileError: Cloudflare challenged the session and did not clear.
    """
    # Start from the homepage so earlier search state cannot leak in
    homepage = f"{ZOOPLA_BASE}/"
    page.goto(homepage, wait_until="domcontentloaded", timeout=30000)
    time.sleep(2)
    _ensure_not_challenged(page)

    # The consent overlay may come back after navigation
    page.evaluate(_DISMISS_COOKIES_JS)
    time.sleep(1)

    # Buy is left as the page default; only Rent needs a tab click
    if channel == "RENT":
        tab = page.query_selector(
            'button:has-text("Rent"), [role="tab"]:has-text("Rent")'
        )
        if tab:
            tab.click()
            time.sleep(0.5)

    # Locate the search box, falling back to any text input
    box = page.query_selector('input[name="autosuggest-input"]')
    if not box:
        box = page.query_selector('input[type="text"]')
    if not box:
        log.warning("Could not find search input on homepage")
        return False

    box.click()
    time.sleep(0.3)
    box.fill("")
    box.type(outcode, delay=60)
    time.sleep(2)

    # Take the first autocomplete suggestion
    suggestion = page.query_selector('[role="option"]')
    if not suggestion:
        log.debug("No autocomplete suggestions for outcode %s", outcode)
        return False

    suggestion.click()
    time.sleep(0.5)

    # Submit: prefer the Search button, otherwise press Enter in the box
    submit = page.query_selector('button:has-text("Search")')
    if not submit:
        box.press("Enter")
    else:
        submit.click()

    # Give the results page time to render
    time.sleep(6)
    _ensure_not_challenged(page)

    return True
|
||||
|
||||
|
||||
def _get_result_count(page) -> int:
|
||||
"""Extract the total results count from the page body text."""
|
||||
try:
|
||||
body = page.inner_text("body")
|
||||
match = re.search(r"([\d,]+)\s+results?", body)
|
||||
if match:
|
||||
return int(match.group(1).replace(",", ""))
|
||||
except Exception:
|
||||
pass
|
||||
return 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extraction and pagination
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _extract_listings(page) -> list[dict]:
    """Run the listing-extraction script on *page* and return its result.

    On any error the failure is logged, the ``extract_failed`` error counter
    is incremented and an empty list is returned.
    """
    try:
        listings = page.evaluate(_EXTRACT_LISTINGS_JS)
    except Exception as exc:
        log.warning("Failed to extract listings from DOM: %s", exc)
        zoopla_errors_total.labels(type="extract_failed").inc()
        listings = []
    return listings
|
||||
|
||||
|
||||
def _paginate(page, total_results: int, channel: str) -> list[dict]:
    """Extract listings from all pages of search results.

    Page 1 is assumed to be loaded already. Further pages are fetched by
    navigating to the current URL with its ``pn`` query parameter set to the
    page number, up to ``MAX_PAGES_PER_OUTCODE``.

    Pagination stops early when a navigation fails, a page yields no listings,
    or a page yields only listings that were already seen.

    Returns:
        Raw listing dicts, de-duplicated by ``id`` across pages.

    Raises:
        TurnstileError: Cloudflare challenged a page load and did not clear.
    """
    all_listings = _extract_listings(page)
    channel_label = "buy" if channel == "BUY" else "rent"
    zoopla_pages_scraped.labels(channel=channel_label).inc()

    if not all_listings or total_results <= len(all_listings):
        return all_listings

    seen_ids = {listing["id"] for listing in all_listings}
    current_url = page.url
    page_num = 2

    while len(all_listings) < total_results and page_num <= MAX_PAGES_PER_OUTCODE:
        time.sleep(DELAY_BETWEEN_PAGES)

        next_url = _with_page_number(current_url, page_num)

        try:
            page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
            time.sleep(4)
            _ensure_not_challenged(page)
        except TurnstileError:
            # Let the caller decide whether to relaunch the browser
            raise
        except Exception as e:
            log.debug("Pagination navigation failed at page %d: %s", page_num, e)
            break

        page_listings = _extract_listings(page)
        if not page_listings:
            break

        # Deduplicate within this outcode
        new_count = 0
        for listing in page_listings:
            if listing["id"] not in seen_ids:
                seen_ids.add(listing["id"])
                all_listings.append(listing)
                new_count += 1

        zoopla_pages_scraped.labels(channel=channel_label).inc()

        if new_count == 0:
            break  # No new listings on this page

        page_num += 1

    return all_listings


def _with_page_number(url: str, page_num: int) -> str:
    """Return *url* with its ``pn`` query parameter replaced by *page_num*.

    Other query parameters are kept verbatim and any ``#fragment`` stays at
    the end, so the result is a well-formed URL even when ``pn`` was the first
    parameter.
    """
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(url)
    kept = [
        param
        for param in parts.query.split("&")
        if param and not param.startswith("pn=")
    ]
    kept.append(f"pn={page_num}")
    return urlunsplit(parts._replace(query="&".join(kept)))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property transformation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _extract_postcode(text: str) -> str | None:
|
||||
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'."""
|
||||
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1).upper().strip()
|
||||
return None
|
||||
|
||||
|
||||
def _extract_outcode(text: str) -> str | None:
|
||||
"""Extract a UK outcode from address text like 'Whitechapel Road, London E1'."""
|
||||
# Look for outcode at end of string or after last comma
|
||||
match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*$", text.strip(), re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1).upper()
|
||||
# Try after comma
|
||||
parts = text.split(",")
|
||||
if len(parts) > 1:
|
||||
last = parts[-1].strip()
|
||||
match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1).upper()
|
||||
return None
|
||||
|
||||
|
||||
def _map_property_type(raw_type: str | None) -> str:
|
||||
"""Map Zoopla property type text to canonical type."""
|
||||
if not raw_type:
|
||||
return "Other"
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type)
|
||||
if canonical:
|
||||
return canonical
|
||||
lower = raw_type.lower()
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
|
||||
return "Flats/Maisonettes"
|
||||
if "detached" in lower and "semi" not in lower:
|
||||
return "Detached"
|
||||
if "semi" in lower:
|
||||
return "Semi-Detached"
|
||||
if "terrace" in lower or "mews" in lower:
|
||||
return "Terraced"
|
||||
if "house" in lower:
|
||||
return "Detached"
|
||||
return "Other"
|
||||
|
||||
|
||||
def transform_property(
    raw: dict,
    channel: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
) -> dict | None:
    """Transform a raw Zoopla listing dict into the standard output schema.

    Zoopla search cards do not include coordinates, so lat/lng are resolved
    from the postcode found in the address text:

    1. the full postcode, looked up as extracted and then in canonical
       single-space form;
    2. failing that, the first ``pc_coords`` entry in the same outcode. The
       outcode is taken from the end of the address, or from the extracted
       postcode when the address ends in a full postcode that is not in
       ``pc_coords``.

    Args:
        raw: Listing dict as produced by the DOM extraction script.
        channel: "BUY" or anything else (treated as rent, priced monthly).
        pc_index: Unused here; kept for signature parity with the caller.
        pc_coords: Postcode -> (lat, lng) mapping.
            NOTE(review): keys are presumed to be "OUTCODE INCODE" with a
            single space, as the outcode prefix scan implies — confirm.

    Returns:
        The normalised property dict, or None when the listing has no price,
        cannot be located, or falls outside the coordinate bounding box.
    """
    price = raw.get("price")
    if not price:
        return None

    address = raw.get("address", "")

    # Resolve postcode and coordinates from address
    postcode = _extract_postcode(address)
    lat = lng = None
    outcode_from_postcode = None

    if postcode:
        # The incode is always the last three characters of a full postcode
        compact = "".join(postcode.split())
        canonical = f"{compact[:-3]} {compact[-3:]}"
        outcode_from_postcode = compact[:-3]
        for candidate in (postcode, canonical):
            coords = pc_coords.get(candidate)
            if coords:
                postcode = candidate
                lat, lng = coords
                break

    if lat is None:
        # Try outcode-level fallback. An address ending in a full postcode
        # never yields a trailing outcode, so fall back to the one embedded in
        # the extracted postcode.
        outcode = _extract_outcode(address) or outcode_from_postcode
        if outcode:
            prefix = outcode + " "
            # NOTE(review): linear scan of pc_coords per listing — consider a
            # precomputed outcode -> postcode map if this shows up in profiles.
            for pcd, coords in pc_coords.items():
                if pcd.startswith(prefix):
                    postcode = pcd
                    lat, lng = coords
                    break

    if lat is None or lng is None or not postcode:
        return None

    # Sanity-check coordinates against a rough bounding box
    # (latitude 49..56, longitude -7..2)
    if not (49 <= lat <= 56 and -7 <= lng <= 2):
        return None

    bedrooms = raw.get("beds") or 0
    bathrooms = raw.get("baths") or 0
    receptions = raw.get("receptions") or 0

    # Floor area: convert sq ft to sq m
    floor_area_sqm = None
    sqft = raw.get("floor_area_sqft")
    if sqft:
        floor_area_sqm = round(sqft * 0.092903, 1)

    listing_id = raw.get("id", "")
    listing_url = raw.get("url", "")
    if listing_url and not listing_url.startswith("http"):
        listing_url = ZOOPLA_BASE + listing_url

    return {
        "id": f"zp_{listing_id}",
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        "Number of bedrooms & living rooms": bedrooms + receptions,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Address per Property Register": address,
        "Leasehold/Freehold": raw.get("tenure") or None,
        "Property type": "Other",  # Not reliably extractable from Zoopla search cards
        "Property sub-type": "",
        "price": int(price),
        "price_frequency": "" if channel == "BUY" else "monthly",
        "Price qualifier": "",
        "Total floor area (sqm)": floor_area_sqm,
        "Listing URL": listing_url,
        "Listing features": [],
        "first_visible_date": "",
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Top-level search function (called by scraper.py)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def search_outcode(
    page,
    outcode: str,
    channel: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
) -> list[dict]:
    """Scrape Zoopla for a single outcode and return standardised properties.

    Uses a live Camoufox page (as returned by ``launch_browser``): runs the
    search flow, reads the reported result count, collects raw listings across
    result pages and converts each one to the standard output schema. Listings
    that cannot be transformed are dropped; every kept listing increments the
    per-channel scraped-properties counter.

    Returns an empty list when the search could not be run, reported zero
    results, or produced no listings.

    Raises TurnstileError if Cloudflare blocks us mid-session.
    """
    if not _navigate_search(page, outcode, channel):
        return []

    expected = _get_result_count(page)
    listings = _paginate(page, expected, channel) if expected != 0 else []
    if not listings:
        return []

    label = "buy" if channel == "BUY" else "rent"
    results = []
    for item in listings:
        prop = transform_property(item, channel, pc_index, pc_coords)
        if not prop:
            continue
        results.append(prop)
        zoopla_properties_scraped.labels(channel=label).inc()

    return results
|
||||
Loading…
Add table
Add a link
Reference in a new issue