More fixes

This commit is contained in:
Andras Schmelczer 2026-03-18 22:46:08 +00:00
parent 15fa09430b
commit 6b12e21d50
54 changed files with 1665 additions and 630 deletions

View file

@ -5,9 +5,14 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
WORKDIR /app
COPY pyproject.toml ./
RUN uv pip install --system -r pyproject.toml
RUN playwright install --with-deps chromium
RUN playwright install-deps firefox
RUN camoufox fetch \
&& python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)"
COPY *.py ./
COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet
HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:1234/health')"
CMD ["python3", "main.py"]

View file

@ -31,6 +31,11 @@ SCRAPE_OPENRENT = os.environ.get("SCRAPE_OPENRENT", "true").lower() in (
"true",
"yes",
)
# Zoopla scraping toggle. Reads the SCRAPE_ZOOPLA environment variable
# (default "true") and is True only when its lower-cased value is "1",
# "true" or "yes"; any other value disables Zoopla.
SCRAPE_ZOOPLA = os.environ.get("SCRAPE_ZOOPLA", "true").lower() in (
"1",
"true",
"yes",
)
# URL to trigger server data reload after scrape (e.g. http://server:8001/api/reload)
RELOAD_URL = os.environ.get("RELOAD_URL", "")
@ -47,6 +52,9 @@ HOMECOUK_PER_PAGE = 30 # max supported by the API
# OpenRent
OPENRENT_BASE = "https://www.openrent.co.uk"
# Zoopla
ZOOPLA_BASE = "https://www.zoopla.co.uk"
PROPERTY_TYPE_MAP = {
"Detached": "Detached",
"Semi-Detached": "Semi-Detached",

View file

@ -14,6 +14,7 @@ from constants import (
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
SCRAPE_RIGHTMOVE,
SCRAPE_ZOOPLA,
)
from homecouk import load_cookies as load_homecouk_cookies
from openrent import load_cookies as load_openrent_cookies
@ -48,6 +49,16 @@ log.setLevel(logging.DEBUG)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
# Suppress noisy /metrics and /health request logs from werkzeug
class _NoiseFilter(logging.Filter):
    """Logging filter that hides records mentioning GET /metrics or GET /health."""

    def filter(self, record):
        """Return False (drop the record) when its message names a probe endpoint."""
        rendered = record.getMessage()
        if "GET /metrics" in rendered:
            return False
        return "GET /health" not in rendered
logging.getLogger("werkzeug").addFilter(_NoiseFilter())
# ---------------------------------------------------------------------------
# Startup: load data
# ---------------------------------------------------------------------------
@ -55,13 +66,14 @@ logging.getLogger("httpcore").setLevel(logging.WARNING)
log.info("Loading arcgis data...")
OUTCODES = load_outcodes()
PC_INDEX = build_postcode_index()
PC_COORDS = build_postcode_coords() if SCRAPE_OPENRENT else None
PC_COORDS = build_postcode_coords() if (SCRAPE_OPENRENT or SCRAPE_ZOOPLA) else None
log.info(
"Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s)",
"Ready — %d outcodes, postcode index built (rightmove=%s, homecouk=%s, openrent=%s, zoopla=%s)",
len(OUTCODES),
SCRAPE_RIGHTMOVE,
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
SCRAPE_ZOOPLA,
)
# ---------------------------------------------------------------------------
@ -121,6 +133,11 @@ if SCHEDULE_HOUR >= 0:
app = Flask(__name__)
@app.route("/health")
def health():
    """Liveness probe: always answers with the body "ok" and HTTP status 200."""
    status_code = 200
    return "ok", status_code
@app.route("/run", methods=["POST"])
def trigger_run():
if _start_scrape():
@ -147,6 +164,7 @@ def get_status():
"rightmove": status.rm_properties,
"homecouk": status.hk_properties,
"openrent": status.or_properties,
"zoopla": status.zp_properties,
},
"errors": status.errors[-20:], # last 20 errors
"elapsed_seconds": round(elapsed, 1),
@ -167,8 +185,10 @@ def get_debug():
"scrape_rightmove": SCRAPE_RIGHTMOVE,
"scrape_homecouk": SCRAPE_HOMECOUK,
"scrape_openrent": SCRAPE_OPENRENT,
"scrape_zoopla": SCRAPE_ZOOPLA,
"homecouk_cookies_available": hk_cookies is not None,
"openrent_cookies_available": or_cookies is not None,
"zoopla_note": "browser-based (Camoufox), no cookies needed",
}
)

View file

@ -109,6 +109,28 @@ openrent_properties_scraped = Counter(
["channel"],
)
# ---------------------------------------------------------------------------
# Counters — Zoopla
# ---------------------------------------------------------------------------
# Incremented for the first results page of every paginated search and for
# each further page that yielded listings. The "channel" label is "buy" or
# "rent".
zoopla_pages_scraped = Counter(
"zoopla_pages_scraped",
"Search result pages scraped from Zoopla",
["channel"],
)
# Zoopla-specific failures, broken down by the "type" label. The only value
# emitted by the visible scraper code is "extract_failed" (DOM extraction
# raised).
zoopla_errors_total = Counter(
"zoopla_errors_total",
"Zoopla scraping errors",
["type"],
)
# Incremented once per listing that was successfully transformed into the
# standard schema, i.e. before the scraper's cross-source de-duplication.
# The "channel" label is "buy" or "rent".
zoopla_properties_scraped = Counter(
"zoopla_properties_scraped",
"Properties scraped from Zoopla (before dedup)",
["channel"],
)
# ---------------------------------------------------------------------------
# Counters — FlareSolverr / cookie management
# ---------------------------------------------------------------------------
@ -138,3 +160,8 @@ openrent_enabled = Gauge(
"openrent_enabled",
"Whether OpenRent scraping is currently active (1=yes, 0=no)",
)
# Set to 1 once the Zoopla browser has launched successfully, and to 0 when
# Zoopla is disabled by configuration or its browser launch/relaunch fails.
zoopla_enabled = Gauge(
"zoopla_enabled",
"Whether Zoopla scraping is currently active (1=yes, 0=no)",
)

View file

@ -17,6 +17,7 @@ from constants import (
SCRAPE_HOMECOUK,
SCRAPE_OPENRENT,
SCRAPE_RIGHTMOVE,
SCRAPE_ZOOPLA,
SEED,
)
from homecouk import CookiesExpiredError
@ -35,12 +36,16 @@ from metrics import (
scrape_outcodes_total,
scrape_properties_total,
scrape_state,
zoopla_enabled,
)
from openrent import WafChallengeError
from openrent import load_cookies as load_openrent_cookies
from openrent import make_client as make_openrent_client
from openrent import search_outcode as openrent_search_outcode
from rightmove import resolve_outcode_id, search_outcode
from zoopla import TurnstileError
from zoopla import launch_browser as launch_zoopla_browser
from zoopla import search_outcode as zoopla_search_outcode
from spatial import PostcodeSpatialIndex
from storage import write_parquet
@ -60,6 +65,7 @@ class ScrapeStatus:
rm_properties: int = 0
hk_properties: int = 0
or_properties: int = 0
zp_properties: int = 0
errors: list[str] = field(default_factory=list)
started_at: float = 0.0
finished_at: float = 0.0
@ -93,6 +99,9 @@ def _sync_gauges() -> None:
scrape_properties_total.labels(channel=ch, source="openrent").set(
status.or_properties
)
scrape_properties_total.labels(channel=ch, source="zoopla").set(
status.zp_properties
)
if status.started_at:
end = status.finished_at if status.finished_at else time.time()
scrape_elapsed_seconds.set(end - status.started_at)
@ -191,7 +200,7 @@ def run_scrape(
random.seed(SEED)
random.shuffle(shuffled)
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT:
if not SCRAPE_RIGHTMOVE and not SCRAPE_HOMECOUK and not SCRAPE_OPENRENT and not SCRAPE_ZOOPLA:
log.warning("All scrapers disabled — nothing to do")
with status_lock:
status.state = "done"
@ -239,8 +248,27 @@ def run_scrape(
)
openrent_enabled.set(0)
# Build postcode coords if OpenRent is active and caller didn't provide them
if or_client and pc_coords is None:
# Zoopla: uses Camoufox browser (no cookies/client pattern)
zp_browser = None
zp_page = None
zp_failed = False
if not SCRAPE_ZOOPLA:
log.info("Zoopla scraping DISABLED (SCRAPE_ZOOPLA=false)")
zoopla_enabled.set(0)
else:
try:
zp_browser, zp_page = launch_zoopla_browser()
log.info("Zoopla scraping ENABLED (Camoufox browser launched)")
zoopla_enabled.set(1)
except TurnstileError:
log.warning("Zoopla Cloudflare Turnstile failed — disabling Zoopla")
zoopla_enabled.set(0)
except Exception as e:
log.warning("Zoopla browser launch failed: %s — disabling Zoopla", e)
zoopla_enabled.set(0)
# Build postcode coords if OpenRent/Zoopla is active and caller didn't provide them
if (or_client or zp_page) and pc_coords is None:
pc_coords = build_postcode_coords()
try:
@ -256,6 +284,8 @@ def run_scrape(
hk_dedup_count = 0 # home.co.uk skipped as cross-source duplicates
or_count = 0 # OpenRent properties this channel
or_dedup_count = 0 # OpenRent skipped as cross-source duplicates
zp_count = 0 # Zoopla properties this channel
zp_dedup_count = 0 # Zoopla skipped as cross-source duplicates
with status_lock:
status.channel = channel_name
@ -264,6 +294,7 @@ def run_scrape(
status.rm_properties = 0
status.hk_properties = 0
status.or_properties = 0
status.zp_properties = 0
channel_start = time.time()
prev_prop_milestone = 0 # last 10k milestone we logged
@ -412,6 +443,63 @@ def run_scrape(
with status_lock:
status.errors.append(msg)
# --- Zoopla ---
if zp_page and not zp_failed:
made_requests = True
try:
zp_props = zoopla_search_outcode(
zp_page,
outcode,
channel_name,
pc_index,
pc_coords,
)
for p in zp_props:
pid = p["id"]
key = _dedup_key(p)
if pid in all_properties or key in seen_dedup_keys:
zp_dedup_count += 1
cross_source_dedup_total.labels(
channel="buy" if channel_name == "BUY" else "rent",
).inc()
continue
all_properties[pid] = p
seen_dedup_keys.add(key)
zp_count += 1
if zp_props:
log.info(
"Zoopla %s: +%d properties", outcode, len(zp_props)
)
except TurnstileError:
log.warning(
"Zoopla Cloudflare challenge failed — attempting browser relaunch"
)
try:
zp_browser.close()
except Exception:
pass
try:
zp_browser, zp_page = launch_zoopla_browser()
log.info("Zoopla browser relaunched, continuing")
except Exception:
log.warning(
"Browser relaunch failed, disabling Zoopla for rest of scrape"
)
zp_page = None
zp_browser = None
zp_failed = True
zoopla_enabled.set(0)
with status_lock:
status.errors.append(
"Zoopla Cloudflare challenge failed and browser relaunch failed"
)
except Exception as e:
msg = f"Error scraping Zoopla {outcode}/{channel_name}: {e}"
log.error(msg)
scrape_errors_total.labels(source="zoopla").inc()
with status_lock:
status.errors.append(msg)
with status_lock:
if channel_name == "BUY":
status.properties_buy = len(all_properties)
@ -420,6 +508,7 @@ def run_scrape(
status.rm_properties = rm_count
status.hk_properties = hk_count
status.or_properties = or_count
status.zp_properties = zp_count
_sync_gauges()
# Log progress every 100 outcodes
@ -444,12 +533,13 @@ def run_scrape(
if current_milestone > prev_prop_milestone:
prev_prop_milestone = current_milestone
log.info(
"%s %dk properties (rm: %d, hk: %d, or: %d) at outcode %d/%d [%s]",
"%s %dk properties (rm: %d, hk: %d, or: %d, zp: %d) at outcode %d/%d [%s]",
channel_name,
current_milestone * 10,
rm_count,
hk_count,
or_count,
zp_count,
done,
len(shuffled),
_fmt_elapsed(elapsed),
@ -472,13 +562,14 @@ def run_scrape(
_sync_gauges()
log.info(
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, cross-dedup: %d) ===",
"=== %s channel complete: %d unique (rm: %d, hk: %d, or: %d, zp: %d, cross-dedup: %d) ===",
channel_name,
len(deduped),
rm_count,
hk_count,
or_count,
hk_dedup_count + or_dedup_count,
zp_count,
hk_dedup_count + or_dedup_count + zp_dedup_count,
)
with status_lock:
@ -525,3 +616,8 @@ def run_scrape(
hk_client.close()
if or_client:
or_client.close()
if zp_browser:
try:
zp_browser.close()
except Exception:
pass

View file

@ -25,7 +25,11 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
if fvd:
try:
dt = datetime.fromisoformat(fvd.replace("Z", "+00:00"))
listing_dates.append(dt.replace(tzinfo=None))
# Convert to UTC naive datetime for consistent storage
if dt.tzinfo is not None:
from datetime import timezone
dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
listing_dates.append(dt)
except (ValueError, TypeError):
listing_dates.append(None)
else:

520
finder/zoopla.py Normal file
View file

@ -0,0 +1,520 @@
"""Zoopla (zoopla.co.uk) scraper — buy and rental properties.
Zoopla is behind Cloudflare Turnstile (managed interactive challenge), which
blocks all HTTP clients (curl_cffi, httpx) and even Playwright with stealth
patches. Only Camoufox (an anti-fingerprinting Firefox fork) passes reliably.
Zoopla uses Next.js App Router with React Server Components (RSC). Search
result data is server-rendered in an RSC stream, not available via
__NEXT_DATA__ or a JSON API. URL-based location slugs return 0 results, so
the working flow requires typing into the autocomplete input, selecting a
suggestion, and clicking Search.
Architecture:
Unlike the other scrapers which use HTTP clients per outcode, Zoopla keeps
a single Camoufox browser alive for the entire scrape. For each outcode, it:
1. Clears and types the outcode into the search input
2. Selects the first autocomplete suggestion
3. Clicks Search
4. Extracts listing data from the rendered DOM
5. Handles pagination via ?pn=N parameter
The browser session replaces the cookie/client pattern used by other scrapers.
"""
import logging
import re
import time
from constants import DELAY_BETWEEN_PAGES, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
from spatial import PostcodeSpatialIndex
log = logging.getLogger("zoopla")
class TurnstileError(Exception):
    """Signal that a Cloudflare Turnstile challenge could not be passed."""
# Maximum search result pages to scrape per outcode (25 listings/page).
# Pagination stops once the page number would exceed this value, even if the
# advertised result count has not been reached.
MAX_PAGES_PER_OUTCODE = 10
# JavaScript to extract listings from the rendered DOM.
# Finds all detail links, walks up to the card container, and parses
# price, beds, baths, floor area, address, and tenure from the card text.
#
# Evaluated in the page; returns an array of objects with the keys
# id, url, price, beds, baths, receptions, floor_area_sqft, address, tenure.
# Listings are de-duplicated by the numeric id taken from the
# "/details/<id>/" part of the link.
#
# The "card" is the nearest ancestor (at most 10 levels up) whose innerText
# contains a pound sign and either "bed" or "sq ft".
# NOTE(review): if none of those 10 ancestors matches, the loop simply ends
# with `card` pointing at the 10th ancestor and that element's text is parsed
# anyway — confirm this fallback is intended rather than skipping the link.
_EXTRACT_LISTINGS_JS = r"""() => {
const links = Array.from(document.querySelectorAll(
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
));
const seen = new Set();
const results = [];
for (const link of links) {
const href = link.href;
const match = href.match(/\/details\/(\d+)\//);
if (!match) continue;
const id = match[1];
if (seen.has(id)) continue;
seen.add(id);
// Walk up to the listing card container
let card = link;
for (let j = 0; j < 10; j++) {
card = card.parentElement;
if (!card) break;
const text = card.innerText || '';
if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) {
break;
}
}
if (!card) continue;
const text = card.innerText || '';
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
const priceMatch = text.match(/\u00a3([\d,]+)/);
const bedsMatch = text.match(/(\d+)\s*beds?/i);
const bathsMatch = text.match(/(\d+)\s*baths?/i);
const recMatch = text.match(/(\d+)\s*reception/i);
const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i);
let address = '';
for (const line of lines) {
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
address = line;
break;
}
}
let tenure = '';
if (/freehold/i.test(text)) tenure = 'Freehold';
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
results.push({
id: id,
url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
receptions: recMatch ? parseInt(recMatch[1]) : null,
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
address: address,
tenure: tenure,
});
}
return results;
}"""
# JavaScript to dismiss the Usercentrics cookie consent overlay (shadow DOM).
# Looks up the #usercentrics-cmp-ui element; if it has a shadow root, clicks
# the first button whose text contains "Accept". If no such button is found
# (or there is no shadow root) the element is removed from the DOM instead.
# Evaluates to true when something was clicked or removed, false when the
# overlay element is not present.
_DISMISS_COOKIES_JS = """() => {
const aside = document.querySelector('#usercentrics-cmp-ui');
if (aside && aside.shadowRoot) {
const btns = aside.shadowRoot.querySelectorAll('button');
for (const btn of btns) {
if (btn.innerText.includes('Accept')) { btn.click(); return true; }
}
}
if (aside) { aside.remove(); return true; }
return false;
}"""
# ---------------------------------------------------------------------------
# Browser lifecycle
# ---------------------------------------------------------------------------
def launch_browser():
    """Launch Camoufox, pass Cloudflare Turnstile on the Zoopla homepage, and
    dismiss the cookie-consent overlay.

    Returns:
        A ``(browser, page)`` tuple. The caller owns the browser from then on
        and must close it when done.

    Raises:
        TurnstileError: the page title still contains "Just a moment" after
            20 checks spaced 3 seconds apart (roughly 60 seconds).
        Exception: whatever Camoufox/Playwright raises while launching or
            navigating. In every failure case the Camoufox context manager is
            exited before the error propagates, so no browser is left behind.
    """
    from contextlib import ExitStack

    from camoufox.pkgman import camoufox_path

    # Verify camoufox is pre-installed — never download at runtime
    camoufox_path(download_if_missing=False)
    from camoufox.sync_api import Camoufox

    log.info("Launching Camoufox browser for Zoopla...")
    with ExitStack() as cleanup:
        # Entering Camoufox through the stack guarantees that any exception
        # raised below (navigation timeout, Turnstile failure, ...) runs the
        # context manager's __exit__ instead of leaking a running browser.
        browser = cleanup.enter_context(Camoufox(headless=True))
        page = browser.new_page()
        log.info("Navigating to Zoopla homepage...")
        page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=60000)

        # Wait for Cloudflare Turnstile to resolve.
        # Try clicking the Turnstile checkbox if present (helps in some cases).
        for _ in range(20):
            if "Just a moment" not in page.title():
                break
            # Attempt to click the Turnstile checkbox in the challenge iframe
            for frame in page.frames:
                if "challenges.cloudflare.com" in frame.url:
                    try:
                        iframe_el = page.query_selector(
                            'iframe[src*="challenges.cloudflare"]'
                        )
                        if iframe_el:
                            box = iframe_el.bounding_box()
                            if box:
                                page.mouse.click(
                                    box["x"] + 30, box["y"] + box["height"] / 2
                                )
                    except Exception:
                        # Deliberately best-effort: the click is only a nudge,
                        # the title poll above decides success or failure.
                        pass
                    break
            time.sleep(3)
        else:
            raise TurnstileError("Cloudflare Turnstile did not resolve after 60s")

        log.info("Cloudflare passed — title: %s", page.title())
        time.sleep(2)
        # Dismiss cookie consent
        page.evaluate(_DISMISS_COOKIES_JS)
        time.sleep(1)

        # Success: release the registered cleanup so that leaving the ``with``
        # block does not close the browser being handed to the caller.
        # NOTE(review): callers only call browser.close(); the Camoufox
        # context manager's __exit__ is never run on the success path —
        # confirm that this does not leave the Playwright driver running.
        cleanup.pop_all()
        return browser, page
def _ensure_not_challenged(page) -> None:
    """Return once *page* is not showing a Cloudflare challenge.

    A page is treated as challenged while its title contains "Just a moment".
    When challenged, the title is re-checked up to 20 times with a 3 second
    pause before each check.

    Raises:
        TurnstileError: the challenge title is still present after the last
            check.
    """

    def challenged() -> bool:
        return "Just a moment" in page.title()

    if not challenged():
        return

    log.warning("Cloudflare challenge detected mid-session, waiting...")
    attempts_left = 20
    while attempts_left:
        attempts_left -= 1
        time.sleep(3)
        if not challenged():
            log.info("Cloudflare challenge resolved")
            return
    raise TurnstileError("Cloudflare re-challenge did not resolve")
# ---------------------------------------------------------------------------
# Search navigation
# ---------------------------------------------------------------------------
def _navigate_search(page, outcode: str, channel: str) -> bool:
    """Drive the homepage search form so *page* ends up on the results for *outcode*.

    Steps: load the homepage, dismiss the cookie overlay, pick the Rent tab
    when *channel* is "RENT", type the outcode, click the first autocomplete
    option, then submit.

    Returns:
        True once the search has been submitted and the post-submit wait has
        passed; False when the search input or an autocomplete suggestion
        could not be found.

    Raises:
        TurnstileError: a Cloudflare challenge page did not clear.
    """
    # Always start from the homepage so the search form is in a known state.
    homepage = f"{ZOOPLA_BASE}/"
    page.goto(homepage, wait_until="domcontentloaded", timeout=30000)
    time.sleep(2)
    _ensure_not_challenged(page)

    # The consent overlay may reappear after navigation.
    page.evaluate(_DISMISS_COOKIES_JS)
    time.sleep(1)

    if channel == "RENT":
        tab = page.query_selector(
            'button:has-text("Rent"), [role="tab"]:has-text("Rent")'
        )
        if tab:
            tab.click()
            time.sleep(0.5)

    # Prefer the named autosuggest input; fall back to the first text input.
    box = page.query_selector('input[name="autosuggest-input"]')
    if not box:
        box = page.query_selector('input[type="text"]')
    if not box:
        log.warning("Could not find search input on homepage")
        return False

    box.click()
    time.sleep(0.3)
    box.fill("")
    box.type(outcode, delay=60)
    time.sleep(2)

    suggestion = page.query_selector('[role="option"]')
    if not suggestion:
        log.debug("No autocomplete suggestions for outcode %s", outcode)
        return False
    suggestion.click()
    time.sleep(0.5)

    # Submit via the Search button when present, otherwise with Enter.
    submit = page.query_selector('button:has-text("Search")')
    if not submit:
        box.press("Enter")
    else:
        submit.click()

    # Give the results page time to render before checking for a challenge.
    time.sleep(6)
    _ensure_not_challenged(page)
    return True
def _get_result_count(page) -> int:
    """Return the total result count advertised in the page body text.

    Looks for the first "<number> result(s)" phrase, where the number may
    contain thousands separators (e.g. "1,234 results").

    Returns:
        The parsed count, or 0 when no such phrase exists or the body text
        could not be read.
    """
    try:
        body = page.inner_text("body")
    except Exception:
        # Deliberate best-effort: a page that cannot be read is treated the
        # same as a page with no results.
        return 0
    # Require a leading digit so a stray comma before the word ("houses,
    # results") cannot match and hide a real count later in the text.
    found = re.search(r"(\d[\d,]*)\s+results?", body or "")
    if found is None:
        return 0
    return int(found.group(1).replace(",", ""))
# ---------------------------------------------------------------------------
# Extraction and pagination
# ---------------------------------------------------------------------------
def _extract_listings(page) -> list[dict]:
    """Run the listing-extraction script in *page* and return its result.

    Any exception from the evaluation is logged, counted in
    ``zoopla_errors_total`` under type "extract_failed", and turned into an
    empty list.
    """
    try:
        listings = page.evaluate(_EXTRACT_LISTINGS_JS)
    except Exception as exc:
        log.warning("Failed to extract listings from DOM: %s", exc)
        zoopla_errors_total.labels(type="extract_failed").inc()
        listings = []
    return listings
def _with_page_number(url: str, page_num: int) -> str:
    """Return *url* with its ``pn`` query parameter set to *page_num*.

    Every other query item is kept byte-for-byte and in order; an existing
    ``pn`` item is dropped wherever it appears, so the result stays a valid
    URL even when ``pn`` was the first parameter.
    """
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(url)
    kept = [
        item
        for item in parts.query.split("&")
        if item and item.split("=", 1)[0] != "pn"
    ]
    kept.append(f"pn={page_num}")
    return urlunsplit(parts._replace(query="&".join(kept)))


def _paginate(page, total_results: int, channel: str) -> list[dict]:
    """Collect listings from every results page of the current search.

    Page 1 is expected to be loaded already. Further pages are reached by
    navigating to the same URL with ``pn=N``, starting at 2, until the
    advertised total is reached, MAX_PAGES_PER_OUTCODE is exceeded, a page
    yields nothing new, or navigation fails.

    Args:
        page: live browser page showing the first results page.
        total_results: result count advertised by the site.
        channel: "BUY" or anything else (treated as rent) — used only as the
            metrics label.

    Returns:
        Raw listing dicts, de-duplicated by their "id".

    Raises:
        TurnstileError: a Cloudflare challenge appeared and did not clear.
    """
    all_listings = _extract_listings(page)
    channel_label = "buy" if channel == "BUY" else "rent"
    zoopla_pages_scraped.labels(channel=channel_label).inc()
    if not all_listings or total_results <= len(all_listings):
        return all_listings

    seen_ids = {listing["id"] for listing in all_listings}
    first_page_url = page.url
    page_num = 2
    while len(all_listings) < total_results and page_num <= MAX_PAGES_PER_OUTCODE:
        time.sleep(DELAY_BETWEEN_PAGES)
        next_url = _with_page_number(first_page_url, page_num)
        try:
            page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
            time.sleep(4)
            _ensure_not_challenged(page)
        except TurnstileError:
            # Must reach the caller so it can relaunch the browser.
            raise
        except Exception as e:
            log.debug("Pagination navigation failed at page %d: %s", page_num, e)
            break

        page_listings = _extract_listings(page)
        if not page_listings:
            break

        # Deduplicate within this outcode
        new_count = 0
        for listing in page_listings:
            if listing["id"] not in seen_ids:
                seen_ids.add(listing["id"])
                all_listings.append(listing)
                new_count += 1
        zoopla_pages_scraped.labels(channel=channel_label).inc()
        if new_count == 0:
            break  # No new listings on this page
        page_num += 1
    return all_listings
# ---------------------------------------------------------------------------
# Property transformation
# ---------------------------------------------------------------------------
def _extract_postcode(text: str) -> str | None:
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'."""
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
if match:
return match.group(1).upper().strip()
return None
def _extract_outcode(text: str) -> str | None:
"""Extract a UK outcode from address text like 'Whitechapel Road, London E1'."""
# Look for outcode at end of string or after last comma
match = re.search(r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*$", text.strip(), re.IGNORECASE)
if match:
return match.group(1).upper()
# Try after comma
parts = text.split(",")
if len(parts) > 1:
last = parts[-1].strip()
match = re.match(r"^([A-Z]{1,2}\d[A-Z0-9]?)$", last, re.IGNORECASE)
if match:
return match.group(1).upper()
return None
def _map_property_type(raw_type: str | None) -> str:
"""Map Zoopla property type text to canonical type."""
if not raw_type:
return "Other"
canonical = PROPERTY_TYPE_MAP.get(raw_type)
if canonical:
return canonical
lower = raw_type.lower()
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
return "Flats/Maisonettes"
if "detached" in lower and "semi" not in lower:
return "Detached"
if "semi" in lower:
return "Semi-Detached"
if "terrace" in lower or "mews" in lower:
return "Terraced"
if "house" in lower:
return "Detached"
return "Other"
def transform_property(
    raw: dict,
    channel: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
) -> dict | None:
    """Transform a raw Zoopla listing dict into the standard output schema.

    Zoopla search cards do not include coordinates, so lat/lng are resolved
    from the address text: first via a full postcode looked up in
    *pc_coords*, then — if that gives nothing — via the first *pc_coords*
    key that starts with the address's trailing outcode plus a space (that
    key then also becomes the record's postcode).

    Args:
        raw: listing dict as produced by the DOM extraction script (keys
            id, url, price, beds, baths, receptions, floor_area_sqft,
            address, tenure); missing or None values are tolerated.
        channel: "BUY" leaves the price frequency empty; any other value is
            recorded as "monthly".
        pc_index: not used by this function; kept for signature parity with
            the caller.
        pc_coords: postcode -> (lat, lng) lookup.

    Returns:
        The standard property dict, or None when the listing has no price,
        cannot be located, or falls outside the accepted bounding box.
    """
    price = raw.get("price")
    if not price:
        return None

    # `or ""` (rather than a .get default) also covers an explicit None,
    # which would otherwise crash the regex helpers below.
    address = raw.get("address") or ""

    # Resolve postcode and coordinates from address
    postcode = _extract_postcode(address)
    lat = lng = None
    if postcode:
        coords = pc_coords.get(postcode)
        if coords:
            lat, lng = coords

    if lat is None:
        # Try outcode-level fallback: borrow the first known postcode in the
        # same outcode. NOTE(review): this is a linear scan of pc_coords per
        # listing — consider a precomputed outcode index if it shows up in
        # profiles.
        outcode = _extract_outcode(address)
        if outcode:
            prefix = outcode + " "
            hit = next(
                (
                    (pcd, point)
                    for pcd, point in pc_coords.items()
                    if pcd.startswith(prefix)
                ),
                None,
            )
            if hit is not None:
                postcode, (lat, lng) = hit

    if lat is None or lng is None or not postcode:
        return None

    # Reject coordinates outside a coarse bounding box
    # (latitude 49..56, longitude -7..2).
    if not (49 <= lat <= 56 and -7 <= lng <= 2):
        return None

    bedrooms = raw.get("beds") or 0
    bathrooms = raw.get("baths") or 0
    receptions = raw.get("receptions") or 0

    # Floor area: convert sq ft to sq m
    floor_area_sqm = None
    sqft = raw.get("floor_area_sqft")
    if sqft:
        floor_area_sqm = round(sqft * 0.092903, 1)

    listing_id = raw.get("id") or ""
    listing_url = raw.get("url") or ""
    if listing_url and not listing_url.startswith("http"):
        # The extraction script strips the origin, so re-anchor relative URLs.
        listing_url = ZOOPLA_BASE + listing_url

    return {
        "id": f"zp_{listing_id}",
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
        "Number of bedrooms & living rooms": bedrooms + receptions,
        "lon": lng,
        "lat": lat,
        "Postcode": postcode,
        "Address per Property Register": address,
        "Leasehold/Freehold": raw.get("tenure") or None,
        "Property type": "Other",  # Not reliably extractable from Zoopla search cards
        "Property sub-type": "",
        "price": int(price),
        "price_frequency": "" if channel == "BUY" else "monthly",
        "Price qualifier": "",
        "Total floor area (sqm)": floor_area_sqm,
        "Listing URL": listing_url,
        "Listing features": [],
        "first_visible_date": "",
    }
# ---------------------------------------------------------------------------
# Top-level search function (called by scraper.py)
# ---------------------------------------------------------------------------
def search_outcode(
    page,
    outcode: str,
    channel: str,
    pc_index: PostcodeSpatialIndex,
    pc_coords: dict[str, tuple[float, float]],
) -> list[dict]:
    """Scrape Zoopla for one outcode and return standard-schema properties.

    Uses a live Camoufox page (from launch_browser): runs the homepage
    search flow, reads the advertised result count, walks the result pages,
    and transforms every raw listing. Listings that cannot be transformed
    are dropped; each kept one bumps ``zoopla_properties_scraped``.

    Returns:
        The transformed properties; an empty list when navigation fails, no
        results are advertised, or nothing could be extracted.

    Raises:
        TurnstileError: Cloudflare blocked the session mid-scrape.
    """
    navigated = _navigate_search(page, outcode, channel)
    total_results = _get_result_count(page) if navigated else 0
    if total_results == 0:
        return []

    raw_listings = _paginate(page, total_results, channel)
    if not raw_listings:
        return []

    label = "rent" if channel != "BUY" else "buy"
    properties = []
    for raw in raw_listings:
        converted = transform_property(raw, channel, pc_index, pc_coords)
        if not converted:
            continue
        properties.append(converted)
        zoopla_properties_scraped.labels(channel=label).inc()
    return properties