checkpoint finder
This commit is contained in:
parent
8616837c01
commit
96dfdd7491
5 changed files with 130 additions and 20 deletions
|
|
@ -4,8 +4,8 @@ from pathlib import Path
|
||||||
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
|
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
|
||||||
DATA_DIR = Path("/app/data")
|
DATA_DIR = Path("/app/data")
|
||||||
PAGE_SIZE = 24
|
PAGE_SIZE = 24
|
||||||
DELAY_BETWEEN_PAGES = 0.5
|
DELAY_BETWEEN_PAGES = 0.3
|
||||||
DELAY_BETWEEN_OUTCODES = 1.0
|
DELAY_BETWEEN_OUTCODES = 0.5
|
||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
RETRY_BASE_DELAY = 2.0
|
RETRY_BASE_DELAY = 2.0
|
||||||
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
|
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
|
||||||
|
|
|
||||||
|
|
@ -351,7 +351,7 @@ def parse_search_results(html: str) -> list[dict]:
|
||||||
<div class="fw-medium text-primary fs-3">1 Bed Flat, Location, SW1Y</div>
|
<div class="fw-medium text-primary fs-3">1 Bed Flat, Location, SW1Y</div>
|
||||||
<ul>...<li>1 Bed</li><li>1 Bath</li><li>Furnished</li>...</ul>
|
<ul>...<li>1 Bed</li><li>1 Bath</li><li>Furnished</li>...</ul>
|
||||||
"""
|
"""
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "lxml")
|
||||||
properties = []
|
properties = []
|
||||||
|
|
||||||
# Property cards: <a class="pli search-property-card">
|
# Property cards: <a class="pli search-property-card">
|
||||||
|
|
@ -486,7 +486,7 @@ def parse_property_detail(html: str) -> dict:
|
||||||
- Tables have "Rent PCM", "Deposit", "Bills Included", etc. (NOT bedrooms)
|
- Tables have "Rent PCM", "Deposit", "Bills Included", etc. (NOT bedrooms)
|
||||||
- Description in elements with class containing "description"
|
- Description in elements with class containing "description"
|
||||||
"""
|
"""
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "lxml")
|
||||||
details: dict = {}
|
details: dict = {}
|
||||||
|
|
||||||
# --- Title from h1 ---
|
# --- Title from h1 ---
|
||||||
|
|
@ -810,7 +810,7 @@ def search_outcode(
|
||||||
if detail_html:
|
if detail_html:
|
||||||
detail_data = parse_property_detail(detail_html)
|
detail_data = parse_property_detail(detail_html)
|
||||||
# Shorter delay for detail pages (within same outcode)
|
# Shorter delay for detail pages (within same outcode)
|
||||||
time.sleep(DELAY_BETWEEN_PAGES * 0.5)
|
time.sleep(0.15)
|
||||||
|
|
||||||
transformed = transform_property(
|
transformed = transform_property(
|
||||||
search_data,
|
search_data,
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ dependencies = [
|
||||||
"fake-useragent>=2.2.0",
|
"fake-useragent>=2.2.0",
|
||||||
"prometheus-client",
|
"prometheus-client",
|
||||||
"beautifulsoup4",
|
"beautifulsoup4",
|
||||||
|
"lxml",
|
||||||
"playwright>=1.58.0",
|
"playwright>=1.58.0",
|
||||||
"playwright-stealth>=2.0.2",
|
"playwright-stealth>=2.0.2",
|
||||||
"camoufox>=0.4.11",
|
"camoufox>=0.4.11",
|
||||||
|
|
|
||||||
|
|
@ -472,12 +472,23 @@ def run_scrape(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for i, outcode in enumerate(shuffled):
|
for i, outcode in enumerate(shuffled):
|
||||||
|
search_url = None
|
||||||
for ch_cfg in CHANNELS:
|
for ch_cfg in CHANNELS:
|
||||||
ch = ch_cfg["channel"]
|
ch = ch_cfg["channel"]
|
||||||
|
# Build direct URL for second channel by swapping path
|
||||||
|
direct_url = None
|
||||||
|
if search_url:
|
||||||
|
if ch == "BUY":
|
||||||
|
direct_url = search_url.replace("/to-rent/", "/for-sale/")
|
||||||
|
else:
|
||||||
|
direct_url = search_url.replace("/for-sale/", "/to-rent/")
|
||||||
try:
|
try:
|
||||||
props = zoopla_search_outcode(
|
props, result_url = zoopla_search_outcode(
|
||||||
page, outcode, ch, pc_index, pc_coords
|
page, outcode, ch, pc_index, pc_coords,
|
||||||
|
base_search_url=direct_url,
|
||||||
)
|
)
|
||||||
|
if result_url:
|
||||||
|
search_url = result_url
|
||||||
zp_results[ch].extend(props)
|
zp_results[ch].extend(props)
|
||||||
if props:
|
if props:
|
||||||
log.info("Zoopla %s: +%d properties", outcode, len(props))
|
log.info("Zoopla %s: +%d properties", outcode, len(props))
|
||||||
|
|
|
||||||
124
finder/zoopla.py
124
finder/zoopla.py
|
|
@ -263,6 +263,46 @@ def _ensure_not_challenged(page) -> None:
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _navigate_direct(page, url: str) -> bool:
|
||||||
|
"""Navigate directly to a Zoopla search URL (skipping the homepage flow).
|
||||||
|
|
||||||
|
Used to load the second channel (e.g., RENT after BUY) for the same outcode
|
||||||
|
by swapping the path component. Falls back gracefully — returns False if
|
||||||
|
the page has no listings, so the caller can retry via the full search flow.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||||
|
except Exception as e:
|
||||||
|
log.debug("Direct navigation failed: %s", e)
|
||||||
|
return False
|
||||||
|
_ensure_not_challenged(page)
|
||||||
|
|
||||||
|
# Wait for listing content to hydrate
|
||||||
|
try:
|
||||||
|
page.wait_for_function(
|
||||||
|
"""() => {
|
||||||
|
const cards = document.querySelectorAll(
|
||||||
|
'[data-testid="regular-listings"] > div'
|
||||||
|
);
|
||||||
|
if (cards.length === 0) return false;
|
||||||
|
for (const card of cards) {
|
||||||
|
const t = card.innerText || '';
|
||||||
|
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}""",
|
||||||
|
timeout=8000,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# Check if the page has any listings at all
|
||||||
|
has_listings = page.query_selector('a[href*="/details/"]')
|
||||||
|
if not has_listings:
|
||||||
|
return False
|
||||||
|
time.sleep(1.5)
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def _navigate_search(page, outcode: str, channel: str) -> bool:
|
def _navigate_search(page, outcode: str, channel: str) -> bool:
|
||||||
"""Navigate to search results for an outcode via the homepage search flow.
|
"""Navigate to search results for an outcode via the homepage search flow.
|
||||||
|
|
||||||
|
|
@ -270,12 +310,12 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
||||||
Raises TurnstileError if Cloudflare blocks us."""
|
Raises TurnstileError if Cloudflare blocks us."""
|
||||||
# Navigate to homepage to reset search state
|
# Navigate to homepage to reset search state
|
||||||
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
|
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
|
||||||
time.sleep(2)
|
time.sleep(0.5)
|
||||||
_ensure_not_challenged(page)
|
_ensure_not_challenged(page)
|
||||||
|
|
||||||
# Dismiss cookie consent (may reappear after navigation)
|
# Dismiss cookie consent (may reappear after navigation)
|
||||||
page.evaluate(_DISMISS_COOKIES_JS)
|
page.evaluate(_DISMISS_COOKIES_JS)
|
||||||
time.sleep(1)
|
time.sleep(0.3)
|
||||||
|
|
||||||
# Select Buy/Rent tab
|
# Select Buy/Rent tab
|
||||||
if channel == "RENT":
|
if channel == "RENT":
|
||||||
|
|
@ -284,7 +324,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
||||||
)
|
)
|
||||||
if rent_tab:
|
if rent_tab:
|
||||||
rent_tab.click()
|
rent_tab.click()
|
||||||
time.sleep(0.5)
|
time.sleep(0.2)
|
||||||
|
|
||||||
# Find and fill search input
|
# Find and fill search input
|
||||||
search_input = page.query_selector(
|
search_input = page.query_selector(
|
||||||
|
|
@ -295,10 +335,10 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
search_input.click()
|
search_input.click()
|
||||||
time.sleep(0.3)
|
time.sleep(0.1)
|
||||||
search_input.fill("")
|
search_input.fill("")
|
||||||
search_input.type(outcode, delay=60)
|
search_input.type(outcode, delay=60)
|
||||||
time.sleep(2)
|
time.sleep(1.2)
|
||||||
|
|
||||||
# Select first autocomplete suggestion
|
# Select first autocomplete suggestion
|
||||||
first_option = page.query_selector('[role="option"]')
|
first_option = page.query_selector('[role="option"]')
|
||||||
|
|
@ -307,7 +347,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
first_option.click()
|
first_option.click()
|
||||||
time.sleep(0.5)
|
time.sleep(0.2)
|
||||||
|
|
||||||
# Click search button
|
# Click search button
|
||||||
search_btn = page.query_selector('button:has-text("Search")')
|
search_btn = page.query_selector('button:has-text("Search")')
|
||||||
|
|
@ -326,6 +366,29 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
||||||
time.sleep(4)
|
time.sleep(4)
|
||||||
_ensure_not_challenged(page)
|
_ensure_not_challenged(page)
|
||||||
|
|
||||||
|
# Wait for client-side hydration to populate listing content (prices, addresses).
|
||||||
|
# The structural container appears in server-rendered HTML before React hydrates
|
||||||
|
# the actual card content — extracting too early yields empty price/address fields.
|
||||||
|
try:
|
||||||
|
page.wait_for_function(
|
||||||
|
"""() => {
|
||||||
|
const cards = document.querySelectorAll(
|
||||||
|
'[data-testid="regular-listings"] > div'
|
||||||
|
);
|
||||||
|
if (cards.length === 0) return false;
|
||||||
|
for (const card of cards) {
|
||||||
|
const t = card.innerText || '';
|
||||||
|
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}""",
|
||||||
|
timeout=8000,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# Content never appeared — extraction will likely fail but let it try
|
||||||
|
log.debug("Listing content hydration wait timed out — prices may not have rendered")
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -437,8 +500,25 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
|
page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
|
||||||
time.sleep(4)
|
|
||||||
_ensure_not_challenged(page)
|
_ensure_not_challenged(page)
|
||||||
|
# Wait for listing content instead of fixed sleep
|
||||||
|
try:
|
||||||
|
page.wait_for_function(
|
||||||
|
"""() => {
|
||||||
|
const cards = document.querySelectorAll(
|
||||||
|
'[data-testid="regular-listings"] > div'
|
||||||
|
);
|
||||||
|
if (cards.length === 0) return false;
|
||||||
|
for (const card of cards) {
|
||||||
|
const t = card.innerText || '';
|
||||||
|
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}""",
|
||||||
|
timeout=8000,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
time.sleep(1.5)
|
||||||
except TurnstileError:
|
except TurnstileError:
|
||||||
raise
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -608,17 +688,32 @@ def search_outcode(
|
||||||
channel: str,
|
channel: str,
|
||||||
pc_index: PostcodeSpatialIndex,
|
pc_index: PostcodeSpatialIndex,
|
||||||
pc_coords: dict[str, tuple[float, float]],
|
pc_coords: dict[str, tuple[float, float]],
|
||||||
) -> list[dict]:
|
base_search_url: str | None = None,
|
||||||
|
) -> tuple[list[dict], str | None]:
|
||||||
"""Search Zoopla for properties in one outcode.
|
"""Search Zoopla for properties in one outcode.
|
||||||
|
|
||||||
Takes a live Camoufox Page (from launch_browser). Navigates through the
|
Takes a live Camoufox Page (from launch_browser). Navigates through the
|
||||||
search flow, extracts listings from rendered DOM, and transforms to the
|
search flow, extracts listings from rendered DOM, and transforms to the
|
||||||
standard output schema.
|
standard output schema.
|
||||||
|
|
||||||
|
If base_search_url is provided (from a previous channel search for the same
|
||||||
|
outcode), tries direct URL navigation first — skipping the slow homepage
|
||||||
|
search flow. Falls back to full navigation if direct fails.
|
||||||
|
|
||||||
|
Returns (properties, search_url) where search_url can be passed to the next
|
||||||
|
channel call for this outcode.
|
||||||
|
|
||||||
Raises TurnstileError if Cloudflare blocks us mid-session.
|
Raises TurnstileError if Cloudflare blocks us mid-session.
|
||||||
"""
|
"""
|
||||||
if not _navigate_search(page, outcode, channel):
|
navigated = False
|
||||||
return []
|
if base_search_url:
|
||||||
|
navigated = _navigate_direct(page, base_search_url)
|
||||||
|
if navigated:
|
||||||
|
log.debug("Zoopla %s %s: used direct URL navigation", outcode, channel)
|
||||||
|
|
||||||
|
if not navigated:
|
||||||
|
if not _navigate_search(page, outcode, channel):
|
||||||
|
return [], None
|
||||||
|
|
||||||
total_results = _get_result_count(page)
|
total_results = _get_result_count(page)
|
||||||
|
|
||||||
|
|
@ -632,7 +727,7 @@ def search_outcode(
|
||||||
"DOM selectors may need updating",
|
"DOM selectors may need updating",
|
||||||
outcode, channel, total_results,
|
outcode, channel, total_results,
|
||||||
)
|
)
|
||||||
return []
|
return [], None
|
||||||
|
|
||||||
channel_label = "buy" if channel == "BUY" else "rent"
|
channel_label = "buy" if channel == "BUY" else "rent"
|
||||||
properties = []
|
properties = []
|
||||||
|
|
@ -646,10 +741,13 @@ def search_outcode(
|
||||||
dropped += 1
|
dropped += 1
|
||||||
|
|
||||||
if dropped and not properties:
|
if dropped and not properties:
|
||||||
|
# Log a sample raw listing to diagnose which fields are missing
|
||||||
|
sample = raw_listings[0] if raw_listings else {}
|
||||||
log.debug(
|
log.debug(
|
||||||
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
|
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
|
||||||
"(no price/postcode/coords)",
|
"(no price/postcode/coords). Sample raw: price=%s address=%r",
|
||||||
outcode, channel, len(raw_listings), dropped,
|
outcode, channel, len(raw_listings), dropped,
|
||||||
|
sample.get("price"), sample.get("address", ""),
|
||||||
)
|
)
|
||||||
elif dropped > len(raw_listings) // 2:
|
elif dropped > len(raw_listings) // 2:
|
||||||
log.debug(
|
log.debug(
|
||||||
|
|
@ -657,4 +755,4 @@ def search_outcode(
|
||||||
outcode, channel, dropped, len(raw_listings),
|
outcode, channel, dropped, len(raw_listings),
|
||||||
)
|
)
|
||||||
|
|
||||||
return properties
|
return properties, page.url
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue