checkpoint finder
This commit is contained in:
parent
8616837c01
commit
96dfdd7491
5 changed files with 130 additions and 20 deletions
|
|
@ -4,8 +4,8 @@ from pathlib import Path
|
|||
ARCGIS_PATH = os.environ.get("ARCGIS_PATH", "/data/arcgis_data.parquet")
|
||||
DATA_DIR = Path("/app/data")
|
||||
PAGE_SIZE = 24
|
||||
DELAY_BETWEEN_PAGES = 0.5
|
||||
DELAY_BETWEEN_OUTCODES = 1.0
|
||||
DELAY_BETWEEN_PAGES = 0.3
|
||||
DELAY_BETWEEN_OUTCODES = 0.5
|
||||
MAX_RETRIES = 3
|
||||
RETRY_BASE_DELAY = 2.0
|
||||
GRID_CELL_SIZE = 0.01 # degrees for postcode spatial index
|
||||
|
|
|
|||
|
|
@ -351,7 +351,7 @@ def parse_search_results(html: str) -> list[dict]:
|
|||
<div class="fw-medium text-primary fs-3">1 Bed Flat, Location, SW1Y</div>
|
||||
<ul>...<li>1 Bed</li><li>1 Bath</li><li>Furnished</li>...</ul>
|
||||
"""
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
properties = []
|
||||
|
||||
# Property cards: <a class="pli search-property-card">
|
||||
|
|
@ -486,7 +486,7 @@ def parse_property_detail(html: str) -> dict:
|
|||
- Tables have "Rent PCM", "Deposit", "Bills Included", etc. (NOT bedrooms)
|
||||
- Description in elements with class containing "description"
|
||||
"""
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
details: dict = {}
|
||||
|
||||
# --- Title from h1 ---
|
||||
|
|
@ -810,7 +810,7 @@ def search_outcode(
|
|||
if detail_html:
|
||||
detail_data = parse_property_detail(detail_html)
|
||||
# Shorter delay for detail pages (within same outcode)
|
||||
time.sleep(DELAY_BETWEEN_PAGES * 0.5)
|
||||
time.sleep(0.15)
|
||||
|
||||
transformed = transform_property(
|
||||
search_data,
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ dependencies = [
|
|||
"fake-useragent>=2.2.0",
|
||||
"prometheus-client",
|
||||
"beautifulsoup4",
|
||||
"lxml",
|
||||
"playwright>=1.58.0",
|
||||
"playwright-stealth>=2.0.2",
|
||||
"camoufox>=0.4.11",
|
||||
|
|
|
|||
|
|
@ -472,12 +472,23 @@ def run_scrape(
|
|||
|
||||
try:
|
||||
for i, outcode in enumerate(shuffled):
|
||||
search_url = None
|
||||
for ch_cfg in CHANNELS:
|
||||
ch = ch_cfg["channel"]
|
||||
# Build direct URL for second channel by swapping path
|
||||
direct_url = None
|
||||
if search_url:
|
||||
if ch == "BUY":
|
||||
direct_url = search_url.replace("/to-rent/", "/for-sale/")
|
||||
else:
|
||||
direct_url = search_url.replace("/for-sale/", "/to-rent/")
|
||||
try:
|
||||
props = zoopla_search_outcode(
|
||||
page, outcode, ch, pc_index, pc_coords
|
||||
props, result_url = zoopla_search_outcode(
|
||||
page, outcode, ch, pc_index, pc_coords,
|
||||
base_search_url=direct_url,
|
||||
)
|
||||
if result_url:
|
||||
search_url = result_url
|
||||
zp_results[ch].extend(props)
|
||||
if props:
|
||||
log.info("Zoopla %s: +%d properties", outcode, len(props))
|
||||
|
|
|
|||
124
finder/zoopla.py
124
finder/zoopla.py
|
|
@ -263,6 +263,46 @@ def _ensure_not_challenged(page) -> None:
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _navigate_direct(page, url: str) -> bool:
    """Open a Zoopla search-results URL directly, bypassing the homepage flow.

    Intended for loading the second channel (e.g. RENT after BUY) of an
    outcode by reusing the first channel's URL with the path swapped.

    Returns True when navigation succeeded and listing content is (or is
    likely to be) present; returns False when navigation raised or the page
    shows no listing links, so the caller can fall back to the full search
    flow. Errors raised by ``_ensure_not_challenged`` propagate to the caller
    because that call sits outside any try block.
    """
    # Browser-side predicate: true once at least one listing card contains a
    # pound sign and more than 50 characters of text, i.e. has hydrated.
    hydration_probe = """() => {
        const cards = document.querySelectorAll(
            '[data-testid="regular-listings"] > div'
        );
        if (cards.length === 0) return false;
        for (const card of cards) {
            const t = card.innerText || '';
            if (t.includes('\\u00a3') && t.length > 50) return true;
        }
        return false;
    }"""

    try:
        page.goto(url, wait_until="domcontentloaded", timeout=30000)
    except Exception as exc:
        log.debug("Direct navigation failed: %s", exc)
        return False

    _ensure_not_challenged(page)

    try:
        page.wait_for_function(hydration_probe, timeout=8000)
    except Exception:
        # The probe did not succeed in time; accept the page only if it at
        # least contains a link to a listing detail page.
        detail_link = page.query_selector('a[href*="/details/"]')
        if detail_link:
            # NOTE(review): indentation was lost in the reviewed view; this
            # settle delay is assumed to belong to the fallback path — confirm.
            time.sleep(1.5)
            return True
        return False

    return True
|
||||
|
||||
|
||||
def _navigate_search(page, outcode: str, channel: str) -> bool:
|
||||
"""Navigate to search results for an outcode via the homepage search flow.
|
||||
|
||||
|
|
@ -270,12 +310,12 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
|||
Raises TurnstileError if Cloudflare blocks us."""
|
||||
# Navigate to homepage to reset search state
|
||||
page.goto(f"{ZOOPLA_BASE}/", wait_until="domcontentloaded", timeout=30000)
|
||||
time.sleep(2)
|
||||
time.sleep(0.5)
|
||||
_ensure_not_challenged(page)
|
||||
|
||||
# Dismiss cookie consent (may reappear after navigation)
|
||||
page.evaluate(_DISMISS_COOKIES_JS)
|
||||
time.sleep(1)
|
||||
time.sleep(0.3)
|
||||
|
||||
# Select Buy/Rent tab
|
||||
if channel == "RENT":
|
||||
|
|
@ -284,7 +324,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
|||
)
|
||||
if rent_tab:
|
||||
rent_tab.click()
|
||||
time.sleep(0.5)
|
||||
time.sleep(0.2)
|
||||
|
||||
# Find and fill search input
|
||||
search_input = page.query_selector(
|
||||
|
|
@ -295,10 +335,10 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
|||
return False
|
||||
|
||||
search_input.click()
|
||||
time.sleep(0.3)
|
||||
time.sleep(0.1)
|
||||
search_input.fill("")
|
||||
search_input.type(outcode, delay=60)
|
||||
time.sleep(2)
|
||||
time.sleep(1.2)
|
||||
|
||||
# Select first autocomplete suggestion
|
||||
first_option = page.query_selector('[role="option"]')
|
||||
|
|
@ -307,7 +347,7 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
|||
return False
|
||||
|
||||
first_option.click()
|
||||
time.sleep(0.5)
|
||||
time.sleep(0.2)
|
||||
|
||||
# Click search button
|
||||
search_btn = page.query_selector('button:has-text("Search")')
|
||||
|
|
@ -326,6 +366,29 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
|||
time.sleep(4)
|
||||
_ensure_not_challenged(page)
|
||||
|
||||
# Wait for client-side hydration to populate listing content (prices, addresses).
|
||||
# The structural container appears in server-rendered HTML before React hydrates
|
||||
# the actual card content — extracting too early yields empty price/address fields.
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"""() => {
|
||||
const cards = document.querySelectorAll(
|
||||
'[data-testid="regular-listings"] > div'
|
||||
);
|
||||
if (cards.length === 0) return false;
|
||||
for (const card of cards) {
|
||||
const t = card.innerText || '';
|
||||
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||
}
|
||||
return false;
|
||||
}""",
|
||||
timeout=8000,
|
||||
)
|
||||
except Exception:
|
||||
# Content never appeared — extraction will likely fail but let it try
|
||||
log.debug("Listing content hydration wait timed out — prices may not have rendered")
|
||||
time.sleep(2)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
|
|
@ -437,8 +500,25 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
|
|||
|
||||
try:
|
||||
page.goto(next_url, wait_until="domcontentloaded", timeout=30000)
|
||||
time.sleep(4)
|
||||
_ensure_not_challenged(page)
|
||||
# Wait for listing content instead of fixed sleep
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"""() => {
|
||||
const cards = document.querySelectorAll(
|
||||
'[data-testid="regular-listings"] > div'
|
||||
);
|
||||
if (cards.length === 0) return false;
|
||||
for (const card of cards) {
|
||||
const t = card.innerText || '';
|
||||
if (t.includes('\\u00a3') && t.length > 50) return true;
|
||||
}
|
||||
return false;
|
||||
}""",
|
||||
timeout=8000,
|
||||
)
|
||||
except Exception:
|
||||
time.sleep(1.5)
|
||||
except TurnstileError:
|
||||
raise
|
||||
except Exception as e:
|
||||
|
|
@ -608,17 +688,32 @@ def search_outcode(
|
|||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
pc_coords: dict[str, tuple[float, float]],
|
||||
) -> list[dict]:
|
||||
base_search_url: str | None = None,
|
||||
) -> tuple[list[dict], str | None]:
|
||||
"""Search Zoopla for properties in one outcode.
|
||||
|
||||
Takes a live Camoufox Page (from launch_browser). Navigates through the
|
||||
search flow, extracts listings from rendered DOM, and transforms to the
|
||||
standard output schema.
|
||||
|
||||
If base_search_url is provided (from a previous channel search for the same
|
||||
outcode), tries direct URL navigation first — skipping the slow homepage
|
||||
search flow. Falls back to full navigation if direct fails.
|
||||
|
||||
Returns (properties, search_url) where search_url can be passed to the next
|
||||
channel call for this outcode.
|
||||
|
||||
Raises TurnstileError if Cloudflare blocks us mid-session.
|
||||
"""
|
||||
if not _navigate_search(page, outcode, channel):
|
||||
return []
|
||||
navigated = False
|
||||
if base_search_url:
|
||||
navigated = _navigate_direct(page, base_search_url)
|
||||
if navigated:
|
||||
log.debug("Zoopla %s %s: used direct URL navigation", outcode, channel)
|
||||
|
||||
if not navigated:
|
||||
if not _navigate_search(page, outcode, channel):
|
||||
return [], None
|
||||
|
||||
total_results = _get_result_count(page)
|
||||
|
||||
|
|
@ -632,7 +727,7 @@ def search_outcode(
|
|||
"DOM selectors may need updating",
|
||||
outcode, channel, total_results,
|
||||
)
|
||||
return []
|
||||
return [], None
|
||||
|
||||
channel_label = "buy" if channel == "BUY" else "rent"
|
||||
properties = []
|
||||
|
|
@ -646,10 +741,13 @@ def search_outcode(
|
|||
dropped += 1
|
||||
|
||||
if dropped and not properties:
|
||||
# Log a sample raw listing to diagnose which fields are missing
|
||||
sample = raw_listings[0] if raw_listings else {}
|
||||
log.debug(
|
||||
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
|
||||
"(no price/postcode/coords)",
|
||||
"(no price/postcode/coords). Sample raw: price=%s address=%r",
|
||||
outcode, channel, len(raw_listings), dropped,
|
||||
sample.get("price"), sample.get("address", ""),
|
||||
)
|
||||
elif dropped > len(raw_listings) // 2:
|
||||
log.debug(
|
||||
|
|
@ -657,4 +755,4 @@ def search_outcode(
|
|||
outcode, channel, dropped, len(raw_listings),
|
||||
)
|
||||
|
||||
return properties
|
||||
return properties, page.url
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue