Compare commits
2 commits
e09aa574b0
...
4f61c702b1
| Author | SHA1 | Date | |
|---|---|---|---|
| 4f61c702b1 | |||
| 13980a2887 |
4 changed files with 374 additions and 42 deletions
94
.forgejo/workflows/ci.yml
Normal file
94
.forgejo/workflows/ci.yml
Normal file
|
|
@@ -0,0 +1,94 @@
|
||||||
|
# Continuous integration: lint and test the Python pipeline, the frontend,
# and the Rust server. Runs on pushes and pull requests targeting main.
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

# Cancel a superseded run of the same workflow on the same ref.
concurrency:
  group: ${{ gitea.workflow }}-${{ gitea.ref }}
  cancel-in-progress: true

jobs:
  python:
    name: Python (lint + test)
    runs-on: docker
    steps:
      - uses: actions/checkout@v4

      - uses: astral-sh/setup-uv@v4
        with:
          enable-cache: true

      - name: Install dependencies
        run: uv sync

      - name: Ruff check
        run: uv run ruff check .

      - name: Deptry (unused dependencies)
        run: uv run deptry .

      - name: Tests
        run: |
          uv run pytest pipeline/utils/test_haversine.py
          uv run pytest pipeline/utils/test_poi_counts.py
          uv run pytest pipeline/transform/postcode_boundaries/test_postcode_boundaries.py

  frontend:
    name: Frontend (lint + typecheck)
    runs-on: docker
    defaults:
      run:
        working-directory: frontend
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: 22
          cache: npm
          cache-dependency-path: frontend/package-lock.json

      - name: Install dependencies
        run: npm ci

      - name: ESLint
        run: npm run lint

      - name: Prettier check
        run: npm run format:check

      - name: TypeScript typecheck
        run: npm run typecheck

  rust:
    name: Rust (lint + test)
    runs-on: docker
    defaults:
      run:
        working-directory: server-rs
    steps:
      - uses: actions/checkout@v4

      # The toolchain action installs rustup's minimal profile, which omits
      # clippy and rustfmt. Request them explicitly so the lint steps below
      # do not depend on what the runner image happens to ship.
      - uses: dtolnay/rust-toolchain@stable
        with:
          components: clippy, rustfmt

      - uses: Swatinem/rust-cache@v2
        with:
          workspaces: server-rs

      - name: Clippy
        run: cargo clippy -- -D warnings

      - name: Format check
        run: cargo fmt --check

      # --locked builds the tool against the lockfile published with it, so
      # a new release of one of its dependencies cannot break this step.
      - name: Install cargo-machete
        run: cargo install --locked cargo-machete

      - name: Unused dependencies check
        run: cargo machete

      - name: Tests
        run: cargo test
|
||||||
94
.forgejo/workflows/docker-publish.yml
Normal file
94
.forgejo/workflows/docker-publish.yml
Normal file
|
|
@@ -0,0 +1,94 @@
|
||||||
|
# Builds the main, screenshot, and finder images and pushes them to the
# Forgejo container registry. Runs on pushes to main, on v* tags, and on
# manual dispatch.
name: Build and publish Docker image

on:
  push:
    branches: [main]
    tags: ["v*"]
  workflow_dispatch:

env:
  # Server URL including its scheme; the "Resolve registry host" step below
  # derives the bare host that image references require.
  REGISTRY: ${{ gitea.server_url }}
  IMAGE_NAME: ${{ gitea.repository }}

jobs:
  build-and-push:
    runs-on: docker
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up uv
        uses: astral-sh/setup-uv@v4

      - name: Download map assets (fonts, sprites, twemoji)
        run: uv run python -m pipeline.download.map_assets --output frontend/public/assets

      - name: Download arcgis data for finder
        run: uv run python -m pipeline.download.arcgis --output property-data/arcgis_data.parquet

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # An image reference cannot contain a URL scheme, so strip "https://"
      # (or any other scheme) and a trailing slash from the server URL.
      # A value that already is a bare host passes through unchanged.
      - name: Resolve registry host
        id: registry
        run: |
          host="${REGISTRY#*://}"
          host="${host%/}"
          echo "host=${host}" >> "$GITHUB_OUTPUT"

      - name: Log in to Forgejo Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ steps.registry.outputs.host }}
          username: ${{ gitea.actor }}
          password: ${{ secrets.GITEA_TOKEN }}

      # Outputs:
      #   tags      - comma-separated tag list for the main image
      #   repo      - lower-cased owner/name (image names must be lower case)
      #   sha_short - first 7 characters of the commit SHA
      - name: Determine image tags
        id: tags
        # The semver match below uses [[ =~ ]] and BASH_REMATCH.
        shell: bash
        # Context values are passed through the environment rather than
        # interpolated into the script text, so a crafted ref name cannot
        # be interpreted as shell code.
        env:
          REGISTRY_HOST: ${{ steps.registry.outputs.host }}
          REF: ${{ gitea.ref }}
          SHA: ${{ gitea.sha }}
        run: |
          REPO=$(echo "${IMAGE_NAME}" | tr '[:upper:]' '[:lower:]')
          SHA_SHORT=$(echo "${SHA}" | cut -c1-7)
          TAGS="${REGISTRY_HOST}/${REPO}:sha-${SHA_SHORT}"

          # Add latest tag on default branch
          if [ "${REF}" = "refs/heads/main" ]; then
            TAGS="${TAGS},${REGISTRY_HOST}/${REPO}:latest"
          fi

          # Add version tags for semver tags
          if [[ "$REF" =~ ^refs/tags/v([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then
            VERSION="${BASH_REMATCH[1]}.${BASH_REMATCH[2]}.${BASH_REMATCH[3]}"
            MINOR="${BASH_REMATCH[1]}.${BASH_REMATCH[2]}"
            TAGS="${TAGS},${REGISTRY_HOST}/${REPO}:${VERSION}"
            TAGS="${TAGS},${REGISTRY_HOST}/${REPO}:${MINOR}"
          fi

          echo "tags=${TAGS}" >> "$GITHUB_OUTPUT"
          echo "repo=${REPO}" >> "$GITHUB_OUTPUT"
          echo "sha_short=${SHA_SHORT}" >> "$GITHUB_OUTPUT"

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: ${{ steps.tags.outputs.tags }}
          cache-from: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}:buildcache
          cache-to: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}:buildcache,mode=max

      # NOTE(review): the screenshot and finder images are tagged :latest on
      # every trigger, including v* tag pushes, unlike the main image above.
      # Confirm this is intended.
      - name: Build and push screenshot service
        uses: docker/build-push-action@v6
        with:
          context: ./screenshot
          push: true
          tags: |
            ${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-screenshot:latest
            ${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-screenshot:sha-${{ steps.tags.outputs.sha_short }}
          cache-from: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-screenshot:buildcache
          cache-to: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-screenshot:buildcache,mode=max

      - name: Build and push finder service
        uses: docker/build-push-action@v6
        with:
          context: .
          file: Dockerfile.finder
          push: true
          tags: |
            ${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-finder:latest
            ${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-finder:sha-${{ steps.tags.outputs.sha_short }}
          cache-from: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-finder:buildcache
          cache-to: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-finder:buildcache,mode=max
|
||||||
|
|
@@ -5,6 +5,10 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
COPY finder/pyproject.toml ./
|
COPY finder/pyproject.toml ./
|
||||||
RUN uv pip install --system -r pyproject.toml
|
RUN uv pip install --system -r pyproject.toml
|
||||||
|
RUN playwright install-deps chromium firefox
|
||||||
|
RUN playwright install chromium
|
||||||
|
RUN camoufox fetch \
|
||||||
|
&& python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)"
|
||||||
|
|
||||||
COPY finder/*.py ./
|
COPY finder/*.py ./
|
||||||
COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet
|
COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet
|
||||||
|
|
|
||||||
224
finder/zoopla.py
224
finder/zoopla.py
|
|
@@ -41,17 +41,23 @@ class TurnstileError(Exception):
|
||||||
MAX_PAGES_PER_OUTCODE = 10
|
MAX_PAGES_PER_OUTCODE = 10
|
||||||
|
|
||||||
# JavaScript to extract listings from the rendered DOM.
|
# JavaScript to extract listings from the rendered DOM.
|
||||||
# Finds all detail links, walks up to the card container, and parses
|
# Uses data-testid attributes as primary selectors (stable across deployments),
|
||||||
# price, beds, baths, floor area, address, and tenure from the card text.
|
# then falls back to href-based link matching with parent-walking.
|
||||||
_EXTRACT_LISTINGS_JS = r"""() => {
|
_EXTRACT_LISTINGS_JS = r"""() => {
|
||||||
const links = Array.from(document.querySelectorAll(
|
|
||||||
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
|
|
||||||
));
|
|
||||||
|
|
||||||
const seen = new Set();
|
const seen = new Set();
|
||||||
const results = [];
|
const results = [];
|
||||||
|
|
||||||
for (const link of links) {
|
// Strategy 1: Use data-testid selectors (post-2025 redesign)
|
||||||
|
const listingCards = document.querySelectorAll(
|
||||||
|
'[data-testid="regular-listings"] > div, [data-testid="search-content"] li'
|
||||||
|
);
|
||||||
|
|
||||||
|
for (const card of listingCards) {
|
||||||
|
const link = card.querySelector(
|
||||||
|
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
|
||||||
|
);
|
||||||
|
if (!link) continue;
|
||||||
|
|
||||||
const href = link.href;
|
const href = link.href;
|
||||||
const match = href.match(/\/details\/(\d+)\//);
|
const match = href.match(/\/details\/(\d+)\//);
|
||||||
if (!match) continue;
|
if (!match) continue;
|
||||||
|
|
@@ -60,53 +66,108 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
||||||
if (seen.has(id)) continue;
|
if (seen.has(id)) continue;
|
||||||
seen.add(id);
|
seen.add(id);
|
||||||
|
|
||||||
// Walk up to the listing card container
|
const text = card.innerText || '';
|
||||||
let card = link;
|
|
||||||
for (let j = 0; j < 10; j++) {
|
// Try data-testid price element first, then regex
|
||||||
card = card.parentElement;
|
const priceEl = card.querySelector('[data-testid="listing-price"]');
|
||||||
if (!card) break;
|
const priceText = priceEl ? priceEl.innerText : text;
|
||||||
const text = card.innerText || '';
|
const priceMatch = priceText.match(/\u00a3([\d,]+)/);
|
||||||
if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) {
|
|
||||||
break;
|
// Try address element first, then regex
|
||||||
|
const addressEl = card.querySelector('address');
|
||||||
|
let address = addressEl ? addressEl.innerText.trim() : '';
|
||||||
|
|
||||||
|
if (!address) {
|
||||||
|
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
||||||
|
for (const line of lines) {
|
||||||
|
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
|
||||||
|
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
|
||||||
|
address = line;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!card) continue;
|
|
||||||
|
|
||||||
const text = card.innerText || '';
|
|
||||||
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
|
||||||
|
|
||||||
const priceMatch = text.match(/\u00a3([\d,]+)/);
|
|
||||||
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
||||||
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
||||||
const recMatch = text.match(/(\d+)\s*reception/i);
|
const recMatch = text.match(/(\d+)\s*reception/i);
|
||||||
const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i);
|
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
|
||||||
|
|
||||||
let address = '';
|
|
||||||
for (const line of lines) {
|
|
||||||
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
|
|
||||||
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
|
|
||||||
address = line;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let tenure = '';
|
let tenure = '';
|
||||||
if (/freehold/i.test(text)) tenure = 'Freehold';
|
if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||||
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||||
|
|
||||||
results.push({
|
results.push({
|
||||||
id: id,
|
id, url: href.replace(window.location.origin, ''),
|
||||||
url: href.replace(window.location.origin, ''),
|
|
||||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||||
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
||||||
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
||||||
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
||||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||||
address: address,
|
address, tenure,
|
||||||
tenure: tenure,
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Strategy 2: Fall back to href-based link matching with parent-walking
|
||||||
|
if (results.length === 0) {
|
||||||
|
const links = Array.from(document.querySelectorAll(
|
||||||
|
'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
|
||||||
|
));
|
||||||
|
|
||||||
|
for (const link of links) {
|
||||||
|
const href = link.href;
|
||||||
|
const match = href.match(/\/details\/(\d+)\//);
|
||||||
|
if (!match) continue;
|
||||||
|
|
||||||
|
const id = match[1];
|
||||||
|
if (seen.has(id)) continue;
|
||||||
|
seen.add(id);
|
||||||
|
|
||||||
|
let card = link;
|
||||||
|
for (let j = 0; j < 15; j++) {
|
||||||
|
card = card.parentElement;
|
||||||
|
if (!card) break;
|
||||||
|
const t = card.innerText || '';
|
||||||
|
if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!card) continue;
|
||||||
|
|
||||||
|
const text = card.innerText || '';
|
||||||
|
const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
|
||||||
|
|
||||||
|
const priceMatch = text.match(/\u00a3([\d,]+)/);
|
||||||
|
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
||||||
|
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
||||||
|
const recMatch = text.match(/(\d+)\s*reception/i);
|
||||||
|
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
|
||||||
|
|
||||||
|
let address = '';
|
||||||
|
for (const line of lines) {
|
||||||
|
if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
|
||||||
|
(line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
|
||||||
|
address = line;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let tenure = '';
|
||||||
|
if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||||
|
else if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||||
|
|
||||||
|
results.push({
|
||||||
|
id, url: href.replace(window.location.origin, ''),
|
||||||
|
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||||
|
beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
|
||||||
|
baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
|
||||||
|
receptions: recMatch ? parseInt(recMatch[1]) : null,
|
||||||
|
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||||
|
address, tenure,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return results;
|
return results;
|
||||||
}"""
|
}"""
|
||||||
|
|
||||||
|
|
@@ -255,18 +316,44 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
|
||||||
else:
|
else:
|
||||||
search_input.press("Enter")
|
search_input.press("Enter")
|
||||||
|
|
||||||
# Wait for results to load
|
# Wait for results to load — try waiting for listings container, fall back to fixed wait
|
||||||
time.sleep(6)
|
try:
|
||||||
|
page.wait_for_selector(
|
||||||
|
'[data-testid="regular-listings"], a[href*="/details/"]',
|
||||||
|
timeout=10000,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
time.sleep(4)
|
||||||
_ensure_not_challenged(page)
|
_ensure_not_challenged(page)
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def _get_result_count(page) -> int:
|
def _get_result_count(page) -> int:
|
||||||
"""Extract the total results count from the page body text."""
|
"""Extract the total results count from the page.
|
||||||
|
|
||||||
|
Tries __ZAD_TARGETING__ JSON first (most reliable), then body text regex
|
||||||
|
matching both "N results" and "N properties" patterns."""
|
||||||
|
try:
|
||||||
|
# Try the ZAD targeting JSON script tag first
|
||||||
|
count = page.evaluate("""() => {
|
||||||
|
const s = document.querySelector('#__ZAD_TARGETING__');
|
||||||
|
if (s) {
|
||||||
|
try {
|
||||||
|
const d = JSON.parse(s.textContent);
|
||||||
|
if (d.search_results_count != null) return d.search_results_count;
|
||||||
|
} catch(e) {}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}""")
|
||||||
|
if count is not None and count > 0:
|
||||||
|
return count
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
body = page.inner_text("body")
|
body = page.inner_text("body")
|
||||||
match = re.search(r"([\d,]+)\s+results?", body)
|
match = re.search(r"([\d,]+)\s+(?:results?|properties)", body)
|
||||||
if match:
|
if match:
|
||||||
return int(match.group(1).replace(",", ""))
|
return int(match.group(1).replace(",", ""))
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
@@ -279,10 +366,42 @@ def _get_result_count(page) -> int:
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
_first_extraction_logged = False
|
||||||
|
|
||||||
|
|
||||||
def _extract_listings(page) -> list[dict]:
|
def _extract_listings(page) -> list[dict]:
|
||||||
"""Extract listing data from the current search results page DOM."""
|
"""Extract listing data from the current search results page DOM."""
|
||||||
|
global _first_extraction_logged
|
||||||
try:
|
try:
|
||||||
return page.evaluate(_EXTRACT_LISTINGS_JS)
|
listings = page.evaluate(_EXTRACT_LISTINGS_JS)
|
||||||
|
|
||||||
|
# Log diagnostic info on the very first extraction attempt
|
||||||
|
if not _first_extraction_logged:
|
||||||
|
_first_extraction_logged = True
|
||||||
|
try:
|
||||||
|
diag = page.evaluate("""() => {
|
||||||
|
const details = document.querySelectorAll('a[href*="/details/"]');
|
||||||
|
const testids = document.querySelectorAll('[data-testid]');
|
||||||
|
const testidNames = [...new Set([...testids].map(e => e.dataset.testid))];
|
||||||
|
return {
|
||||||
|
url: location.href,
|
||||||
|
title: document.title,
|
||||||
|
detailLinks: details.length,
|
||||||
|
testids: testidNames.slice(0, 30),
|
||||||
|
bodySnippet: document.body?.innerText?.slice(0, 500) || '',
|
||||||
|
};
|
||||||
|
}""")
|
||||||
|
log.info(
|
||||||
|
"Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d "
|
||||||
|
"testids=%s bodySnippet=%.200s",
|
||||||
|
diag.get("url"), diag.get("title"), diag.get("detailLinks", 0),
|
||||||
|
diag.get("testids", []), diag.get("bodySnippet", ""),
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
log.info("Zoopla first extraction: %d listings found", len(listings))
|
||||||
|
|
||||||
|
return listings
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning("Failed to extract listings from DOM: %s", e)
|
log.warning("Failed to extract listings from DOM: %s", e)
|
||||||
zoopla_errors_total.labels(type="extract_failed").inc()
|
zoopla_errors_total.labels(type="extract_failed").inc()
|
||||||
|
|
@@ -502,19 +621,40 @@ def search_outcode(
|
||||||
return []
|
return []
|
||||||
|
|
||||||
total_results = _get_result_count(page)
|
total_results = _get_result_count(page)
|
||||||
if total_results == 0:
|
|
||||||
return []
|
|
||||||
|
|
||||||
raw_listings = _paginate(page, total_results, channel)
|
# Always try extraction even if result count is 0 — the count regex may
|
||||||
|
# not match Zoopla's current text format, but listings may still be in DOM
|
||||||
|
raw_listings = _paginate(page, max(total_results, 25), channel)
|
||||||
if not raw_listings:
|
if not raw_listings:
|
||||||
|
if total_results > 0:
|
||||||
|
log.debug(
|
||||||
|
"Zoopla %s %s: page claims %d results but extraction found 0 — "
|
||||||
|
"DOM selectors may need updating",
|
||||||
|
outcode, channel, total_results,
|
||||||
|
)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
channel_label = "buy" if channel == "BUY" else "rent"
|
channel_label = "buy" if channel == "BUY" else "rent"
|
||||||
properties = []
|
properties = []
|
||||||
|
dropped = 0
|
||||||
for raw in raw_listings:
|
for raw in raw_listings:
|
||||||
transformed = transform_property(raw, channel, pc_index, pc_coords)
|
transformed = transform_property(raw, channel, pc_index, pc_coords)
|
||||||
if transformed:
|
if transformed:
|
||||||
properties.append(transformed)
|
properties.append(transformed)
|
||||||
zoopla_properties_scraped.labels(channel=channel_label).inc()
|
zoopla_properties_scraped.labels(channel=channel_label).inc()
|
||||||
|
else:
|
||||||
|
dropped += 1
|
||||||
|
|
||||||
|
if dropped and not properties:
|
||||||
|
log.debug(
|
||||||
|
"Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
|
||||||
|
"(no price/postcode/coords)",
|
||||||
|
outcode, channel, len(raw_listings), dropped,
|
||||||
|
)
|
||||||
|
elif dropped > len(raw_listings) // 2:
|
||||||
|
log.debug(
|
||||||
|
"Zoopla %s %s: %d/%d listings dropped in transform",
|
||||||
|
outcode, channel, dropped, len(raw_listings),
|
||||||
|
)
|
||||||
|
|
||||||
return properties
|
return properties
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue