diff --git a/.forgejo/workflows/ci.yml b/.forgejo/workflows/ci.yml deleted file mode 100644 index c333373..0000000 --- a/.forgejo/workflows/ci.yml +++ /dev/null @@ -1,94 +0,0 @@ -name: CI - -on: - push: - branches: [main] - pull_request: - branches: [main] - -concurrency: - group: ${{ gitea.workflow }}-${{ gitea.ref }} - cancel-in-progress: true - -jobs: - python: - name: Python (lint + test) - runs-on: docker - steps: - - uses: actions/checkout@v4 - - - uses: astral-sh/setup-uv@v4 - with: - enable-cache: true - - - name: Install dependencies - run: uv sync - - - name: Ruff check - run: uv run ruff check . - - - name: Deptry (unused dependencies) - run: uv run deptry . - - - name: Tests - run: | - uv run pytest pipeline/utils/test_haversine.py - uv run pytest pipeline/utils/test_poi_counts.py - uv run pytest pipeline/transform/postcode_boundaries/test_postcode_boundaries.py - - frontend: - name: Frontend (lint + typecheck) - runs-on: docker - defaults: - run: - working-directory: frontend - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-node@v4 - with: - node-version: 22 - cache: npm - cache-dependency-path: frontend/package-lock.json - - - name: Install dependencies - run: npm ci - - - name: ESLint - run: npm run lint - - - name: Prettier check - run: npm run format:check - - - name: TypeScript typecheck - run: npm run typecheck - - rust: - name: Rust (lint + test) - runs-on: docker - defaults: - run: - working-directory: server-rs - steps: - - uses: actions/checkout@v4 - - - uses: dtolnay/rust-toolchain@stable - - - uses: Swatinem/rust-cache@v2 - with: - workspaces: server-rs - - - name: Clippy - run: cargo clippy -- -D warnings - - - name: Format check - run: cargo fmt --check - - - name: Install cargo-machete - run: cargo install cargo-machete - - - name: Unused dependencies check - run: cargo machete - - - name: Tests - run: cargo test diff --git a/.forgejo/workflows/docker-publish.yml b/.forgejo/workflows/docker-publish.yml deleted file mode 100644 index a53ccc2..0000000 --- a/.forgejo/workflows/docker-publish.yml +++ /dev/null @@ -1,94 +0,0 @@ -name: Build and publish Docker image - -on: - push: - branches: [main] - tags: ["v*"] - workflow_dispatch: - -env: - REGISTRY: ${{ gitea.server_url }} - IMAGE_NAME: ${{ gitea.repository }} - -jobs: - build-and-push: - runs-on: docker - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up uv - uses: astral-sh/setup-uv@v4 - - - name: Download map assets (fonts, sprites, twemoji) - run: uv run python -m pipeline.download.map_assets --output frontend/public/assets - - - name: Download arcgis data for finder - run: uv run python -m pipeline.download.arcgis --output property-data/arcgis_data.parquet - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Forgejo Container Registry - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ gitea.actor }} - password: ${{ secrets.GITEA_TOKEN }} - - - name: Determine image tags - id: tags - run: | - REPO=$(echo "${{ env.IMAGE_NAME }}" | tr '[:upper:]' '[:lower:]') - SHA_SHORT=$(echo "${{ gitea.sha }}" | cut -c1-7) - TAGS="${{ env.REGISTRY }}/${REPO}:sha-${SHA_SHORT}" - - # Add latest tag on default branch - if [ "${{ gitea.ref }}" = "refs/heads/main" ]; then - TAGS="${TAGS},${{ env.REGISTRY }}/${REPO}:latest" - fi - - # Add version tags for semver tags - REF="${{ gitea.ref }}" - if [[ "$REF" =~ ^refs/tags/v([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then - VERSION="${BASH_REMATCH[1]}.${BASH_REMATCH[2]}.${BASH_REMATCH[3]}" - MINOR="${BASH_REMATCH[1]}.${BASH_REMATCH[2]}" - TAGS="${TAGS},${{ env.REGISTRY }}/${REPO}:${VERSION}" - TAGS="${TAGS},${{ env.REGISTRY }}/${REPO}:${MINOR}" - fi - - echo "tags=${TAGS}" >> "$GITHUB_OUTPUT" - echo "repo=${REPO}" >> "$GITHUB_OUTPUT" - echo "sha_short=${SHA_SHORT}" >> "$GITHUB_OUTPUT" - - - name: Build and push - uses: docker/build-push-action@v6 - with: - context: . - push: true - tags: ${{ steps.tags.outputs.tags }} - cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ steps.tags.outputs.repo }}:buildcache - cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ steps.tags.outputs.repo }}:buildcache,mode=max - - - name: Build and push screenshot service - uses: docker/build-push-action@v6 - with: - context: ./screenshot - push: true - tags: | - ${{ env.REGISTRY }}/${{ steps.tags.outputs.repo }}-screenshot:latest - ${{ env.REGISTRY }}/${{ steps.tags.outputs.repo }}-screenshot:sha-${{ steps.tags.outputs.sha_short }} - cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ steps.tags.outputs.repo }}-screenshot:buildcache - cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ steps.tags.outputs.repo }}-screenshot:buildcache,mode=max - - - name: Build and push finder service - uses: docker/build-push-action@v6 - with: - context: . - file: Dockerfile.finder - push: true - tags: | - ${{ env.REGISTRY }}/${{ steps.tags.outputs.repo }}-finder:latest - ${{ env.REGISTRY }}/${{ steps.tags.outputs.repo }}-finder:sha-${{ steps.tags.outputs.sha_short }} - cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ steps.tags.outputs.repo }}-finder:buildcache - cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ steps.tags.outputs.repo }}-finder:buildcache,mode=max diff --git a/Dockerfile.finder b/Dockerfile.finder index c0ab6d6..52ef745 100644 --- a/Dockerfile.finder +++ b/Dockerfile.finder @@ -5,10 +5,6 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv WORKDIR /app COPY finder/pyproject.toml ./ RUN uv pip install --system -r pyproject.toml -RUN playwright install-deps chromium firefox -RUN playwright install chromium -RUN camoufox fetch \ - && python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)" COPY finder/*.py ./ COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet diff --git a/finder/zoopla.py b/finder/zoopla.py index 052794f..ecd8a4b 100644 --- a/finder/zoopla.py +++ b/finder/zoopla.py @@ -41,23 +41,17 @@ class TurnstileError(Exception): MAX_PAGES_PER_OUTCODE = 10 # JavaScript to extract listings from the rendered DOM. -# Uses data-testid attributes as primary selectors (stable across deployments), -# then falls back to href-based link matching with parent-walking. +# Finds all detail links, walks up to the card container, and parses +# price, beds, baths, floor area, address, and tenure from the card text. _EXTRACT_LISTINGS_JS = r"""() => { + const links = Array.from(document.querySelectorAll( + 'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]' + )); + const seen = new Set(); const results = []; - // Strategy 1: Use data-testid selectors (post-2025 redesign) - const listingCards = document.querySelectorAll( - '[data-testid="regular-listings"] > div, [data-testid="search-content"] li' - ); - - for (const card of listingCards) { - const link = card.querySelector( - 'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]' - ); - if (!link) continue; - + for (const link of links) { const href = link.href; const match = href.match(/\/details\/(\d+)\//); if (!match) continue; @@ -66,108 +60,53 @@ _EXTRACT_LISTINGS_JS = r"""() => { if (seen.has(id)) continue; seen.add(id); - const text = card.innerText || ''; - - // Try data-testid price element first, then regex - const priceEl = card.querySelector('[data-testid="listing-price"]'); - const priceText = priceEl ? priceEl.innerText : text; - const priceMatch = priceText.match(/\u00a3([\d,]+)/); - - // Try address element first, then regex - const addressEl = card.querySelector('address'); - let address = addressEl ? addressEl.innerText.trim() : ''; - - if (!address) { - const lines = text.split('\n').map(l => l.trim()).filter(Boolean); - for (const line of lines) { - if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) || - (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) { - address = line; - break; - } + // Walk up to the listing card container + let card = link; + for (let j = 0; j < 10; j++) { + card = card.parentElement; + if (!card) break; + const text = card.innerText || ''; + if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) { + break; } } + if (!card) continue; + const text = card.innerText || ''; + const lines = text.split('\n').map(l => l.trim()).filter(Boolean); + + const priceMatch = text.match(/\u00a3([\d,]+)/); const bedsMatch = text.match(/(\d+)\s*beds?/i); const bathsMatch = text.match(/(\d+)\s*baths?/i); const recMatch = text.match(/(\d+)\s*reception/i); - const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i); + const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i); + + let address = ''; + for (const line of lines) { + if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) || + (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) { + address = line; + break; + } + } let tenure = ''; if (/freehold/i.test(text)) tenure = 'Freehold'; else if (/leasehold/i.test(text)) tenure = 'Leasehold'; results.push({ - id, url: href.replace(window.location.origin, ''), + id: id, + url: href.replace(window.location.origin, ''), price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, beds: bedsMatch ? parseInt(bedsMatch[1]) : null, baths: bathsMatch ? parseInt(bathsMatch[1]) : null, receptions: recMatch ? parseInt(recMatch[1]) : null, floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, - address, tenure, + address: address, + tenure: tenure, }); } - // Strategy 2: Fall back to href-based link matching with parent-walking - if (results.length === 0) { - const links = Array.from(document.querySelectorAll( - 'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]' - )); - - for (const link of links) { - const href = link.href; - const match = href.match(/\/details\/(\d+)\//); - if (!match) continue; - - const id = match[1]; - if (seen.has(id)) continue; - seen.add(id); - - let card = link; - for (let j = 0; j < 15; j++) { - card = card.parentElement; - if (!card) break; - const t = card.innerText || ''; - if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) { - break; - } - } - if (!card) continue; - - const text = card.innerText || ''; - const lines = text.split('\n').map(l => l.trim()).filter(Boolean); - - const priceMatch = text.match(/\u00a3([\d,]+)/); - const bedsMatch = text.match(/(\d+)\s*beds?/i); - const bathsMatch = text.match(/(\d+)\s*baths?/i); - const recMatch = text.match(/(\d+)\s*reception/i); - const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i); - - let address = ''; - for (const line of lines) { - if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) || - (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) { - address = line; - break; - } - } - - let tenure = ''; - if (/freehold/i.test(text)) tenure = 'Freehold'; - else if (/leasehold/i.test(text)) tenure = 'Leasehold'; - - results.push({ - id, url: href.replace(window.location.origin, ''), - price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null, - beds: bedsMatch ? parseInt(bedsMatch[1]) : null, - baths: bathsMatch ? parseInt(bathsMatch[1]) : null, - receptions: recMatch ? parseInt(recMatch[1]) : null, - floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null, - address, tenure, - }); - } - } - return results; }""" @@ -316,44 +255,18 @@ def _navigate_search(page, outcode: str, channel: str) -> bool: else: search_input.press("Enter") - # Wait for results to load — try waiting for listings container, fall back to fixed wait - try: - page.wait_for_selector( - '[data-testid="regular-listings"], a[href*="/details/"]', - timeout=10000, - ) - except Exception: - time.sleep(4) + # Wait for results to load + time.sleep(6) _ensure_not_challenged(page) return True def _get_result_count(page) -> int: - """Extract the total results count from the page. - - Tries __ZAD_TARGETING__ JSON first (most reliable), then body text regex - matching both "N results" and "N properties" patterns.""" - try: - # Try the ZAD targeting JSON script tag first - count = page.evaluate("""() => { - const s = document.querySelector('#__ZAD_TARGETING__'); - if (s) { - try { - const d = JSON.parse(s.textContent); - if (d.search_results_count != null) return d.search_results_count; - } catch(e) {} - } - return null; - }""") - if count is not None and count > 0: - return count - except Exception: - pass - + """Extract the total results count from the page body text.""" try: body = page.inner_text("body") - match = re.search(r"([\d,]+)\s+(?:results?|properties)", body) + match = re.search(r"([\d,]+)\s+results?", body) if match: return int(match.group(1).replace(",", "")) except Exception: @@ -366,42 +279,10 @@ def _get_result_count(page) -> int: # --------------------------------------------------------------------------- -_first_extraction_logged = False - - def _extract_listings(page) -> list[dict]: """Extract listing data from the current search results page DOM.""" - global _first_extraction_logged try: - listings = page.evaluate(_EXTRACT_LISTINGS_JS) - - # Log diagnostic info on the very first extraction attempt - if not _first_extraction_logged: - _first_extraction_logged = True - try: - diag = page.evaluate("""() => { - const details = document.querySelectorAll('a[href*="/details/"]'); - const testids = document.querySelectorAll('[data-testid]'); - const testidNames = [...new Set([...testids].map(e => e.dataset.testid))]; - return { - url: location.href, - title: document.title, - detailLinks: details.length, - testids: testidNames.slice(0, 30), - bodySnippet: document.body?.innerText?.slice(0, 500) || '', - }; - }""") - log.info( - "Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d " - "testids=%s bodySnippet=%.200s", - diag.get("url"), diag.get("title"), diag.get("detailLinks", 0), - diag.get("testids", []), diag.get("bodySnippet", ""), - ) - except Exception: - pass - log.info("Zoopla first extraction: %d listings found", len(listings)) - - return listings + return page.evaluate(_EXTRACT_LISTINGS_JS) except Exception as e: log.warning("Failed to extract listings from DOM: %s", e) zoopla_errors_total.labels(type="extract_failed").inc() @@ -621,40 +502,19 @@ def search_outcode( return [] total_results = _get_result_count(page) + if total_results == 0: + return [] - # Always try extraction even if result count is 0 — the count regex may - # not match Zoopla's current text format, but listings may still be in DOM - raw_listings = _paginate(page, max(total_results, 25), channel) + raw_listings = _paginate(page, total_results, channel) if not raw_listings: - if total_results > 0: - log.debug( - "Zoopla %s %s: page claims %d results but extraction found 0 — " - "DOM selectors may need updating", - outcode, channel, total_results, - ) return [] channel_label = "buy" if channel == "BUY" else "rent" properties = [] - dropped = 0 for raw in raw_listings: transformed = transform_property(raw, channel, pc_index, pc_coords) if transformed: properties.append(transformed) zoopla_properties_scraped.labels(channel=channel_label).inc() - else: - dropped += 1 - - if dropped and not properties: - log.debug( - "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform " - "(no price/postcode/coords)", - outcode, channel, len(raw_listings), dropped, - ) - elif dropped > len(raw_listings) // 2: - log.debug( - "Zoopla %s %s: %d/%d listings dropped in transform", - outcode, channel, dropped, len(raw_listings), - ) return properties