Fix zoopla

Migrate to forgejo
2026-03-24 08:12:23 +00:00 · 2026-03-24 08:12:12 +00:00
4 changed files with 374 additions and 42 deletions
--- a/.forgejo/workflows/ci.yml
+++ b/.forgejo/workflows/ci.yml
@ -0,0 +1,94 @@
+# Lint and test every component on pushes to main and on pull requests
+# that target main.
+name: CI
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+# Runs share a group per workflow and ref; starting a new run cancels the
+# one still in progress for that group.
+concurrency:
+  group: ${{ gitea.workflow }}-${{ gitea.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Python checks: dependency install via uv, Ruff lint, deptry scan for
+  # unused dependencies, then a fixed list of test modules.
+  python:
+    name: Python (lint + test)
+    runs-on: docker
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: astral-sh/setup-uv@v4
+        with:
+          enable-cache: true
+
+      - name: Install dependencies
+        run: uv sync
+
+      - name: Ruff check
+        run: uv run ruff check .
+
+      - name: Deptry (unused dependencies)
+        run: uv run deptry .
+
+      # Only these three test modules are executed, each in its own pytest
+      # invocation; other tests in the repository are not run here.
+      - name: Tests
+        run: |
+          uv run pytest pipeline/utils/test_haversine.py
+          uv run pytest pipeline/utils/test_poi_counts.py
+          uv run pytest pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
+
+  # Frontend checks, all executed from the frontend/ directory:
+  # ESLint, Prettier formatting check, and the TypeScript typecheck.
+  frontend:
+    name: Frontend (lint + typecheck)
+    runs-on: docker
+    defaults:
+      run:
+        working-directory: frontend
+    steps:
+      - uses: actions/checkout@v4
+
+      # The npm cache is keyed on the lockfile inside frontend/, so the path
+      # is given relative to the repository root.
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+          cache: npm
+          cache-dependency-path: frontend/package-lock.json
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: ESLint
+        run: npm run lint
+
+      - name: Prettier check
+        run: npm run format:check
+
+      - name: TypeScript typecheck
+        run: npm run typecheck
+
+  # Rust checks for the project under server-rs/: Clippy with warnings
+  # treated as errors, rustfmt check, unused-dependency scan, and tests.
+  rust:
+    name: Rust (lint + test)
+    runs-on: docker
+    defaults:
+      run:
+        working-directory: server-rs
+    steps:
+      - uses: actions/checkout@v4
+
+      # The toolchain action installs rustup's minimal profile, which does
+      # not ship clippy or rustfmt. Request them explicitly so the lint and
+      # format steps do not depend on what the runner image preinstalls.
+      - uses: dtolnay/rust-toolchain@stable
+        with:
+          components: clippy, rustfmt
+
+      - uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: server-rs
+
+      - name: Clippy
+        run: cargo clippy -- -D warnings
+
+      - name: Format check
+        run: cargo fmt --check
+
+      # --locked builds the tool against the Cargo.lock published with the
+      # crate, so a new transitive release cannot break the install step.
+      - name: Install cargo-machete
+        run: cargo install --locked cargo-machete
+
+      - name: Unused dependencies check
+        run: cargo machete
+
+      - name: Tests
+        run: cargo test
--- a/.forgejo/workflows/docker-publish.yml
+++ b/.forgejo/workflows/docker-publish.yml
@ -0,0 +1,94 @@
+# Builds and pushes three images (main app, screenshot service, finder
+# service) to the Forgejo container registry on pushes to main, on v* tags,
+# and on manual dispatch.
+name: Build and publish Docker image
+
+on:
+  push:
+    branches: [main]
+    tags: ["v*"]
+  workflow_dispatch:
+
+env:
+  IMAGE_NAME: ${{ gitea.repository }}
+
+jobs:
+  build-and-push:
+    runs-on: docker
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up uv
+        uses: astral-sh/setup-uv@v4
+
+      - name: Download map assets (fonts, sprites, twemoji)
+        run: uv run python -m pipeline.download.map_assets --output frontend/public/assets
+
+      # Dockerfile.finder copies this parquet file into the image, so it
+      # must exist in the build context before the finder build runs.
+      - name: Download arcgis data for finder
+        run: uv run python -m pipeline.download.arcgis --output property-data/arcgis_data.parquet
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      # Runs before the login step because it also derives the registry
+      # host that the login and every build step consume.
+      - name: Determine registry and image tags
+        id: tags
+        # The script relies on [[ =~ ]] and BASH_REMATCH, which need bash.
+        shell: bash
+        run: |
+          # server_url includes the scheme (e.g. https://host); an image
+          # reference must start with the bare host, so strip the scheme
+          # and any trailing slash.
+          REGISTRY="${{ gitea.server_url }}"
+          REGISTRY="${REGISTRY#*://}"
+          REGISTRY="${REGISTRY%/}"
+
+          # Image repository names must be lowercase.
+          REPO=$(echo "${{ env.IMAGE_NAME }}" | tr '[:upper:]' '[:lower:]')
+          SHA_SHORT=$(echo "${{ gitea.sha }}" | cut -c1-7)
+          TAGS="${REGISTRY}/${REPO}:sha-${SHA_SHORT}"
+
+          # Add latest tag on default branch
+          if [ "${{ gitea.ref }}" = "refs/heads/main" ]; then
+            TAGS="${TAGS},${REGISTRY}/${REPO}:latest"
+          fi
+
+          # Add version tags for semver tags
+          REF="${{ gitea.ref }}"
+          if [[ "$REF" =~ ^refs/tags/v([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then
+            VERSION="${BASH_REMATCH[1]}.${BASH_REMATCH[2]}.${BASH_REMATCH[3]}"
+            MINOR="${BASH_REMATCH[1]}.${BASH_REMATCH[2]}"
+            TAGS="${TAGS},${REGISTRY}/${REPO}:${VERSION}"
+            TAGS="${TAGS},${REGISTRY}/${REPO}:${MINOR}"
+          fi
+
+          echo "registry=${REGISTRY}" >> "$GITHUB_OUTPUT"
+          echo "tags=${TAGS}" >> "$GITHUB_OUTPUT"
+          echo "repo=${REPO}" >> "$GITHUB_OUTPUT"
+          echo "sha_short=${SHA_SHORT}" >> "$GITHUB_OUTPUT"
+
+      - name: Log in to Forgejo Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ steps.tags.outputs.registry }}
+          username: ${{ gitea.actor }}
+          password: ${{ secrets.GITEA_TOKEN }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          tags: ${{ steps.tags.outputs.tags }}
+          cache-from: type=registry,ref=${{ steps.tags.outputs.registry }}/${{ steps.tags.outputs.repo }}:buildcache
+          cache-to: type=registry,ref=${{ steps.tags.outputs.registry }}/${{ steps.tags.outputs.repo }}:buildcache,mode=max
+
+      - name: Build and push screenshot service
+        uses: docker/build-push-action@v6
+        with:
+          context: ./screenshot
+          push: true
+          tags: |
+            ${{ steps.tags.outputs.registry }}/${{ steps.tags.outputs.repo }}-screenshot:latest
+            ${{ steps.tags.outputs.registry }}/${{ steps.tags.outputs.repo }}-screenshot:sha-${{ steps.tags.outputs.sha_short }}
+          cache-from: type=registry,ref=${{ steps.tags.outputs.registry }}/${{ steps.tags.outputs.repo }}-screenshot:buildcache
+          cache-to: type=registry,ref=${{ steps.tags.outputs.registry }}/${{ steps.tags.outputs.repo }}-screenshot:buildcache,mode=max
+
+      - name: Build and push finder service
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: Dockerfile.finder
+          push: true
+          tags: |
+            ${{ steps.tags.outputs.registry }}/${{ steps.tags.outputs.repo }}-finder:latest
+            ${{ steps.tags.outputs.registry }}/${{ steps.tags.outputs.repo }}-finder:sha-${{ steps.tags.outputs.sha_short }}
+          cache-from: type=registry,ref=${{ steps.tags.outputs.registry }}/${{ steps.tags.outputs.repo }}-finder:buildcache
+          cache-to: type=registry,ref=${{ steps.tags.outputs.registry }}/${{ steps.tags.outputs.repo }}-finder:buildcache,mode=max
--- a/Dockerfile.finder
+++ b/Dockerfile.finder
@ -5,6 +5,10 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 WORKDIR /app
 COPY finder/pyproject.toml ./
 RUN uv pip install --system -r pyproject.toml
+# Install the OS-level libraries Playwright lists for Chromium and Firefox.
+RUN playwright install-deps chromium firefox
+# Only the Chromium browser binary is downloaded through Playwright.
+RUN playwright install chromium
+# Fetch Camoufox at build time, then fail the build early if its path cannot
+# be resolved without triggering another download.
+RUN camoufox fetch \
+    && python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)"

 COPY finder/*.py ./
 COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet
--- a/finder/zoopla.py
+++ b/finder/zoopla.py
@ -41,17 +41,23 @@ class TurnstileError(Exception):
 MAX_PAGES_PER_OUTCODE = 10

 # JavaScript to extract listings from the rendered DOM.
-# Finds all detail links, walks up to the card container, and parses
-# price, beds, baths, floor area, address, and tenure from the card text.
+# Uses data-testid attributes as primary selectors (stable across deployments),
+# then falls back to href-based link matching with parent-walking.
 _EXTRACT_LISTINGS_JS = r"""() => {
-    const links = Array.from(document.querySelectorAll(
-        'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
-    ));
-
    const seen = new Set();
    const results = [];

-    for (const link of links) {
+    // Strategy 1: Use data-testid selectors (post-2025 redesign)
+    const listingCards = document.querySelectorAll(
+        '[data-testid="regular-listings"] > div, [data-testid="search-content"] li'
+    );
+
+    for (const card of listingCards) {
+        const link = card.querySelector(
+            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
+        );
+        if (!link) continue;
+
        const href = link.href;
        const match = href.match(/\/details\/(\d+)\//);
        if (!match) continue;
@ -60,53 +66,108 @@ _EXTRACT_LISTINGS_JS = r"""() => {
        if (seen.has(id)) continue;
        seen.add(id);

-        // Walk up to the listing card container
-        let card = link;
-        for (let j = 0; j < 10; j++) {
-            card = card.parentElement;
-            if (!card) break;
-            const text = card.innerText || '';
-            if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) {
-                break;
+        const text = card.innerText || '';
+
+        // Try data-testid price element first, then regex
+        const priceEl = card.querySelector('[data-testid="listing-price"]');
+        const priceText = priceEl ? priceEl.innerText : text;
+        const priceMatch = priceText.match(/\u00a3([\d,]+)/);
+
+        // Try address element first, then regex
+        const addressEl = card.querySelector('address');
+        let address = addressEl ? addressEl.innerText.trim() : '';
+
+        if (!address) {
+            const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
+            for (const line of lines) {
+                if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
+                    (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
+                    address = line;
+                    break;
+                }
            }
        }
-        if (!card) continue;

-        const text = card.innerText || '';
-        const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
-
-        const priceMatch = text.match(/\u00a3([\d,]+)/);
        const bedsMatch = text.match(/(\d+)\s*beds?/i);
        const bathsMatch = text.match(/(\d+)\s*baths?/i);
        const recMatch = text.match(/(\d+)\s*reception/i);
-        const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i);
-
-        let address = '';
-        for (const line of lines) {
-            if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
-                (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
-                address = line;
-                break;
-            }
-        }
+        const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);

        let tenure = '';
        if (/freehold/i.test(text)) tenure = 'Freehold';
        else if (/leasehold/i.test(text)) tenure = 'Leasehold';

        results.push({
-            id: id,
-            url: href.replace(window.location.origin, ''),
+            id, url: href.replace(window.location.origin, ''),
            price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
            beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
            baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
            receptions: recMatch ? parseInt(recMatch[1]) : null,
            floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
-            address: address,
-            tenure: tenure,
+            address, tenure,
        });
    }

+    // Strategy 2: Fall back to href-based link matching with parent-walking
+    if (results.length === 0) {
+        const links = Array.from(document.querySelectorAll(
+            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
+        ));
+
+        for (const link of links) {
+            const href = link.href;
+            const match = href.match(/\/details\/(\d+)\//);
+            if (!match) continue;
+
+            const id = match[1];
+            if (seen.has(id)) continue;
+            seen.add(id);
+
+            let card = link;
+            for (let j = 0; j < 15; j++) {
+                card = card.parentElement;
+                if (!card) break;
+                const t = card.innerText || '';
+                if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) {
+                    break;
+                }
+            }
+            if (!card) continue;
+
+            const text = card.innerText || '';
+            const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
+
+            const priceMatch = text.match(/\u00a3([\d,]+)/);
+            const bedsMatch = text.match(/(\d+)\s*beds?/i);
+            const bathsMatch = text.match(/(\d+)\s*baths?/i);
+            const recMatch = text.match(/(\d+)\s*reception/i);
+            const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
+
+            let address = '';
+            for (const line of lines) {
+                if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
+                    (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
+                    address = line;
+                    break;
+                }
+            }
+
+            let tenure = '';
+            if (/freehold/i.test(text)) tenure = 'Freehold';
+            else if (/leasehold/i.test(text)) tenure = 'Leasehold';
+
+            results.push({
+                id, url: href.replace(window.location.origin, ''),
+                price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
+                beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
+                baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
+                receptions: recMatch ? parseInt(recMatch[1]) : null,
+                floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
+                address, tenure,
+            });
+        }
+    }
+
    return results;
 }"""

@ -255,18 +316,44 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
    else:
        search_input.press("Enter")

-    # Wait for results to load
-    time.sleep(6)
+    # Wait for results to load — try waiting for listings container, fall back to fixed wait
+    try:
+        page.wait_for_selector(
+            '[data-testid="regular-listings"], a[href*="/details/"]',
+            timeout=10000,
+        )
+    except Exception:
+        time.sleep(4)
    _ensure_not_challenged(page)

    return True


 def _get_result_count(page) -> int:
-    """Extract the total results count from the page body text."""
+    """Extract the total results count from the page.
+
+    Tries __ZAD_TARGETING__ JSON first (most reliable), then body text regex
+    matching both "N results" and "N properties" patterns."""
+    try:
+        # Try the ZAD targeting JSON script tag first
+        count = page.evaluate("""() => {
+            const s = document.querySelector('#__ZAD_TARGETING__');
+            if (s) {
+                try {
+                    const d = JSON.parse(s.textContent);
+                    if (d.search_results_count != null) return d.search_results_count;
+                } catch(e) {}
+            }
+            return null;
+        }""")
+        if count is not None and count > 0:
+            return count
+    except Exception:
+        pass
+
    try:
        body = page.inner_text("body")
-        match = re.search(r"([\d,]+)\s+results?", body)
+        match = re.search(r"([\d,]+)\s+(?:results?|properties)", body)
        if match:
            return int(match.group(1).replace(",", ""))
    except Exception:
@ -279,10 +366,42 @@ def _get_result_count(page) -> int:
 # ---------------------------------------------------------------------------


+_first_extraction_logged = False
+
+
 def _extract_listings(page) -> list[dict]:
    """Extract listing data from the current search results page DOM."""
+    global _first_extraction_logged
    try:
-        return page.evaluate(_EXTRACT_LISTINGS_JS)
+        listings = page.evaluate(_EXTRACT_LISTINGS_JS)
+
+        # Log diagnostic info on the very first extraction attempt
+        if not _first_extraction_logged:
+            _first_extraction_logged = True
+            try:
+                diag = page.evaluate("""() => {
+                    const details = document.querySelectorAll('a[href*="/details/"]');
+                    const testids = document.querySelectorAll('[data-testid]');
+                    const testidNames = [...new Set([...testids].map(e => e.dataset.testid))];
+                    return {
+                        url: location.href,
+                        title: document.title,
+                        detailLinks: details.length,
+                        testids: testidNames.slice(0, 30),
+                        bodySnippet: document.body?.innerText?.slice(0, 500) || '',
+                    };
+                }""")
+                log.info(
+                    "Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d "
+                    "testids=%s bodySnippet=%.200s",
+                    diag.get("url"), diag.get("title"), diag.get("detailLinks", 0),
+                    diag.get("testids", []), diag.get("bodySnippet", ""),
+                )
+            except Exception:
+                pass
+            log.info("Zoopla first extraction: %d listings found", len(listings))
+
+        return listings
    except Exception as e:
        log.warning("Failed to extract listings from DOM: %s", e)
        zoopla_errors_total.labels(type="extract_failed").inc()
@ -502,19 +621,40 @@ def search_outcode(
        return []

    total_results = _get_result_count(page)
-    if total_results == 0:
-        return []

-    raw_listings = _paginate(page, total_results, channel)
+    # Always try extraction even if result count is 0 — the count regex may
+    # not match Zoopla's current text format, but listings may still be in DOM
+    raw_listings = _paginate(page, max(total_results, 25), channel)
    if not raw_listings:
+        if total_results > 0:
+            log.debug(
+                "Zoopla %s %s: page claims %d results but extraction found 0 — "
+                "DOM selectors may need updating",
+                outcode, channel, total_results,
+            )
        return []

    channel_label = "buy" if channel == "BUY" else "rent"
    properties = []
+    dropped = 0
    for raw in raw_listings:
        transformed = transform_property(raw, channel, pc_index, pc_coords)
        if transformed:
            properties.append(transformed)
            zoopla_properties_scraped.labels(channel=channel_label).inc()
+        else:
+            dropped += 1
+
+    if dropped and not properties:
+        log.debug(
+            "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
+            "(no price/postcode/coords)",
+            outcode, channel, len(raw_listings), dropped,
+        )
+    elif dropped > len(raw_listings) // 2:
+        log.debug(
+            "Zoopla %s %s: %d/%d listings dropped in transform",
+            outcode, channel, dropped, len(raw_listings),
+        )

    return properties
Author	SHA1	Message	Date
Andras Schmelczer	4f61c702b1	Fix zoopla Some checks failed CI / Rust (lint + test) (push) Failing after 6m54s Details CI / Python (lint + test) (push) Failing after 7m8s Details CI / Frontend (lint + typecheck) (push) Successful in 8m55s Details Build and publish Docker image / build-and-push (push) Failing after 3m8s Details	2026-03-24 08:12:23 +00:00
Andras Schmelczer	13980a2887	Migrate to forgejo	2026-03-24 08:12:12 +00:00