Fix zoopla

Migrate to forgejo
2026-03-24 08:12:23 +00:00 · 2026-03-24 08:12:12 +00:00
4 changed files with 374 additions and 42 deletions
--- a/.forgejo/workflows/ci.yml
+++ b/.forgejo/workflows/ci.yml
@ -0,0 +1,94 @@
 # Continuous integration workflow: lints and tests the Python, frontend and
 # Rust parts of the repository in three separate jobs.
 name: CI
 # Run on pushes to main and on pull requests that target main.
 on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
 # Runs are grouped by workflow and ref; a newer run in the same group
 # cancels the one still in progress.
 concurrency:
  group: ${{ gitea.workflow }}-${{ gitea.ref }}
  cancel-in-progress: true
 jobs:
  # Python job: install dependencies with uv, then run ruff, deptry and the
  # listed pytest files.
  python:
    name: Python (lint + test)
    runs-on: docker
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v4
        with:
          enable-cache: true
      - name: Install dependencies
        run: uv sync
      - name: Ruff check
        run: uv run ruff check .
      - name: Deptry (unused dependencies)
        run: uv run deptry .
      # Only these three test files are run, each in its own pytest invocation.
      - name: Tests
        run: |
          uv run pytest pipeline/utils/test_haversine.py
          uv run pytest pipeline/utils/test_poi_counts.py
          uv run pytest pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
  # Frontend job: install npm dependencies, then run ESLint, the Prettier
  # check and the TypeScript typecheck.
  frontend:
    name: Frontend (lint + typecheck)
    runs-on: docker
    # Every `run` step below executes inside the frontend/ directory.
    defaults:
      run:
        working-directory: frontend
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 22
          cache: npm
          # `with` inputs are not affected by defaults.run.working-directory,
          # so the lockfile path is given relative to the repository root.
          cache-dependency-path: frontend/package-lock.json
      - name: Install dependencies
        run: npm ci
      - name: ESLint
        run: npm run lint
      - name: Prettier check
        run: npm run format:check
      - name: TypeScript typecheck
        run: npm run typecheck
  # Rust job: clippy, rustfmt check, unused-dependency check and tests for the
  # crate in server-rs/.
  rust:
    name: Rust (lint + test)
    runs-on: docker
    # Every `run` step below executes inside the server-rs/ directory.
    defaults:
      run:
        working-directory: server-rs
    steps:
      - uses: actions/checkout@v4
      # This action installs the toolchain with rustup's minimal profile, which
      # does not include clippy or rustfmt. Request both explicitly so the
      # Clippy and Format check steps do not depend on what the runner image
      # happens to ship.
      - uses: dtolnay/rust-toolchain@stable
        with:
          components: clippy, rustfmt
      - uses: Swatinem/rust-cache@v2
        with:
          workspaces: server-rs
      - name: Clippy
        run: cargo clippy -- -D warnings
      - name: Format check
        run: cargo fmt --check
      # --locked builds the tool with the dependency versions from its own
      # published lockfile, so the install is reproducible between runs.
      - name: Install cargo-machete
        run: cargo install --locked cargo-machete
      - name: Unused dependencies check
        run: cargo machete
      - name: Tests
        run: cargo test
--- a/.forgejo/workflows/docker-publish.yml
+++ b/.forgejo/workflows/docker-publish.yml
@ -0,0 +1,94 @@
 # Builds the container images and pushes them to the registry.
 name: Build and publish Docker image
 # Run on pushes to main, on pushes of tags starting with "v", and on manual
 # dispatch.
 on:
  push:
    branches: [main]
    tags: ["v*"]
  workflow_dispatch:
 # Shared by every step in the workflow.
 # NOTE(review): gitea.server_url presumably includes the URL scheme
 # (e.g. "https://") - confirm that is acceptable wherever REGISTRY is used to
 # build an image reference.
 env:
  REGISTRY: ${{ gitea.server_url }}
  IMAGE_NAME: ${{ gitea.repository }}
 jobs:
  # Downloads the data files, then builds and pushes three images: the main
  # image, the screenshot service and the finder service.
  build-and-push:
    runs-on: docker
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Set up uv
        uses: astral-sh/setup-uv@v4
      # Data files are downloaded before any image is built.
      - name: Download map assets (fonts, sprites, twemoji)
        run: uv run python -m pipeline.download.map_assets --output frontend/public/assets
      - name: Download arcgis data for finder
        run: uv run python -m pipeline.download.arcgis --output property-data/arcgis_data.parquet
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      # REGISTRY is derived from the server URL and therefore carries a URL
      # scheme, which is not allowed inside an image reference
      # ("https://host/repo:tag" is rejected). Strip the scheme and any
      # trailing slash once here and use the bare host in every later step.
      - name: Resolve registry host
        id: registry
        env:
          REGISTRY_URL: ${{ env.REGISTRY }}
        run: |
          HOST="${REGISTRY_URL#*://}"
          echo "host=${HOST%/}" >> "$GITHUB_OUTPUT"
      - name: Log in to Forgejo Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ steps.registry.outputs.host }}
          username: ${{ gitea.actor }}
          password: ${{ secrets.GITEA_TOKEN }}
      # Outputs: `tags` (comma-separated list for the main image), `repo`
      # (lower-cased repository path) and `sha_short` (first 7 characters of
      # the commit SHA).
      - name: Determine image tags
        id: tags
        run: |
          REGISTRY_HOST="${{ steps.registry.outputs.host }}"
          # Image repository names must be lower case
          REPO=$(echo "${{ env.IMAGE_NAME }}" | tr '[:upper:]' '[:lower:]')
          SHA_SHORT=$(echo "${{ gitea.sha }}" | cut -c1-7)
          TAGS="${REGISTRY_HOST}/${REPO}:sha-${SHA_SHORT}"
          # Add latest tag on default branch
          if [ "${{ gitea.ref }}" = "refs/heads/main" ]; then
            TAGS="${TAGS},${REGISTRY_HOST}/${REPO}:latest"
          fi
          # Add version tags for semver tags
          REF="${{ gitea.ref }}"
          if [[ "$REF" =~ ^refs/tags/v([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then
            VERSION="${BASH_REMATCH[1]}.${BASH_REMATCH[2]}.${BASH_REMATCH[3]}"
            MINOR="${BASH_REMATCH[1]}.${BASH_REMATCH[2]}"
            TAGS="${TAGS},${REGISTRY_HOST}/${REPO}:${VERSION}"
            TAGS="${TAGS},${REGISTRY_HOST}/${REPO}:${MINOR}"
          fi
          echo "tags=${TAGS}" >> "$GITHUB_OUTPUT"
          echo "repo=${REPO}" >> "$GITHUB_OUTPUT"
          echo "sha_short=${SHA_SHORT}" >> "$GITHUB_OUTPUT"
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: ${{ steps.tags.outputs.tags }}
          cache-from: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}:buildcache
          cache-to: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}:buildcache,mode=max
      - name: Build and push screenshot service
        uses: docker/build-push-action@v6
        with:
          context: ./screenshot
          push: true
          tags: |
            ${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-screenshot:latest
            ${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-screenshot:sha-${{ steps.tags.outputs.sha_short }}
          cache-from: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-screenshot:buildcache
          cache-to: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-screenshot:buildcache,mode=max
      - name: Build and push finder service
        uses: docker/build-push-action@v6
        with:
          context: .
          file: Dockerfile.finder
          push: true
          tags: |
            ${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-finder:latest
            ${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-finder:sha-${{ steps.tags.outputs.sha_short }}
          cache-from: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-finder:buildcache
          cache-to: type=registry,ref=${{ steps.registry.outputs.host }}/${{ steps.tags.outputs.repo }}-finder:buildcache,mode=max
--- a/Dockerfile.finder
+++ b/Dockerfile.finder
@ -5,6 +5,10 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 WORKDIR /app
 COPY finder/pyproject.toml ./
 RUN uv pip install --system -r pyproject.toml
 RUN playwright install-deps chromium firefox
 RUN playwright install chromium
 RUN camoufox fetch \
    && python -c "from camoufox.pkgman import camoufox_path; p = camoufox_path(download_if_missing=False); print('Camoufox verified at', p)"
 COPY finder/*.py ./
 COPY property-data/arcgis_data.parquet /data/arcgis_data.parquet
--- a/finder/zoopla.py
+++ b/finder/zoopla.py
@ -41,17 +41,23 @@ class TurnstileError(Exception):
 MAX_PAGES_PER_OUTCODE = 10
 # JavaScript to extract listings from the rendered DOM.
-# Finds all detail links, walks up to the card container, and parses
+# Uses data-testid attributes as primary selectors (stable across deployments),
-# price, beds, baths, floor area, address, and tenure from the card text.
+# then falls back to href-based link matching with parent-walking.
 _EXTRACT_LISTINGS_JS = r"""() => {
    const links = Array.from(document.querySelectorAll(
        'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
    ));
    const seen = new Set();
    const results = [];
-    for (const link of links) {
+    // Strategy 1: Use data-testid selectors (post-2025 redesign)
    const listingCards = document.querySelectorAll(
        '[data-testid="regular-listings"] > div, [data-testid="search-content"] li'
    );
    for (const card of listingCards) {
        const link = card.querySelector(
            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
        );
        if (!link) continue;
        const href = link.href;
        const match = href.match(/\/details\/(\d+)\//);
        if (!match) continue;
@ -60,53 +66,108 @@ _EXTRACT_LISTINGS_JS = r"""() => {
        if (seen.has(id)) continue;
        seen.add(id);
-        // Walk up to the listing card container
+        const text = card.innerText || '';
-        let card = link;
+
-        for (let j = 0; j < 10; j++) {
+        // Try data-testid price element first, then regex
-            card = card.parentElement;
+        const priceEl = card.querySelector('[data-testid="listing-price"]');
-            if (!card) break;
+        const priceText = priceEl ? priceEl.innerText : text;
-            const text = card.innerText || '';
+        const priceMatch = priceText.match(/\u00a3([\d,]+)/);
-            if (text.includes('\u00a3') && (text.includes('bed') || text.includes('sq ft'))) {
+
-                break;
+        // Try address element first, then regex
        const addressEl = card.querySelector('address');
        let address = addressEl ? addressEl.innerText.trim() : '';
        if (!address) {
            const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
            for (const line of lines) {
                if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
                    (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
                    address = line;
                    break;
                }
            }
        }
        if (!card) continue;
        const text = card.innerText || '';
        const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
        const priceMatch = text.match(/\u00a3([\d,]+)/);
        const bedsMatch = text.match(/(\d+)\s*beds?/i);
        const bathsMatch = text.match(/(\d+)\s*baths?/i);
        const recMatch = text.match(/(\d+)\s*reception/i);
-        const areaMatch = text.match(/([\d,]+)\s*sq\s*ft/i);
+        const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
        let address = '';
        for (const line of lines) {
            if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
                (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
                address = line;
                break;
            }
        }
        let tenure = '';
        if (/freehold/i.test(text)) tenure = 'Freehold';
        else if (/leasehold/i.test(text)) tenure = 'Leasehold';
        results.push({
-            id: id,
+            id, url: href.replace(window.location.origin, ''),
            url: href.replace(window.location.origin, ''),
            price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
            beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
            baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
            receptions: recMatch ? parseInt(recMatch[1]) : null,
            floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
-            address: address,
+            address, tenure,
            tenure: tenure,
        });
    }
    // Strategy 2: Fall back to href-based link matching with parent-walking
    if (results.length === 0) {
        const links = Array.from(document.querySelectorAll(
            'a[href*="/for-sale/details/"], a[href*="/new-homes/details/"], a[href*="/to-rent/details/"]'
        ));
        for (const link of links) {
            const href = link.href;
            const match = href.match(/\/details\/(\d+)\//);
            if (!match) continue;
            const id = match[1];
            if (seen.has(id)) continue;
            seen.add(id);
            let card = link;
            for (let j = 0; j < 15; j++) {
                card = card.parentElement;
                if (!card) break;
                const t = card.innerText || '';
                if (t.includes('\u00a3') && (t.includes('bed') || t.includes('Bath') || t.includes('sq ft'))) {
                    break;
                }
            }
            if (!card) continue;
            const text = card.innerText || '';
            const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
            const priceMatch = text.match(/\u00a3([\d,]+)/);
            const bedsMatch = text.match(/(\d+)\s*beds?/i);
            const bathsMatch = text.match(/(\d+)\s*baths?/i);
            const recMatch = text.match(/(\d+)\s*reception/i);
            const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
            let address = '';
            for (const line of lines) {
                if (/[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}/i.test(line) ||
                    (line.includes(',') && !line.includes('\u00a3') && !/^\d+ beds?/i.test(line))) {
                    address = line;
                    break;
                }
            }
            let tenure = '';
            if (/freehold/i.test(text)) tenure = 'Freehold';
            else if (/leasehold/i.test(text)) tenure = 'Leasehold';
            results.push({
                id, url: href.replace(window.location.origin, ''),
                price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
                beds: bedsMatch ? parseInt(bedsMatch[1]) : null,
                baths: bathsMatch ? parseInt(bathsMatch[1]) : null,
                receptions: recMatch ? parseInt(recMatch[1]) : null,
                floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
                address, tenure,
            });
        }
    }
    return results;
 }"""
@ -255,18 +316,44 @@ def _navigate_search(page, outcode: str, channel: str) -> bool:
    else:
        search_input.press("Enter")
-    # Wait for results to load
+    # Wait for results to load — try waiting for listings container, fall back to fixed wait
-    time.sleep(6)
+    try:
        page.wait_for_selector(
            '[data-testid="regular-listings"], a[href*="/details/"]',
            timeout=10000,
        )
    except Exception:
        time.sleep(4)
    _ensure_not_challenged(page)
    return True
 def _get_result_count(page) -> int:
-    """Extract the total results count from the page body text."""
+    """Extract the total results count from the page.
    Tries __ZAD_TARGETING__ JSON first (most reliable), then body text regex
    matching both "N results" and "N properties" patterns."""
    try:
        # Try the ZAD targeting JSON script tag first
        count = page.evaluate("""() => {
            const s = document.querySelector('#__ZAD_TARGETING__');
            if (s) {
                try {
                    const d = JSON.parse(s.textContent);
                    if (d.search_results_count != null) return d.search_results_count;
                } catch(e) {}
            }
            return null;
        }""")
        if count is not None and count > 0:
            return count
    except Exception:
        pass
    try:
        body = page.inner_text("body")
-        match = re.search(r"([\d,]+)\s+results?", body)
+        match = re.search(r"([\d,]+)\s+(?:results?|properties)", body)
        if match:
            return int(match.group(1).replace(",", ""))
    except Exception:
@ -279,10 +366,42 @@ def _get_result_count(page) -> int:
 # ---------------------------------------------------------------------------
 _first_extraction_logged = False
 def _extract_listings(page) -> list[dict]:
    """Extract listing data from the current search results page DOM."""
    global _first_extraction_logged
    try:
-        return page.evaluate(_EXTRACT_LISTINGS_JS)
+        listings = page.evaluate(_EXTRACT_LISTINGS_JS)
        # Log diagnostic info on the very first extraction attempt
        if not _first_extraction_logged:
            _first_extraction_logged = True
            try:
                diag = page.evaluate("""() => {
                    const details = document.querySelectorAll('a[href*="/details/"]');
                    const testids = document.querySelectorAll('[data-testid]');
                    const testidNames = [...new Set([...testids].map(e => e.dataset.testid))];
                    return {
                        url: location.href,
                        title: document.title,
                        detailLinks: details.length,
                        testids: testidNames.slice(0, 30),
                        bodySnippet: document.body?.innerText?.slice(0, 500) || '',
                    };
                }""")
                log.info(
                    "Zoopla first-page diagnostic: url=%s title=%s detailLinks=%d "
                    "testids=%s bodySnippet=%.200s",
                    diag.get("url"), diag.get("title"), diag.get("detailLinks", 0),
                    diag.get("testids", []), diag.get("bodySnippet", ""),
                )
            except Exception:
                pass
            log.info("Zoopla first extraction: %d listings found", len(listings))
        return listings
    except Exception as e:
        log.warning("Failed to extract listings from DOM: %s", e)
        zoopla_errors_total.labels(type="extract_failed").inc()
@ -502,19 +621,40 @@ def search_outcode(
        return []
    total_results = _get_result_count(page)
    if total_results == 0:
        return []
-    raw_listings = _paginate(page, total_results, channel)
+    # Always try extraction even if result count is 0 — the count regex may
    # not match Zoopla's current text format, but listings may still be in DOM
    raw_listings = _paginate(page, max(total_results, 25), channel)
    if not raw_listings:
        if total_results > 0:
            log.debug(
                "Zoopla %s %s: page claims %d results but extraction found 0 — "
                "DOM selectors may need updating",
                outcode, channel, total_results,
            )
        return []
    channel_label = "buy" if channel == "BUY" else "rent"
    properties = []
    dropped = 0
    for raw in raw_listings:
        transformed = transform_property(raw, channel, pc_index, pc_coords)
        if transformed:
            properties.append(transformed)
            zoopla_properties_scraped.labels(channel=channel_label).inc()
        else:
            dropped += 1
    if dropped and not properties:
        log.debug(
            "Zoopla %s %s: extracted %d raw listings but all %d dropped in transform "
            "(no price/postcode/coords)",
            outcode, channel, len(raw_listings), dropped,
        )
    elif dropped > len(raw_listings) // 2:
        log.debug(
            "Zoopla %s %s: %d/%d listings dropped in transform",
            outcode, channel, dropped, len(raw_listings),
        )
    return properties
Author	SHA1	Message	Date
Andras Schmelczer	4f61c702b1	Fix zoopla Some checks failed CI / Rust (lint + test) (push) Failing after 6m54s Details CI / Python (lint + test) (push) Failing after 7m8s Details CI / Frontend (lint + typecheck) (push) Successful in 8m55s Details Build and publish Docker image / build-and-push (push) Failing after 3m8s Details	2026-03-24 08:12:23 +00:00
Andras Schmelczer	13980a2887	Migrate to forgejo	2026-03-24 08:12:12 +00:00